/*
 * Read a whole file into a freshly allocated, NUL-terminated buffer.
 *
 * fileName: path of the file to read.
 * Returns a buffer obtained from new[] (release it with delete[]), or NULL
 * when the file cannot be opened.  An empty file, or one whose length
 * cannot be determined, yields an empty string.
 */
char* readFile(const char* fileName)
{
  std::fstream f(fileName,std::fstream::in);
  if(!f.good())
  {
    std::cerr<<"Error Reading File!!"<<std::endl;
    return NULL;
  }

  f.seekg(0,std::ios::end);
  const std::streamoff length = f.tellg();
  f.seekg(0,std::ios::beg);

  // One extra byte for the terminator, so the last character of the file is
  // kept instead of being overwritten.  Always an array allocation, so the
  // caller can use delete[] regardless of the file size.
  const std::streamsize size = (length > 0) ? static_cast<std::streamsize>(length) : 0;
  char* buffer = new char[size + 1];

  std::streamsize got = 0;
  if(size > 0)
  {
    f.read(buffer,size);
    got = f.gcount();   // may be smaller than size (e.g. text-mode translation)
  }
  buffer[got] = 0;

  return buffer;
}

/*
 * Read a matrix file: two integers (rows, columns) followed by
 * whitespace-separated float values.
 *
 * fn:     path of the input file.
 * nr_row: receives the first integer of the file.
 * nr_col: receives the second integer of the file.
 * v:      every value that parses as a float is appended (v is not cleared).
 * Returns false when the file cannot be opened or the two dimensions cannot
 * be read; true otherwise.  The number of values is not checked against
 * nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
{
  std::cerr << "Opening file:"<< fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if ( !f.good() ) {
    return false;
  }

  // Read # of rows and cols
  if ( !(f >> nr_row >> nr_col) ) {
    return false;
  }

  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;

  // Test the extraction itself: a value is stored only when it was really
  // parsed, so nothing has to be popped afterwards and the last value is
  // kept even when the file does not end with whitespace.
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}

/*
 * Write a matrix file: "<nr_row> <nr_col> " followed by every element of v,
 * each followed by one space, then a newline.
 *
 * fn:     path of the output file (truncated if it exists).
 * nr_row: row count written to the header.
 * nr_col: column count written to the header.
 * v:      values to write, in storage order.
 * Returns false when the file cannot be opened or a write fails.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
{
  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if ( !f.good() ) {
    return false;
  }

  // Write # of rows and cols
  f << nr_row << " "<<nr_col<<" ";

  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";

  // Flush so that a failed write is visible in the returned status.
  f.flush();
  return f.good();
}
/*
 * mysgemmNT
 *
 * Each work-item updates four consecutive elements of one column of C:
 * rows row..row+3 (row = 4 * get_global_id(0)) of column
 * col = get_global_id(1).  For every step in [0, k) the four values
 * A[row + r + step * lda] (r = 0..3) are multiplied by B[col + step * ldb]
 * and accumulated; the result is stored as C = C * beta + alpha * sum at
 * C[row + r + col * ldc].
 *
 * No bounds checks are performed: the launch geometry has to match the
 * matrix extents exactly.
 */
__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
{
    const int row = 4 * get_global_id(0);
    const int col = get_global_id(1);

    float acc0 = 0.0f;
    float acc1 = 0.0f;
    float acc2 = 0.0f;
    float acc3 = 0.0f;

    for (int step = 0; step < k; ++step) {
        // First of the four A elements used in this step.
        const int aBase = row + step * lda;
        const float bVal = B[col + step * ldb];

        acc0 += A[aBase] * bVal;
        acc1 += A[aBase + 1] * bVal;
        acc2 += A[aBase + 2] * bVal;
        acc3 += A[aBase + 3] * bVal;
    }

    // First of the four C elements owned by this work-item.
    const int cBase = row + col * ldc;
    C[cBase] = C[cBase] * beta + alpha * acc0;
    C[cBase + 1] = C[cBase + 1] * beta + alpha * acc1;
    C[cBase + 2] = C[cBase + 2] * beta + alpha * acc2;
    C[cBase + 3] = C[cBase + 3] * beta + alpha * acc3;
}
/*
 * mysgemmNT (offline-compiled copy)
 *
 * Each work-item updates four consecutive elements of one column of C:
 * rows row..row+3 (row = 4 * get_global_id(0)) of column
 * col = get_global_id(1).  For every step in [0, k) the four values
 * A[row + r + step * lda] (r = 0..3) are multiplied by B[col + step * ldb]
 * and accumulated; the result is stored as C = C * beta + alpha * sum at
 * C[row + r + col * ldc].
 *
 * No bounds checks are performed: the launch geometry has to match the
 * matrix extents exactly.
 */
__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
{
    const int row = 4 * get_global_id(0);
    const int col = get_global_id(1);

    float acc0 = 0.0f;
    float acc1 = 0.0f;
    float acc2 = 0.0f;
    float acc3 = 0.0f;

    for (int step = 0; step < k; ++step) {
        // First of the four A elements used in this step.
        const int aBase = row + step * lda;
        const float bVal = B[col + step * ldb];

        acc0 += A[aBase] * bVal;
        acc1 += A[aBase + 1] * bVal;
        acc2 += A[aBase + 2] * bVal;
        acc3 += A[aBase + 3] * bVal;
    }

    // First of the four C elements owned by this work-item.
    const int cBase = row + col * ldc;
    C[cBase] = C[cBase] * beta + alpha * acc0;
    C[cBase + 1] = C[cBase + 1] * beta + alpha * acc1;
    C[cBase + 2] = C[cBase + 2] * beta + alpha * acc2;
    C[cBase + 3] = C[cBase + 3] * beta + alpha * acc3;
}
if(clStatus != CL_SUCCESS) \ + { \ + std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \ + std::cout<<"Line: "<<__LINE__<<"\n"; \ + exit(1); \ + } + +void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) +{ + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } + + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } + + // In this code we assume the matrix sizes are multiple of tile size + if ((m%TILE_SZ) || (n%TILE_SZ)) { + std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ + << "; n should be multiple of " << TILE_SZ << std::endl; + } + + size_t db[2] = {TILE_SZ/4,TILE_SZ}; + size_t dg[2] = {m/4,n}; + + cl_int clStatus; + + clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); + clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); + clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); + clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); + clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); + clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); + clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); + clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); + clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + CHECK_ERROR("clSetKernelArg") + + clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") +} + +int main (int argc, char *argv[]) { + + struct pb_Parameters *params; + struct pb_TimerSet timers; + + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + 
std::vector<float> matA, matBT; + + pb_InitializeTimerSet(&timers); + + /* Read command line. Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) + || (params->inpFiles[1] == NULL) + || (params->inpFiles[2] == NULL) + || (params->inpFiles[3] != NULL)) + { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; + cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + // const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")}; + // cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + cl_kernel clKernel; + cl_program clProgram; + pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); + //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + //CHECK_ERROR("clCreateProgramWithSource") + + //char clOptions[50]; + //sprintf(clOptions,""); + + //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + //CHECK_ERROR("clBuildProgram") + + //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + //CHECK_ERROR("clCreateKernel") + + /* Read in data */ + pb_SwitchToTimer(&timers, 
pb_TimerID_IO); + + // load A + readColMajorMatrixFile(params->inpFiles[0], + matArow, matAcol, matA); + // load B^T + readColMajorMatrixFile(params->inpFiles[2], + matBcol, matBrow, matBT); + + pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // copy A to device memory + A_sz = matArow*matAcol*sizeof(float); + B_sz = matBrow*matBcol*sizeof(float); + + // allocate space for C + C_sz = matArow*matBcol*sizeof(float); + + // OpenCL memory allocation + std::vector<float> matC(matArow*matBcol); + + pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + + // Copy A and B^T into device memory + pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + for(int i=0;i<matC.size();i++) + matC[i] = 0.0f; + + clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ + dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); + + if (params->outFile) { + pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + + /* Write C to file */ + pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, + matArow, matBcol, matC); + } + 
+ pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); + + //free((void*)clSource[0]); + + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseMemObject(dA); + clStatus = clReleaseMemObject(dB); + clStatus = clReleaseMemObject(dC); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + return 0; +} diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc index 141ea8da1627548cdb38262931e8a3b8bfd0d086..04b6579d254bf6648d50870724558a5ce7773bca 100644 --- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc @@ -65,6 +65,7 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto v.push_back(data); } v.pop_back(); // remove the duplicated last element + return true; } diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/Makefile b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f74ee8921a534b6963ba06d089398114571d070b --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/Makefile @@ -0,0 +1,8 @@ +# (c) 2010 The Board of Trustees of the University of Illinois. 
/*
 * Read a whole file into a freshly allocated, NUL-terminated buffer.
 *
 * fileName: path of the file to read.
 * Returns a buffer obtained from new[] (release it with delete[]), or NULL
 * when the file cannot be opened.  An empty file, or one whose length
 * cannot be determined, yields an empty string.
 */
char* readFile(const char* fileName)
{
  std::fstream f(fileName,std::fstream::in);
  if(!f.good())
  {
    std::cerr<<"Error Reading File!!"<<std::endl;
    return NULL;
  }

  f.seekg(0,std::ios::end);
  const std::streamoff length = f.tellg();
  f.seekg(0,std::ios::beg);

  // One extra byte for the terminator, so the last character of the file is
  // kept instead of being overwritten.  Always an array allocation, so the
  // caller can use delete[] regardless of the file size.
  const std::streamsize size = (length > 0) ? static_cast<std::streamsize>(length) : 0;
  char* buffer = new char[size + 1];

  std::streamsize got = 0;
  if(size > 0)
  {
    f.read(buffer,size);
    got = f.gcount();   // may be smaller than size (e.g. text-mode translation)
  }
  buffer[got] = 0;

  return buffer;
}

/*
 * Read a matrix file: two integers (rows, columns) followed by
 * whitespace-separated float values.
 *
 * fn:     path of the input file.
 * nr_row: receives the first integer of the file.
 * nr_col: receives the second integer of the file.
 * v:      every value that parses as a float is appended (v is not cleared).
 * Returns false when the file cannot be opened or the two dimensions cannot
 * be read; true otherwise.  The number of values is not checked against
 * nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
{
  std::cerr << "Opening file:"<< fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if ( !f.good() ) {
    return false;
  }

  // Read # of rows and cols
  if ( !(f >> nr_row >> nr_col) ) {
    return false;
  }

  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;

  // Test the extraction itself: a value is stored only when it was really
  // parsed, so nothing has to be popped afterwards and the last value is
  // kept even when the file does not end with whitespace.
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}

/*
 * Write a matrix file: "<nr_row> <nr_col> " followed by every element of v,
 * each followed by one space, then a newline.
 *
 * fn:     path of the output file (truncated if it exists).
 * nr_row: row count written to the header.
 * nr_col: column count written to the header.
 * v:      values to write, in storage order.
 * Returns false when the file cannot be opened or a write fails.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
{
  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if ( !f.good() ) {
    return false;
  }

  // Write # of rows and cols
  f << nr_row << " "<<nr_col<<" ";

  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";

  // Flush so that a failed write is visible in the returned status.
  f.flush();
  return f.good();
}
+ int n = get_global_id(1); + + for (int i = 0; i < k; ++i) { + float a0 = A[m + i * lda]; + float a1 = A[m + 1 + i * lda]; + float a2 = A[m + 2 + i * lda]; + float a3 = A[m + 3 + i * lda]; + + float b = B[n + i * ldb]; + + c0 += a0 * b; + c1 += a1 * b; + c2 += a2 * b; + c3 += a3 * b; + } + C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c0; + C[m+1+n*ldc] = C[m+1+n*ldc] * beta + alpha * c1; + C[m+2+n*ldc] = C[m+2+n*ldc] * beta + alpha * c2; + C[m+3+n*ldc] = C[m+3+n*ldc] * beta + alpha * c3; +} + +__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc ) +{ + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } + + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } + + // In this code we assume the matrix sizes are multiple of tile size + if ((m%TILE_SZ) || (n%TILE_SZ)) { + std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ + << "; n should be multiple of " << TILE_SZ << std::endl; + } + + unsigned db[2] = {TILE_SZ/4,TILE_SZ}; + unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); + __visc__wait(sgemmDFG); +} + +int main (int argc, char *argv[]) { + + struct pb_Parameters *params; + struct pb_TimerSet timers; + + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; + + pb_InitializeTimerSet(&timers); + __visc__init(); + + /* Read command line. 
Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) + || (params->inpFiles[1] == NULL) + || (params->inpFiles[2] == NULL) + || (params->inpFiles[3] != NULL)) + { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } + + /* Read in data */ + pb_SwitchToTimer(&timers, pb_TimerID_IO); + + // load A + readColMajorMatrixFile(params->inpFiles[0], + matArow, matAcol, matA); + + // load B^T + readColMajorMatrixFile(params->inpFiles[2], + matBcol, matBrow, matBT); + + pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // copy A to device memory + A_sz = matArow*matAcol*sizeof(float); + B_sz = matBrow*matBcol*sizeof(float); + + // allocate space for C + C_sz = matArow*matBcol*sizeof(float); + + // OpenCL memory allocation + std::vector<float> matC(matArow*matBcol); + + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); + // Copy A and B^T into device memory + pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + for(size_t i=0; i<matC.size(); i++) + matC[i] = 0.0f; + + pb_SwitchToTimer( &timers, pb_TimerID_NONE ); + + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ + &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow); + + if (params->outFile) { + pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + + /* Write C to file */ + llvm_visc_request_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, + matArow, matBcol, matC); + } + + pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK ); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout<< "GFLOPs = " << 2.* 
/*
 * Read a whole file into a freshly allocated, NUL-terminated buffer.
 *
 * fileName: path of the file to read.
 * Returns a buffer obtained from new[] (release it with delete[]), or NULL
 * when the file cannot be opened.  An empty file, or one whose length
 * cannot be determined, yields an empty string.
 */
char* readFile(const char* fileName)
{
  std::fstream f(fileName,std::fstream::in);
  if(!f.good())
  {
    std::cerr<<"Error Reading File!!"<<std::endl;
    return NULL;
  }

  f.seekg(0,std::ios::end);
  const std::streamoff length = f.tellg();
  f.seekg(0,std::ios::beg);

  // One extra byte for the terminator, so the last character of the file is
  // kept instead of being overwritten.  Always an array allocation, so the
  // caller can use delete[] regardless of the file size.
  const std::streamsize size = (length > 0) ? static_cast<std::streamsize>(length) : 0;
  char* buffer = new char[size + 1];

  std::streamsize got = 0;
  if(size > 0)
  {
    f.read(buffer,size);
    got = f.gcount();   // may be smaller than size (e.g. text-mode translation)
  }
  buffer[got] = 0;

  return buffer;
}

/*
 * Read a matrix file: two integers (rows, columns) followed by
 * whitespace-separated float values.
 *
 * fn:     path of the input file.
 * nr_row: receives the first integer of the file.
 * nr_col: receives the second integer of the file.
 * v:      every value that parses as a float is appended (v is not cleared).
 * Returns false when the file cannot be opened or the two dimensions cannot
 * be read; true otherwise.  The number of values is not checked against
 * nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
{
  std::cerr << "Opening file:"<< fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if ( !f.good() ) {
    return false;
  }

  // Read # of rows and cols
  if ( !(f >> nr_row >> nr_col) ) {
    return false;
  }

  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;

  // Test the extraction itself: a value is stored only when it was really
  // parsed, so nothing has to be popped afterwards and the last value is
  // kept even when the file does not end with whitespace.
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}

/*
 * Write a matrix file: "<nr_row> <nr_col> " followed by every element of v,
 * each followed by one space, then a newline.
 *
 * fn:     path of the output file (truncated if it exists).
 * nr_row: row count written to the header.
 * nr_col: column count written to the header.
 * v:      values to write, in storage order.
 * Returns false when the file cannot be opened or a write fails.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
{
  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if ( !f.good() ) {
    return false;
  }

  // Write # of rows and cols
  f << nr_row << " "<<nr_col<<" ";

  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";

  // Flush so that a failed write is visible in the returned status.
  f.flush();
  return f.good();
}
&nr_col, std::vector<float>&v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); +extern char* readFile(const char*); + +// Parameters of tile sizes +#define TILE_SZ 16 + +#define CHECK_ERROR(errorMessage) \ + if(clStatus != CL_SUCCESS) \ + { \ + std::cout<<errorMessage<<" Error!\n"; \ + std::cout<<"Line: "<<__LINE__<<"\n"; \ + exit(1); \ + } + +void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta ) +{ + __visc__attributes(3, A, B, C, 1, C); + float c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0.0f; + int m = 4 * get_global_id(0); + int n = get_global_id(1); + + for (int i = 0; i < k; ++i) { + float a0 = A[m + i * lda]; + float a1 = A[m + 1 + i * lda]; + float a2 = A[m + 2 + i * lda]; + float a3 = A[m + 3 + i * lda]; + + float b = B[n + i * ldb]; + + c0 += a0 * b; + c1 += a1 * b; + c2 += a2 * b; + c3 += a3 * b; + } + C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c0; + C[m+1+n*ldc] = C[m+1+n*ldc] * beta + alpha * c1; + C[m+2+n*ldc] = C[m+2+n*ldc] * beta + alpha * c2; + C[m+3+n*ldc] = C[m+3+n*ldc] * beta + alpha * c3; +} + +__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc ) +{ + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } + + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } + + // In this code we assume the matrix sizes are multiple of tile size + if ((m%TILE_SZ) || (n%TILE_SZ)) { + std::cerr << "unsupported size of matrix. 
m should be multiple of " << TILE_SZ + << "; n should be multiple of " << TILE_SZ << std::endl; + } + + unsigned db[2] = {TILE_SZ/4,TILE_SZ}; + unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); + __visc__wait(sgemmDFG); +} + +int main (int argc, char *argv[]) { + + struct pb_Parameters *params; + struct pb_TimerSet timers; + + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; + + pb_InitializeTimerSet(&timers); + __visc__init(); + + /* Read command line. Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) + || (params->inpFiles[1] == NULL) + || (params->inpFiles[2] == NULL) + || (params->inpFiles[3] != NULL)) + { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } + + /* Read in data */ + pb_SwitchToTimer(&timers, pb_TimerID_IO); + + // load A + readColMajorMatrixFile(params->inpFiles[0], + matArow, matAcol, matA); + + // load B^T + readColMajorMatrixFile(params->inpFiles[2], + matBcol, matBrow, matBT); + + pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // copy A to device memory + A_sz = matArow*matAcol*sizeof(float); + B_sz = matBrow*matBcol*sizeof(float); + + // allocate space for C + C_sz = matArow*matBcol*sizeof(float); + + // OpenCL memory allocation + std::vector<float> matC(matArow*matBcol); + + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); + // Copy A and B^T into device memory + pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + + for(size_t i=0; i<matC.size(); i++) + matC[i] = 0.0f; + + pb_SwitchToTimer( &timers, pb_TimerID_NONE ); + + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ + &matA.front(), 
A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow); + + if (params->outFile) { + pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + + /* Write C to file */ + llvm_visc_request_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, + matArow, matBcol, matC); + } + + pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK ); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + pb_PrintTimerSet(&timers); + __visc__cleanup(); + pb_FreeParameters(params); + + return 0; +} diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll new file mode 100644 index 0000000000000000000000000000000000000000..ea1e7b3b7cc4092f69dd0de9b33ad9b693bcac1c --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll @@ -0,0 +1,894 @@ +; ModuleID = 'build/visc_tc_vec_default/main.ll' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"class.std::ios_base::Init" = type { i8 } +%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" } +%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* } +%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, 
%"struct.std::ios_base::_Words"*, %"class.std::locale" } +%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 } +%"struct.std::ios_base::_Words" = type { i8*, i64 } +%"class.std::locale" = type { %"class.std::locale::_Impl"* } +%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** } +%"class.std::locale::facet" = type { i32 (...)**, i32 } +%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" } +%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 } +%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] } +%struct.__locale_data = type opaque +%"class.std::num_put" = type { %"class.std::locale::facet" } +%"class.std::num_get" = type { %"class.std::locale::facet" } +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%rtype = type {} +%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }> +%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] } +%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } +%struct.pb_Timer = type { i32, i64, i64 } +%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* } +%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, 
std::allocator<float> >::_Vector_impl" }
+%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* }
+%struct.pb_Parameters = type { i8*, i8** }
+
+@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1
+@__dso_handle = external global i8
+@_ZSt4cerr = external global %"class.std::basic_ostream"
+@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1
+@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1
+@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. m should be multiple of \00", align 1
+@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1
+@stderr = external global %struct._IO_FILE*
+@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1
+@_ZSt4cout = external global %"class.std::basic_ostream"
+@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+@viscTimerSet_GenVISC = common global i8* null
+@0 = internal constant [14 x i8] c"GenVISC_Timer\00"
+
+declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0
+
+declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0
+
+; Function Attrs: nounwind
+declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1
+
+; mysgemmNT leaf node, hand-vectorized from the scalar version.
+;
+; Each node instance handles four consecutive elements of one column of C:
+;   row0 = 4 * (parentID.x * numInstances.x + instanceID.x)   (%mul)
+;   col  =      parentID.y * numInstances.y + instanceID.y    (%9)
+;   c[j] = sum over i in [0, k) of A[row0 + j + i*lda] * B[col + i*ldb]
+;   C[row0 + j + col*ldc] = alpha * c[j] + beta * C[row0 + j + col*ldc]
+; for j = 0..3. The four A values of one iteration are fetched with a single
+; <4 x float> load and the four partial sums live in one <4 x float>
+; accumulator; the scalar original used four float loads and four float
+; accumulators (c0..c3).
+; Function Attrs: nounwind uwtable
+define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 {
+entry:
+  %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode()
+  %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
+  %1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %2 = mul i32 %0, %1
+  %3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %4 = add i32 %2, %3
+  %mul = shl nsw i32 %4, 2
+  %5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
+  %6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %7 = mul i32 %5, %6
+  %8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %9 = add i32 %7, %8
+  %cmp147 = icmp sgt i32 %k, 0
+  ; %mul is a multiple of 4, so "or" with 1..3 equals adding 1..3.
+  %add3144 = or i32 %mul, 1
+  %add8145 = or i32 %mul, 2
+  %add13146 = or i32 %mul, 3
+
+  ; Row indices <row0, row0+1, row0+2, row0+3> as a vector. The last insert
+  ; must chain from %mul.tmp3; chaining from %mul.tmp2 dropped lane 2.
+  %mul.tmp1 = insertelement <4 x i32> < i32 0, i32 0, i32 0, i32 0 >, i32 %mul, i32 0
+  %mul.tmp2 = insertelement <4 x i32> %mul.tmp1, i32 %add3144, i32 1
+  %mul.tmp3 = insertelement <4 x i32> %mul.tmp2, i32 %add8145, i32 2
+  %mul.vector = insertelement <4 x i32> %mul.tmp3, i32 %add13146, i32 3
+
+  ; %lda splatted across four lanes.
+  %lda.tmp = insertelement <1 x i32> < i32 0 >, i32 %lda, i32 0
+  %lda.vector = shufflevector <1 x i32> %lda.tmp, <1 x i32> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 >
+
+  br i1 %cmp147, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+
+  ; Vector accumulator; replaces the scalar phis %c0.0152 .. %c3.0149.
+  %c.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ]
+
+  %10 = trunc i64 %indvars.iv to i32
+  %mul2 = mul nsw i32 %10, %lda
+
+  ; One <4 x float> load of A[row0 + i*lda .. +3]; replaces the four scalar
+  ; loads at offsets %mul, %add3144, %add8145 and %add13146.
+  %add = add nsw i32 %mul2, %mul
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
+  %arrayidx.cast = bitcast float* %arrayidx to <4 x float>*
+  %11 = load <4 x float>* %arrayidx.cast, align 4
+
+  %mul18 = mul nsw i32 %10, %ldb
+  %add19 = add nsw i32 %mul18, %9
+  %idxprom20 = sext i32 %add19 to i64
+  %arrayidx21 = getelementptr inbounds float* %B, i64 %idxprom20
+  %12 = load float* %arrayidx21, align 4, !tbaa !0
+
+  ; Splat the single B value across all four lanes.
+  %b.tmp = insertelement <1 x float> < float 0.000000e+00 >, float %12, i32 0
+  %b.vector = shufflevector <1 x float> %b.tmp, <1 x float> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 >
+
+  ; c += a * b on all four lanes; replaces four scalar fmul/fadd pairs.
+  %mul22 = fmul fast <4 x float> %11, %b.vector
+  %add23 = fadd fast <4 x float> %c.vector, %mul22
+
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %k
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ; Final accumulator (all zeros when k <= 0), split back into the four
+  ; scalars the write-back code below expects.
+  %c.end.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ]
+
+  %c0.0.lcssa = extractelement <4 x float> %c.end.vector, i32 0
+  %c1.0.lcssa = extractelement <4 x float> %c.end.vector, i32 1
+  %c2.0.lcssa = extractelement <4 x float> %c.end.vector, i32 2
+  %c3.0.lcssa = extractelement <4 x float> %c.end.vector, i32 3
+
+  ; C[row0 + col*ldc] = alpha*c0 + beta*C[...]
+  %mul30 = mul nsw i32 %9, %ldc
+  %add31 = add nsw i32 %mul30, %mul
+  %idxprom32 = sext i32 %add31 to i64
+  %arrayidx33 = getelementptr inbounds float* %C, i64 %idxprom32
+
+  %13 = load float* %arrayidx33, align 4, !tbaa !0
+  %mul34 = fmul fast float %13, %beta
+
+  %mul35 = fmul fast float %c0.0.lcssa, %alpha
+  %add36 = fadd fast float %mul35, %mul34
+  store float %add36, float* %arrayidx33, align 4, !tbaa !0
+  ; Same update for row0 + 1.
+  %add43 = add nsw i32 %add3144, %mul30
+  %idxprom44 = sext i32 %add43 to i64
+  %arrayidx45 = getelementptr inbounds float* %C, i64 %idxprom44
+
+  %14 = load float* %arrayidx45, align 4, !tbaa !0
+  %mul46 = fmul fast float %14, %beta
+
+  %mul47 = fmul fast float %c1.0.lcssa, %alpha
+  %add48 = fadd fast float %mul47, %mul46
+  store float %add48, float* %arrayidx45, align 4, !tbaa !0
+  ; Same update for row0 + 2.
+  %add56 = add nsw i32 %add8145, %mul30
+  %idxprom57 = sext i32 %add56 to i64
+  %arrayidx58 = getelementptr inbounds float* %C, i64 %idxprom57
+
+  %15 = load float* %arrayidx58, align 4, !tbaa !0
+  %mul59 = fmul fast float %15, %beta
+
+  %mul60 = fmul fast float %c2.0.lcssa, %alpha
+  %add61 = fadd fast float %mul60, %mul59
+  store float %add61, float* %arrayidx58, align 4, !tbaa !0
+  ; Same update for row0 + 3.
+  %add69 = add nsw i32 %add13146, %mul30
+  %idxprom70 = sext i32 %add69 to i64
+  %arrayidx71 = getelementptr inbounds float* %C, i64 %idxprom70
+
+  %16 = load float* %arrayidx71, align 4, !tbaa !0
+  %mul72 = fmul fast float %16, %beta
+
+  %mul73 = fmul fast float %c3.0.lcssa, %alpha
+  %add74 = fadd fast float %mul73, %mul72
+  store float %add74, float* %arrayidx71, align 4, !tbaa !0
+  ret %rtype undef
+}
+
+; Function Attrs: noinline nounwind uwtable
+define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 {
+entry:
+  switch i8 %transa, label %if.then [
+    i8 78, label %if.end
+    i8 110, label %if.end
+  ]
+
+if.then:                                          ; preds = %entry
+  %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1
+  %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3
+  %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24
+  %0 = bitcast i8* %vbase.offset.ptr.i to i64*
+  %vbase.offset.i = load i64* %0, align 8
+  %add.ptr.i.sum = add i64 %vbase.offset.i, 240
+  %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum
+  %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"**
+  %2 = load %"class.std::ctype"** %1, align 8, !tbaa !4
+  %tobool.i97 = icmp eq %"class.std::ctype"* %2, null
+  br i1 %tobool.i97, label %if.then.i98, label
%_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + +if.then.i98: ; preds = %if.then + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit: ; preds = %if.then + %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6 + %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !1 + %tobool.i = icmp eq i8 %3, 0 + br i1 %tobool.i, label %if.end.i, label %if.then.i + +if.then.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10 + %4 = load i8* %arrayidx.i, align 1, !tbaa !1 + br label %_ZNKSt5ctypeIcE5widenEc.exit + +if.end.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1 + %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i71 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !3 + %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i71, i64 6 + %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8 + %call.i72 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit + +_ZNKSt5ctypeIcE5widenEc.exit: ; preds = %if.end.i, %if.then.i + %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i72, %if.end.i ] + %call1.i47 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1 + %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i47) #1 + br label %return + +if.end: ; preds = %entry, %entry + switch i8 %transb, label %if.then9 [ + i8 84, label %if.end12 + i8 116, label %if.end12 + ] + +if.then9: ; preds = %if.end + %call1.i49 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds 
([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1 + %vtable.i51 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3 + %vbase.offset.ptr.i52 = getelementptr i8* %vtable.i51, i64 -24 + %7 = bitcast i8* %vbase.offset.ptr.i52 to i64* + %vbase.offset.i53 = load i64* %7, align 8 + %add.ptr.i54.sum = add i64 %vbase.offset.i53, 240 + %_M_ctype.i73 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i54.sum + %8 = bitcast i8* %_M_ctype.i73 to %"class.std::ctype"** + %9 = load %"class.std::ctype"** %8, align 8, !tbaa !4 + %tobool.i100 = icmp eq %"class.std::ctype"* %9, null + br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + +if.then.i101: ; preds = %if.then9 + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then9 + %_M_widen_ok.i75 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6 + %10 = load i8* %_M_widen_ok.i75, align 1, !tbaa !1 + %tobool.i76 = icmp eq i8 %10, 0 + br i1 %tobool.i76, label %if.end.i82, label %if.then.i78 + +if.then.i78: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + %arrayidx.i77 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10 + %11 = load i8* %arrayidx.i77, align 1, !tbaa !1 + br label %_ZNKSt5ctypeIcE5widenEc.exit84 + +if.end.i82: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1 + %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i79 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !3 + %vfn.i80 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i79, i64 6 + %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i80, align 8 + %call.i81 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit84 + 
+_ZNKSt5ctypeIcE5widenEc.exit84: ; preds = %if.end.i82, %if.then.i78 + %retval.0.i83 = phi i8 [ %11, %if.then.i78 ], [ %call.i81, %if.end.i82 ] + %call1.i56 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i83) #1 + %call.i57 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i56) #1 + br label %return + +if.end12: ; preds = %if.end, %if.end + %rem44 = and i32 %m, 15 + %tobool = icmp eq i32 %rem44, 0 + br i1 %tobool, label %lor.lhs.false, label %if.then15 + +lor.lhs.false: ; preds = %if.end12 + %rem1345 = and i32 %n, 15 + %tobool14 = icmp eq i32 %rem1345, 0 + br i1 %tobool14, label %if.end21, label %if.then15 + +if.then15: ; preds = %lor.lhs.false, %if.end12 + %call1.i59 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1 + %call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 16) #1 + %call1.i61 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1 + %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 16) #1 + %14 = bitcast %"class.std::basic_ostream"* %call19 to i8** + %vtable.i63 = load i8** %14, align 8, !tbaa !3 + %vbase.offset.ptr.i64 = getelementptr i8* %vtable.i63, i64 -24 + %15 = bitcast i8* %vbase.offset.ptr.i64 to i64* + %vbase.offset.i65 = load i64* %15, align 8 + %16 = bitcast %"class.std::basic_ostream"* %call19 to i8* + %add.ptr.i66.sum = add i64 %vbase.offset.i65, 240 + %_M_ctype.i85 = getelementptr inbounds i8* %16, i64 %add.ptr.i66.sum + %17 = bitcast i8* %_M_ctype.i85 to 
%"class.std::ctype"** + %18 = load %"class.std::ctype"** %17, align 8, !tbaa !4 + %tobool.i104 = icmp eq %"class.std::ctype"* %18, null + br i1 %tobool.i104, label %if.then.i105, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 + +if.then.i105: ; preds = %if.then15 + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107: ; preds = %if.then15 + %_M_widen_ok.i87 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6 + %19 = load i8* %_M_widen_ok.i87, align 1, !tbaa !1 + %tobool.i88 = icmp eq i8 %19, 0 + br i1 %tobool.i88, label %if.end.i94, label %if.then.i90 + +if.then.i90: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 + %arrayidx.i89 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10 + %20 = load i8* %arrayidx.i89, align 1, !tbaa !1 + br label %_ZNKSt5ctypeIcE5widenEc.exit96 + +if.end.i94: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %18) #1 + %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i91 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !3 + %vfn.i92 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i91, i64 6 + %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i92, align 8 + %call.i93 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit96 + +_ZNKSt5ctypeIcE5widenEc.exit96: ; preds = %if.end.i94, %if.then.i90 + %retval.0.i95 = phi i8 [ %20, %if.then.i90 ], [ %call.i93, %if.end.i94 ] + %call1.i68 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i95) #1 + %call.i69 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i68) #1 + br label %if.end21 + +if.end21: ; preds = %_ZNKSt5ctypeIcE5widenEc.exit96, %lor.lhs.false + %div = sdiv i32 %m, 16 + %mul = 
and i32 %div, 1073741823 + %div22 = sdiv i32 %n, 16 + %mul24 = and i32 %div22, 268435455 + %conv33 = fpext float %alpha to double + %conv34 = fpext float %beta to double + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19) + %in.addr = alloca %struct.arg + %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 + store float* %A, float** %in.addr.A + %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 + store i64 %bytesA, i64* %in.addr.bytes_A + %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2 + store i32 %lda, i32* %in.addr.lda + %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 + store float* %B, float** %in.addr.B + %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4 + store i64 %bytesB, i64* %in.addr.bytes_B + %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5 + store i32 %ldb, i32* %in.addr.ldb + %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6 + store float* %C, float** %in.addr.C + %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7 + store i64 %bytesC, i64* %in.addr.bytes_C + %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8 + store i32 %ldc, i32* %in.addr.ldc + %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9 + store i32 %k, i32* %in.addr.k + %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10 + %in.addr.alpha.cast = fptrunc double %conv33 to float + store float %in.addr.alpha.cast, float* %in.addr.alpha + %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11 + %in.addr.beta.cast = fptrunc double %conv34 to float + store float %in.addr.beta.cast, float* %in.addr.beta + %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12 + store i32 4, i32* %in.addr.dimX0 + %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13 + store i32 16, i32* %in.addr.dimY0 + %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14 + store i32 %mul, i32* 
%in.addr.dimX1 + %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15 + store i32 %mul24, i32* %in.addr.dimY1 + %args = bitcast %struct.arg* %in.addr to i8* + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args) + call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2) + br label %return + +return: ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit84, %_ZNKSt5ctypeIcE5widenEc.exit + ret void +} + +declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0 + +; Function Attrs: nounwind uwtable +define i32 @main(i32 %argc, i8** %argv) #2 { +entry: + %argc.addr = alloca i32, align 4 + %timers = alloca %struct.pb_TimerSet, align 8 + %matArow = alloca i32, align 4 + %matAcol = alloca i32, align 4 + %matBrow = alloca i32, align 4 + %matBcol = alloca i32, align 4 + %matA = alloca %"class.std::vector", align 8 + %matBT = alloca %"class.std::vector", align 8 + %matC = alloca %"class.std::vector", align 8 + store i32 %argc, i32* %argc.addr, align 4, !tbaa !5 + %0 = bitcast %struct.pb_TimerSet* %timers to i8* + call void @llvm.lifetime.start(i64 800, i8* %0) #1 + %1 = bitcast %"class.std::vector"* %matA to i8* + call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1 + %2 = bitcast %"class.std::vector"* %matBT to i8* + call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1 + call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 + %3 = call i8* @llvm_visc_initializeTimerSet() + store i8* %3, i8** @viscTimerSet_GenVISC + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + call void @llvm.visc.init() + %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) 
#1 + %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1 + %4 = load i8*** %inpFiles, align 8, !tbaa !4 + %5 = load i8** %4, align 8, !tbaa !4 + %cmp = icmp eq i8* %5, null + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %arrayidx2 = getelementptr inbounds i8** %4, i64 1 + %6 = load i8** %arrayidx2, align 8, !tbaa !4 + %cmp3 = icmp eq i8* %6, null + br i1 %cmp3, label %if.then, label %lor.lhs.false4 + +lor.lhs.false4: ; preds = %lor.lhs.false + %arrayidx6 = getelementptr inbounds i8** %4, i64 2 + %7 = load i8** %arrayidx6, align 8, !tbaa !4 + %cmp7 = icmp eq i8* %7, null + br i1 %cmp7, label %if.then, label %lor.lhs.false8 + +lor.lhs.false8: ; preds = %lor.lhs.false4 + %arrayidx10 = getelementptr inbounds i8** %4, i64 3 + %8 = load i8** %arrayidx10, align 8, !tbaa !4 + %cmp11 = icmp eq i8* %8, null + br i1 %cmp11, label %if.end, label %if.then + +if.then: ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry + %9 = load %struct._IO_FILE** @stderr, align 8, !tbaa !4 + %10 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %9) + call void @exit(i32 -1) #7 + unreachable + +if.end: ; preds = %lor.lhs.false8 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1 + %11 = load i8*** %inpFiles, align 8, !tbaa !4 + %12 = load i8** %11, align 8, !tbaa !4 + %call15 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %12, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1 + %13 = load i8*** %inpFiles, align 8, !tbaa !4 + %arrayidx17 = getelementptr inbounds i8** %13, i64 2 + %14 = load i8** %arrayidx17, align 8, !tbaa !4 + %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %14, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + %15 = load i32* %matArow, align 4, !tbaa !5 + 
%16 = load i32* %matAcol, align 4, !tbaa !5 + %mul = mul nsw i32 %16, %15 + %conv = sext i32 %mul to i64 + %mul19 = shl nsw i64 %conv, 2 + %17 = load i32* %matBrow, align 4, !tbaa !5 + %18 = load i32* %matBcol, align 4, !tbaa !5 + %mul20 = mul nsw i32 %18, %17 + %conv21 = sext i32 %mul20 to i64 + %mul22 = shl nsw i64 %conv21, 2 + %mul23 = mul nsw i32 %18, %15 + %conv24 = sext i32 %mul23 to i64 + %mul25 = shl nsw i64 %conv24, 2 + %19 = bitcast %"class.std::vector"* %matC to i8* + call void @llvm.memset.p0i8.i64(i8* %19, i8 0, i64 24, i32 8, i1 false) #1 + %cmp.i.i.i.i = icmp eq i32 %mul23, 0 + br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i + +cond.true.i.i.i.i: ; preds = %if.end + %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !6 + +if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i + call void @_ZSt17__throw_bad_allocv() #7 + unreachable + +_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i + %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1 + %20 = bitcast i8* %call2.i.i.i.i.i to float* + br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + +_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i: ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end + %cond.i.i.i.i = phi float* [ %20, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ] + %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0 + store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !4 + %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1 + store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4 + %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24 + %_M_end_of_storage.i.i.i = getelementptr inbounds 
%"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2 + store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !4 + br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i + +for.body.lr.ph.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7 + %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i + %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24 + %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i + br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i + +vector.body.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i + %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ] + %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i + %21 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>* + store <4 x float> zeroinitializer, <4 x float>* %21, align 4 + %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4 + %22 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i + %23 = bitcast float* %22 to <4 x float>* + store <4 x float> zeroinitializer, <4 x float>* %23, align 4 + %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8 + %24 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i + br i1 %24, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i + +middle.block.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i + %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, 
%for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24 + br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader + +for.body.i.i.i.i.i.i.i.i.preheader: ; preds = %middle.block.i.i.i.i.i.i.i.i + %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8* + %25 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2 + call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %25, i32 4, i1 false) + br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + +_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit: ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4 + %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0 + %26 = load float** %_M_start.i.i, align 8, !tbaa !4 + %27 = bitcast float* %26 to i8* + call void @llvm_visc_track_mem(i8* %27, i64 %mul19) #1 + %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0 + %28 = load float** %_M_start.i.i82, align 8, !tbaa !4 + %29 = bitcast float* %28 to i8* + call void @llvm_visc_track_mem(i8* %29, i64 %mul22) #1 + %30 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %31 = bitcast float* %30 to i8* + call void @llvm_visc_track_mem(i8* %31, i64 %mul25) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + %32 = load float** %_M_finish.i.i.i, align 8, !tbaa !4 + %33 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %cmp3399 = icmp eq float* %32, %33 + br i1 %cmp3399, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds 
= %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + %sub.ptr.lhs.cast.i = ptrtoint float* %32 to i64 + %sub.ptr.rhs.cast.i = ptrtoint float* %33 to i64 + %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i + %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %add.ptr.i = getelementptr inbounds float* %33, i64 %i.0100 + store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !0 + %inc = add i64 %i.0100, 1 + %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i + br i1 %cmp33, label %for.body, label %for.end + +for.end: ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + %34 = load i32* %matArow, align 4, !tbaa !5 + %35 = load i32* %matBcol, align 4, !tbaa !5 + %36 = load i32* %matAcol, align 4, !tbaa !5 + %37 = load float** %_M_start.i.i, align 8, !tbaa !4 + %38 = load float** %_M_start.i.i82, align 8, !tbaa !4 + %39 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %34, i32 %35, i32 %36, float 1.000000e+00, float* %37, i64 %mul19, i32 %34, float* %38, i64 %mul22, i32 %35, float 0.000000e+00, float* %39, i64 %mul25, i32 %34) + %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0 + %40 = load i8** %outFile, align 8, !tbaa !4 + %tobool = icmp eq i8* %40, null + br i1 %tobool, label %if.end42, label %if.then38 + +if.then38: ; preds = %for.end + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1 + %41 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %42 = bitcast float* %41 to i8* + call void @llvm_visc_request_mem(i8* %42, i64 %mul25) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1 + %43 = load i8** %outFile, align 8, !tbaa !4 + %44 = load i32* %matArow, align 4, !tbaa !5 + %45 = load i32* %matBcol, align 4, 
!tbaa !5 + %call41 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %43, i32 %44, i32 %45, %"class.std::vector"* %matC) #1 + br label %if.end42 + +if.end42: ; preds = %if.then38, %for.end + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1 + %46 = load float** %_M_start.i.i, align 8, !tbaa !4 + %47 = bitcast float* %46 to i8* + call void @llvm_visc_untrack_mem(i8* %47) #1 + %48 = load float** %_M_start.i.i82, align 8, !tbaa !4 + %49 = bitcast float* %48 to i8* + call void @llvm_visc_untrack_mem(i8* %49) #1 + %50 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %51 = bitcast float* %50 to i8* + call void @llvm_visc_untrack_mem(i8* %51) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2 + %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1 + %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1 + %52 = load i32* %matArow, align 4, !tbaa !5 + %conv50 = sitofp i32 %52 to double + %mul51 = fmul fast double %conv50, 2.000000e+00 + %53 = load i32* %matBcol, align 4, !tbaa !5 + %conv52 = sitofp i32 %53 to double + %mul53 = fmul fast double %mul51, %conv52 + %54 = load i32* %matAcol, align 4, !tbaa !5 + %conv54 = sitofp i32 %54 to double + %mul55 = fmul fast double %mul53, %conv54 + %div = fdiv fast double %mul55, %call48 + %div56 = fmul double %div, 1.000000e-09 + %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1 + %55 = bitcast %"class.std::basic_ostream"* %call.i to i8** + %vtable.i = load i8** %55, align 8, !tbaa !3 + %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 + %56 = bitcast i8* %vbase.offset.ptr.i to i64* + 
%vbase.offset.i = load i64* %56, align 8 + %57 = bitcast %"class.std::basic_ostream"* %call.i to i8* + %add.ptr.sum.i = add i64 %vbase.offset.i, 240 + %_M_ctype.i.i = getelementptr inbounds i8* %57, i64 %add.ptr.sum.i + %58 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"** + %59 = load %"class.std::ctype"** %58, align 8, !tbaa !4 + %tobool.i.i.i = icmp eq %"class.std::ctype"* %59, null + br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + +if.then.i.i.i: ; preds = %if.end42 + call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end42 + %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 6 + %60 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !1 + %tobool.i3.i.i = icmp eq i8 %60, 0 + br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i + +if.then.i4.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 7, i64 10 + %61 = load i8* %arrayidx.i.i.i, align 1, !tbaa !1 + br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + +if.end.i.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %59) #1 + %62 = bitcast %"class.std::ctype"* %59 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %62, align 8, !tbaa !3 + %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6 + %63 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8 + %call.i.i.i = call signext i8 %63(%"class.std::ctype"* %59, i8 signext 10) #1 + br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + +_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i + %retval.0.i.i.i = phi i8 [ %61, %if.then.i4.i.i ], [ 
%call.i.i.i, %if.end.i.i.i ] + %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1 + %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1 + call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 + %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0 + call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr) + call void @llvm.visc.cleanup() + call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1 + %64 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %tobool.i.i.i.i78 = icmp eq float* %64, null + br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79 + +if.then.i.i.i.i79: ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + %65 = bitcast float* %64 to i8* + call void @_ZdlPv(i8* %65) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit80 + +_ZNSt6vectorIfSaIfEED1Ev.exit80: ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + %66 = load float** %_M_start.i.i82, align 8, !tbaa !4 + %tobool.i.i.i.i74 = icmp eq float* %66, null + br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75 + +if.then.i.i.i.i75: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit80 + %67 = bitcast float* %66 to i8* + call void @_ZdlPv(i8* %67) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit76 + +_ZNSt6vectorIfSaIfEED1Ev.exit76: ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80 + %68 = load float** %_M_start.i.i, align 8, !tbaa !4 + %tobool.i.i.i.i = icmp eq float* %68, null + br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i + +if.then.i.i.i.i: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76 + %69 = bitcast float* %68 to i8* + call void @_ZdlPv(i8* %69) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit + +_ZNSt6vectorIfSaIfEED1Ev.exit: ; preds = %if.then.i.i.i.i, 
%_ZNSt6vectorIfSaIfEED1Ev.exit76 + call void @llvm.lifetime.end(i64 800, i8* %0) #1 + ret i32 0 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 + +declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0 + +declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0 + +; Function Attrs: noreturn nounwind +declare void @exit(i32) #4 + +declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 + +declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0 + +declare void @llvm_visc_track_mem(i8*, i64) #0 + +declare void @llvm_visc_request_mem(i8*, i64) #0 + +declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0 + +declare void @llvm_visc_untrack_mem(i8*) #0 + +declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0 + +declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 + +declare void @pb_FreeParameters(%struct.pb_Parameters*) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #1 + +declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0 + +; Function Attrs: noreturn +declare void @_ZSt17__throw_bad_allocv() #5 + +declare noalias i8* @_Znwm(i64) #0 + +; Function Attrs: nounwind +declare void @_ZdlPv(i8*) #6 + +declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0 + +declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0 + +; Function Attrs: noreturn +declare void @_ZSt16__throw_bad_castv() #5 + +declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0 + +declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0 + +; Function Attrs: nounwind +define internal void @_GLOBAL__I_a() #1 section ".text.startup" { +entry: + tail call void 
@_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1 + %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1 + ret void +} + +; Function Attrs: nounwind +declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 + +; Function Attrs: nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 + +declare i8* @llvm_visc_initializeTimerSet() + +declare void @llvm_visc_switchToTimer(i8**, i32) + +declare void @llvm_visc_printTimerSet(i8**, i8*) + +; Function Attrs: nounwind +declare i8* @llvm.visc.getNode() #1 + +; Function Attrs: nounwind +declare i8* @llvm.visc.getParentNode(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, 
i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.bind.input(i8*, i32, i32) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), i32 %dimX1, i32 %dimY2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* 
%_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.launch(i8*, i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.wait(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.init() #1 + +; Function Attrs: nounwind +declare void @llvm.visc.cleanup() #1 + +attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #7 = { noreturn nounwind } + +!0 = metadata !{metadata !"float", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"vtable pointer", metadata !2} +!4 = metadata !{metadata !"any pointer", metadata !1} +!5 = metadata !{metadata !"int", metadata !1} +!6 = metadata !{metadata !"branch_weights", i32 4, i32 64} diff --git a/llvm/test/VISC/parboil/driver/options.py b/llvm/test/VISC/parboil/driver/options.py index 1b8ef5f79fd896011651395d72fee534908fe5e5..bdf4c45212f273634bc5f67fbf077266b5494e07 100644 --- a/llvm/test/VISC/parboil/driver/options.py +++ b/llvm/test/VISC/parboil/driver/options.py @@ -355,7 +355,7 @@ def time_options(progname, cmd, args): 'TEST' : [("small", 20), ("medium", 20), ("large", 10)] } ) - ,('sgemm', { 'VERSION' : ["visc", "opencl_base"], + ,('sgemm', { 'VERSION' : ["visc", "visc_tc", "visc_tc_vec", "opencl_base", "opencl_base_tc", "opencl_nvidia"], 'TEST' : [("small", 20), ("medium", 10)] } ) @@ -396,7 +396,8 @@ def time_options(progname, cmd, args): arguments.append("-v") print "Cleaning " + app + " " + ver - action = clean_options(progname, 'clean', arguments).run() + #action = clean_options(progname, 'clean', arguments).run() + action = False if action: action() else: diff --git a/llvm/test/VISC/parboil/parboilParser.py 
b/llvm/test/VISC/parboil/parboilParser.py index 58813a6d43e8e67099f84665fb4902e0aff77c17..0d1f10b6862c15fb8f591972dea8f13dfba45e30 100755 --- a/llvm/test/VISC/parboil/parboilParser.py +++ b/llvm/test/VISC/parboil/parboilParser.py @@ -94,71 +94,60 @@ def parseCSVFile(filename): # return dict return csvDict -# returns the name of the opecl version of the given application -def getOpenCLVersionName(app): - return 'opencl_base' if app=='stencil' or app=='sgemm' else 'opencl_nvidia' # returns a list of available tests for the given application -# the tests are found based on the opecl version, because the visc -# might not exist in the dict +# the tests are found based on the visc version, because it exists +# for all apps in the dict def getTests(app, csvDict): - v = getOpenCLVersionName(app) - return csvDict[app][v].keys() + return csvDict[app]["visc"].keys() + + +def isViscVersion(version): + return version.startswith("visc") + + +def getAllVersions(csvDict): + apps = csvDict.keys() + + versions = set() + for app in apps: + versions = versions | set(csvDict[app].keys()) + + versionsList = sorted(list(versions)) + return versionsList + # print total execution time for all applications and corresponding # test sizes def printTotalExecutionTimeTable(csvDict): - print 'application,visc,opencl' + versions = getAllVersions(csvDict) + + line = "application" + for v in versions: + line = line + "," + v + print line + t = 'Final' cat = 'Timer Wall - IO' for app in csvDict.iterkeys(): - v1 = 'visc' - v2 = getOpenCLVersionName(app) tests = getTests(app, csvDict) for test in tests: - print "{0:s}-{1:s},{2:s},{3:s}".format(app, test, csvDict[app][v1][test][t][cat], csvDict[app][v2][test][t][cat]) + line = "{0:s}-{1:s}".format(app, test) + for v in versions: + line = line + "," + csvDict[app][v][test][t][cat] + print line -def plotTotalExecutionTimeTable(csvDict): - global num_figs - t1 = 'GenVISC_Timer' - t2 = 'Final' - cat = 'Timer Wall Time' - visc_time = [] - opencl_time = [] 
- axis_values = [] - n_groups = 0; - for app in csvDict.iterkeys(): - v1 = 'visc' - v2 = getOpenCLVersionName(app) - tests = getTests(app, csvDict) - for test in tests: - n_groups += 1 - visc_time.append(float(csvDict[app][v1][test][t1][cat])) - opencl_time.append(float(csvDict[app][v2][test][t2][cat])) - axis_values.append(app + '\n' + test); - - bar_width = 0.35 - opacity = 0.4 - num_figs += 1 - fig, ax = plt.subplots(num=num_figs) - index = np.arange(n_groups) - bars_visc = plt.bar(index, visc_time, bar_width, alpha=opacity, color='b', label='VISC') - bars_opencl = plt.bar(index + bar_width, opencl_time, bar_width, alpha=opacity, color='r', label='OpenCL') - plt.xlabel('Experiments') - plt.ylabel('Total Execution Time (s)') - plt.title('Total Execution Time - VISC and OpenCL') - plt.xticks(index + bar_width, axis_values) - plt.legend(loc='best') - plt.tight_layout() - -def printTimerDecomposition(csvDict, isVisc): + +def printTimerDecomposition(csvDict, version): # get apps apps = csvDict.keys() + isVisc = isViscVersion(version) + # get tests for each app tests = dict() for app in apps: - tests[app] = getTests(app, csvDict) + tests[app] = csvDict[app][version].keys() # list of timer-category pairs if isVisc: @@ -206,133 +195,26 @@ def printTimerDecomposition(csvDict, isVisc): ('Final', 'IO'), ('Final', 'Timer Wall Time')] - line = "Category," + print version + line = "Category" for app in apps: for test in tests[app]: - line = line + app + "-" + test + "," + line = line + "," + app + "-" + test print line for (t, cat) in timers: - line = cat + "," + line = cat for app in apps: - v = 'visc' if isVisc else getOpenCLVersionName(app) for test in tests[app]: - line = line + csvDict[app][v][test][t][cat] + "," - print line - -def plotTimerDecomposition(csvDict, plotapp): - global num_figs - - # get apps - if not plotapp: - apps = csvDict.keys() - else: - apps = [ plotapp ] - - # get tests for each app - tests = dict() - for app in apps: - tests[app] = getTests(app, 
csvDict) - - # list of timer-category pairs - visc_timers =[('Final', 'Kernel'), - ('Final', 'Load Program Binary'), - ('Final', 'Argument Unpack'), - ('Final', 'Marshal Arguments'), - ('Final', 'Free Memory'), - ('Final', 'Memory Track'), - ('Final', 'Clear Context'), - ('Final', 'Total GPU Computation'), - ('Final', 'Copy Pointer Arguments'), - ('Final', 'Initialize Context'), - ('Final', 'Read Output'), - ('Final', 'Pthread Create'), - ('Final', 'Copy Scalar Arguments'), - ('Final', 'WorkGroup Size Calculation'), - ('Final', 'IO'), - ('Final', 'Output Pack'), - ('Parboil', 'Mem_Untrack'), - ('Parboil', 'Clear_Ctx'), - ('Final', 'Timer Wall - IO'), - ('Final', 'Timer Wall Time')] - opencl_timers =[('Final', 'Init_Ctx'), - ('Final', 'Arg_Unpack'), - ('Final', 'Copy_Scalar'), - ('Final', 'Mem_Track'), - ('Final', 'Driver'), - ('Final', 'Output_Unpack'), - ('Final', 'Arg_Pack'), - ('Final', 'Copy'), - ('Final', 'Compute'), - ('Final', 'Setup'), - ('Final', 'Read_Output'), - ('Final', 'IO'), - ('Final', 'Pthread_Create'), - ('Final', 'Kernel'), - ('Final', 'Mem_Free'), - ('Final', 'Copy Async'), - ('Final', 'Copy_Ptr'), - ('Final', 'Output_Pack'), - ('Final', 'Mem_Untrack'), - ('Final', 'Clear_Ctx'), - ('Final', 'Timer Wall - IO'), - ('Final', 'Timer Wall Time')] - - for app in apps: - for test in tests[app]: -# Plotting visc timers - v = 'visc' - n_vals = 0; - axis_values = [] - visc_time_decomp = [] - for (t, cat) in visc_timers: - n_vals += 1 - axis_values.append(cat); - visc_time_decomp.append(float(csvDict[app][v][test][t][cat])) - - opacity = 0.4 - num_figs += 1 - fig = plt.figure(num=num_figs) - ax = fig.add_subplot(211); - index = np.arange(n_vals) - plt.barh(index, visc_time_decomp, alpha=opacity, color='b', label=v) - plt.xlabel('Time (s)') - plt.ylabel('Timers') - plt.title('Time Decomposition - ' + app + ' ' + test + ' ' + v) - plt.yticks(index, axis_values) - plt.tight_layout() - - v = getOpenCLVersionName(app) - n_vals = 0; - axis_values = [] - 
opencl_time_decomp = [] - for (t, cat) in opencl_timers: - n_vals += 1 - axis_values.append(cat); - opencl_time_decomp.append(float(csvDict[app][v][test][t][cat])) - - opacity = 0.4 - ax = fig.add_subplot(212); - index = np.arange(n_vals) - plt.barh(index, opencl_time_decomp, alpha=opacity, color='r', label=v) - plt.xlabel('Time (s)') - plt.ylabel('Timers') - plt.title('Time Decomposition - ' + app + ' ' + test + ' ' + v) - plt.yticks(index, axis_values) - plt.tight_layout() - + line = line + "," + csvDict[app][version][test][t][cat] + print line # command line options parser parser = OptionParser() parser.add_option("-f","--file",action="store",type="string",dest="filename") parser.add_option("--print-totals",action="store_true",default=False,dest="printTotals") -parser.add_option("--print-timers-visc",action="store_true",default=False,dest="printTimersVisc") -parser.add_option("--print-timers-opencl",action="store_true",default=False,dest="printTimersOpencl") +parser.add_option("--print-timers",action="store",type="string",dest="printTimers") parser.add_option("--print-all",action="store_true",default=False,dest="printAll") -parser.add_option("--plot-totals",action="store_true",default=False,dest="plotTotals") -parser.add_option("--plot-timers",action="store_true",default=False,dest="plotTimers") -parser.add_option("--plot-all",action="store_true",default=False,dest="plotAll") -parser.add_option("--plot-app",action="store",type="string",dest="plotapp") # main def main(): @@ -357,29 +239,15 @@ def main(): printTotalExecutionTimeTable(csvDict) print '' - if options.printTimersVisc or options.printAll: - printTimerDecomposition(csvDict, isVisc=True) + if options.printTimers == "all" or options.printAll: + versions = getAllVersions(csvDict) + for v in versions: + printTimerDecomposition(csvDict, v) print '' - - if options.printTimersOpencl or options.printAll: - printTimerDecomposition(csvDict, isVisc=False) + elif options.printTimers: + 
printTimerDecomposition(csvDict, options.printTimers) print '' - # plot graphs - - if options.plotTotals or options.plotAll: - plotTotalExecutionTimeTable(csvDict) - - if options.plotapp and not options.plotapp in csvDict.keys(): - print "parboilReader.py: Error: No input file was given!" - raise OSError - elif options.plotapp or options.plotTimers or options.plotAll: - plotTimerDecomposition(csvDict, options.plotapp) - - plt.show() - for i in range(num_figs): - plt.close(i) - if __name__ == '__main__': main()