diff --git a/llvm/include/llvm/SupportVISC/DFG2LLVM.h b/llvm/include/llvm/SupportVISC/DFG2LLVM.h index 6f2aa5d02b333afd247df89a6ad68919972dae7b..3cee47b2085459de2db0e23bd60cb972659443ed 100644 --- a/llvm/include/llvm/SupportVISC/DFG2LLVM.h +++ b/llvm/include/llvm/SupportVISC/DFG2LLVM.h @@ -21,6 +21,9 @@ using namespace llvm; using namespace builddfg; #define TIMER(X) do { if (VISCTimer) { X; } } while (0) +#define DECLARE(X) X = M.getOrInsertFunction(#X, \ + runtimeModule->getFunction(#X)->getFunctionType()); \ + DEBUG(errs() << *X) namespace dfg2llvm { // Helper Functions @@ -79,10 +82,11 @@ protected: Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = ""); void addIdxDimArgs(Function* F); Argument* getArgumentAt(Function* F, unsigned offset); + void initTimerAPI(); // Pure Virtual Functions virtual void init() = 0; - virtual void initRuntimeAPI() = 0;// { errs () << "*******Oops called from base class*******\n";} + virtual void initRuntimeAPI() = 0; virtual void codeGen(DFInternalNode* N) = 0; virtual void codeGen(DFLeafNode* N) = 0; @@ -132,20 +136,13 @@ public: // Generate Code for declaring a constant string [L x i8] and return a pointer // to the start of it. Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { - errs() << "Module pointer: " << &M << "\n"; - Constant* SConstant = ConstantDataArray::getString(M.getContext(), S.str(), true); - errs () << "String constant = " << *SConstant << "\n"; Value* SGlobal = new GlobalVariable(M, SConstant->getType(), true, GlobalValue::InternalLinkage, SConstant, Name); - errs () << "String global = " << *SGlobal << "\n"; Value* Zero = ConstantInt::get(Type::getInt64Ty(getGlobalContext()), 0); Value* GEPArgs[] = {Zero, Zero}; - errs () << "Zero = " << *Zero << "\n"; - errs () << "Module = " << *IB->getParent() << "\n"; GetElementPtrInst* SPtr = GetElementPtrInst::Create(SGlobal, ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB); - errs() << "String pointer = " << SPtr << "\n"; return SPtr; } @@ -188,6 +185,12 @@ Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) { return arg; } +void CodeGenTraversal::initTimerAPI() { + DECLARE(llvm_visc_initializeTimerSet); + DECLARE(llvm_visc_switchToTimer); + DECLARE(llvm_visc_printTimerSet); +} + // Timer Routines // Initialize the timer set void CodeGenTraversal::initializeTimerSet(Instruction* InsertBefore) { diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp index c4c10481cc24ea6fa837e1b812a7f05fb4b5864d..6170edf20bdd0ac76268fc666a3d08ae7a134372 100644 --- a/llvm/projects/visc-rt/visc-rt.cpp +++ b/llvm/projects/visc-rt/visc-rt.cpp @@ -977,10 +977,17 @@ void* llvm_visc_ptx_initContext() { DEBUG(cout << "\tEXTENSIONS = " << buffer << "\n"); } // set platform property - just pick the first one + //cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + //(long) platforms[0], + //0}; + //globalGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, + //NULL, NULL, &errcode); + assert(numPlatforms >= 2 && "Expecting two OpenCL platforms"); + // Choose second one which is X86 AVX cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, - (long) platforms[0], + (long) platforms[1], 0}; - globalGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, + globalGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_CPU, NULL, NULL, &errcode); free(platforms); DEBUG(cout << "\tContext " << globalGPUContext << "\n"); @@ -1205,6 +1212,7 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { } Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode); + //Context->clKernel = clCreateKernel(Context->clProgram, "mysgemmNT", &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); free(clDevices); diff --git a/llvm/test/VISC/parboil/.ycm_extra_conf.py b/llvm/test/VISC/parboil/.ycm_extra_conf.py index d8ca60904f525f0332daeec3d9297d83c9db0e80..3615b034aa744c2eb4e3e76492745e0f30fb9a0b 100644 --- a/llvm/test/VISC/parboil/.ycm_extra_conf.py +++ b/llvm/test/VISC/parboil/.ycm_extra_conf.py @@ -49,7 +49,8 @@ flags = [ '-I', 'include', '-I.', '-I./include', - '-isystem', '/usr/local/cuda/include' + '-isystem', '/opt/intel/opencl-sdk/include' + '-isystem', '/usr/local/cuda/include', ] # Set this to the absolute path to the folder (NOT the file!) containing the diff --git a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_cpu/Makefile b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_cpu/Makefile index 82f13f063fed479fdf1e88e5a9ab0c842f3ad19a..7a6e03ba2e7460b04f149af07ff98711f801a8f7 100644 --- a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_cpu/Makefile +++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_cpu/Makefile @@ -4,7 +4,7 @@ LANGUAGE=opencl TOOLS_SRC=common_src/convert-dataset SRCDIR_OBJS=main.o gpu_info.o file.o ocl.o APP_CUDALDFLAGS=-lm -APP_CFLAGS=-ffast-math -g3 -O3 -I$(TOOLS_SRC) +APP_CFLAGS=-ffast-math -g3 -O3 -I$(TOOLS_SRC) -DOPENCL_CPU APP_CXXFLAGS=-ffast-math -g3 -O3 -include $(TOOLS_SRC)/commontools.mk \ No newline at end of file +include $(TOOLS_SRC)/commontools.mk diff --git a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_cpu/main.c b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_cpu/main.c index 98cc2e1d53a65818f31a4140d0bef14941c2f7ce..6d27a92de7d0896acc7d3ceab2d12855fe7c8f7e 100644 --- a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_cpu/main.c +++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_cpu/main.c @@ -19,250 +19,259 @@ #include "ocl.h" #include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) +{ + srand(54321); + int i; + for(i=0; i<dim; i++) + { + x_vector[i] = (rand() / (float) RAND_MAX); + } + return 0; } int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("CUDA accelerated sparse matrix vector multiplication****\n"); - printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - printf("This version maintained by Chris Rodrigues ***********\n"); - parameters = pb_ReadParameters(&argc, argv); - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one input filename\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - //parameters declaration - cl_int clStatus; - - cl_uint numPlatforms; - clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); - CHECK_ERROR("clGetPlatformIDs") - - cl_platform_id clPlatform[numPlatforms]; - clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus); - CHECK_ERROR("clCreateKernel") - - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_ptr; - cl_mem d_perm; - cl_mem d_nzcnt; - - //vector - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("CUDA accelerated sparse matrix vector multiplication****\n"); + printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); + printf("This version maintained by Chris Rodrigues ***********\n"); + parameters = pb_ReadParameters(&argc, argv); + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) + { + fprintf(stderr, "Expecting one input filename\n"); + exit(-1); + } + + //load matrix from files + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + int len; + int depth; + int dim; + int pad=32; + int nzcnt_len; + + //host memory allocation + //matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + //vector + float *h_Ax_vector; + float *h_x_vector; + + + coo_to_jds( + parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? + 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, + &col_count, &dim, &len, &nzcnt_len, &depth + ); + // pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); - - input_vec( parameters->inpFiles[1],h_x_vector,dim); + h_Ax_vector=(float*)malloc(sizeof(float)*dim); + h_x_vector=(float*)malloc(sizeof(float)*dim); + + input_vec( parameters->inpFiles[1],h_x_vector,dim); + + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + //parameters declaration + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + CHECK_ERROR("clGetPlatformIDs") + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + //device memory allocation + //matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_ptr; + cl_mem d_perm; + cl_mem d_nzcnt; + + //vector + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; + cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions,""); + clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus); + CHECK_ERROR("clCreateKernel") - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - OpenCLDeviceProp clDeviceProp; + OpenCLDeviceProp clDeviceProp; // clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - //CHECK_ERROR("clGetDeviceInfo") + //CHECK_ERROR("clGetDeviceInfo") // clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - // CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); + // CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); + CHECK_ERROR("clGetDeviceInfo") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + //memory allocation + d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); + + //memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block); // printf("!!! dim is %d\n",dim); - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - - int i; - for (i=0; i<50; i++) - { - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") - } - - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") - - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") - - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_nzcnt); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + //main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + int i; + for (i=0; i<50; i++) + { + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + //HtoD memory copy + clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); + CHECK_ERROR("clEnqueueReadBuffer") + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_nzcnt); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + if (parameters->outFile) { + /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ + outputData(parameters->outFile,h_Ax_vector,dim); + } + + /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ + + free((void*)clSource[0]); + + free (h_data); + free (h_indices); + free (h_ptr); + free (h_perm); + free (h_nzcnt); + free (h_Ax_vector); + free (h_x_vector); + pb_FreeParameters(parameters); + + return 0; } diff --git a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c index b42bf009c9f8214a84a93b36bcea564bfb496197..8942564dca33a5f5a18fb9d134c0ccec42a13127 100644 --- a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c +++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c @@ -213,7 +213,6 @@ int main(int argc, char** argv) { //main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - printf("grid = %lu, block = %lu\n", grid, block); int i; for(i=0; i<50; i++) diff --git a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c index 25d81214dc6356be4c7ed563ca0e919f595df0fc..021c99ece4031a9c982e2a0219e8823c377bcd94 100644 --- a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c +++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c @@ -175,7 +175,6 @@ int main(int argc, char** argv) { llvm_visc_track_mem(h_x_vector, dim*sizeof(float)); llvm_visc_track_mem(h_ptr, depth*sizeof(int)); llvm_visc_track_mem(h_nzcnt, nzcnt_len*sizeof(int)); - printf("grid = %lu, block = %lu\n", grid, block); //main execution diff --git a/llvm/test/VISC/parboil/common/Makefile.conf b/llvm/test/VISC/parboil/common/Makefile.conf index 31c354f5ee920fd9ae9a763653dfcdb7a4ebef6a..31e0d44d62b351ef3b9e0b1f86647e2ffcdb6211 100644 --- a/llvm/test/VISC/parboil/common/Makefile.conf +++ b/llvm/test/VISC/parboil/common/Makefile.conf @@ -1,5 +1,5 @@ CUDA_PATH=/usr/local/cuda CUDA_LIB_PATH=/usr/local/cuda/lib64 -OPENCL_PATH=/usr/local/cuda +OPENCL_PATH=/opt/intel/opencl-sdk OPENCL_LIB_PATH=/usr/lib64 diff --git a/llvm/test/VISC/parboil/common/include/visc.h b/llvm/test/VISC/parboil/common/include/visc.h index ce3a1f2edbbf5af827f4d1253ededae9fc6ff21c..466bfcac765cb5190f91ab0b77ee1a2d33995ae8 100644 --- a/llvm/test/VISC/parboil/common/include/visc.h +++ b/llvm/test/VISC/parboil/common/include/visc.h @@ -5,12 +5,13 @@ *cr All Rights Reserved *cr ***************************************************************************/ -#include "llvm/SupportVISC/VISCHint.h" #ifdef __cplusplus + +#include "llvm/SupportVISC/VISCHint.h" extern "C" { void __visc__attributes(unsigned, ...); -void __visc__hint(visc::target); +void __visc__hint(visc::Target); void __visc__wait(unsigned); unsigned __visc__node(...); void __visc__init(); diff --git a/llvm/test/VISC/parboil/common/src/parboil_opencl.c b/llvm/test/VISC/parboil/common/src/parboil_opencl.c index 1b038efc0525b511cfa3a6884e14cbdcbef47e92..5f1937f356892489bd78ed5b2fb238d886de2f9a 100644 --- a/llvm/test/VISC/parboil/common/src/parboil_opencl.c +++ b/llvm/test/VISC/parboil/common/src/parboil_opencl.c @@ -232,8 +232,12 @@ pb_Parameters_CountInputs(struct pb_Parameters *p) static int is_async(enum pb_TimerID timer) { +#ifndef OPENCL_CPU return (timer == pb_TimerID_KERNEL) || (timer == pb_TimerID_COPY_ASYNC); +#else + return (timer == pb_TimerID_COPY_ASYNC); +#endif } static int is_blocking(enum pb_TimerID timer)