From bbffd07567475c60b677c91e4488c1fef263ae6c Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava <prakalps@gmail.com> Date: Sat, 13 Jun 2015 19:29:52 -0500 Subject: [PATCH] Modified spmv to use precompiled ptx binary for kernel --- .../spmv/src/opencl_nvidia/kernel_offline.cl | 73 +++ .../src/opencl_nvidia/kernel_offline.nvptx.s | 123 +++++ .../benchmarks/spmv/src/opencl_nvidia/main.c | 469 +++++++++--------- 3 files changed, 432 insertions(+), 233 deletions(-) create mode 100644 llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl create mode 100644 llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.nvptx.s diff --git a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl new file mode 100644 index 0000000000..9a17a299af --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl @@ -0,0 +1,73 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ +#define WARP_BITS 5 + +__kernel void spmv_jds(__global float *dst_vector, __global float *d_data, + __global int *d_index, __global int *d_perm, + __global float *x_vec, const int dim, + __constant int *jds_ptr_int, + __constant int *sh_zcnt_int) +{ + int ix = get_global_id(0); + int warp_id=ix>>WARP_BITS; + + if(ix<dim) + { + float sum=0.0f; + int bound=sh_zcnt_int[warp_id]; + //prefetch 0 + int j=jds_ptr_int[0]+ix; + float d = d_data[j]; + int i = d_index[j]; + float t = x_vec[i]; + + if (bound>1) //bound >=2 + { + //prefetch 1 + j=jds_ptr_int[1]+ix; + i = d_index[j]; + int in; + float dn; + float tn; + for(int k=2;k<bound;k++ ) + { + //prefetch k-1 + dn = d_data[j]; + //prefetch k + j=jds_ptr_int[k]+ix; + in = d_index[j]; + //prefetch k-1 + tn = x_vec[i]; + + //compute k-2 + sum += d*t; + //sweep to k + i = in; + //sweep to k-1 + d = dn; + t =tn; + } + + //fetch last + dn = d_data[j]; + tn = x_vec[i]; + + //compute last-1 + sum += d*t; + //sweep to last + d=dn; + t=tn; + } + //compute last + sum += d*t; // 3 3 + + //write out data + dst_vector[d_perm[ix]]=sum; + } +} + diff --git a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.nvptx.s b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.nvptx.s new file mode 100644 index 0000000000..e65b07e909 --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.nvptx.s @@ -0,0 +1,123 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 3.1 +.target sm_20, texmode_independent +.address_size 32 + + // .globl spmv_jds + +.entry spmv_jds( + .param .u32 .ptr .global .align 4 spmv_jds_param_0, + .param .u32 .ptr .global .align 4 spmv_jds_param_1, + .param .u32 .ptr .global .align 4 spmv_jds_param_2, + .param .u32 .ptr .global .align 4 spmv_jds_param_3, + .param .u32 .ptr .global .align 4 spmv_jds_param_4, + .param .u32 spmv_jds_param_5, + .param .u32 .ptr .global .align 4 spmv_jds_param_6, + .param .u32 .ptr .global .align 4 spmv_jds_param_7 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<39>; + .reg .s32 %r<56>; + + mov.u32 %r20, %ctaid.x; + mov.u32 %r21, %ntid.x; + mov.u32 %r22, %tid.x; + mad.lo.s32 %r1, %r21, %r20, %r22; + ld.param.u32 %r23, [spmv_jds_param_5]; + setp.ge.s32 %p1, %r1, %r23; + @%p1 bra BB0_9; + ld.param.u32 %r13, [spmv_jds_param_0]; + ld.param.u32 %r14, [spmv_jds_param_1]; + ld.param.u32 %r15, [spmv_jds_param_2]; + ld.param.u32 %r16, [spmv_jds_param_3]; + ld.param.u32 %r17, [spmv_jds_param_4]; + ld.param.u32 %r18, [spmv_jds_param_6]; + ld.param.u32 %r19, [spmv_jds_param_7]; + shr.s32 %r24, %r1, 5; + shl.b32 %r25, %r24, 2; + add.s32 %r26, %r19, %r25; + ld.global.u32 %r2, [%r26]; + ld.global.u32 %r27, [%r18]; + add.s32 %r28, %r27, %r1; + shl.b32 %r29, %r28, 2; + add.s32 %r30, %r14, %r29; + ld.global.f32 %f1, [%r30]; + add.s32 %r31, %r15, %r29; + ld.global.u32 %r32, [%r31]; + shl.b32 %r33, %r32, 2; + add.s32 %r34, %r17, %r33; + ld.global.f32 %f2, [%r34]; + setp.lt.s32 %p2, %r2, 2; + @%p2 bra BB0_2; + ld.global.u32 %r35, [%r18+4]; + add.s32 %r36, %r35, %r1; + shl.b32 %r37, %r36, 2; + add.s32 %r38, %r15, %r37; + ld.global.u32 %r3, [%r38]; + add.s32 %r39, %r14, %r37; + ld.global.f32 %f3, [%r39]; + setp.lt.s32 %p3, %r2, 3; + @%p3 bra BB0_4; + add.s32 %r4, %r2, -2; + add.s32 %r5, %r18, 8; + mov.f32 %f22, 0f00000000; + mov.f32 %f24, %f3; + mov.f32 %f29, %f1; +BB0_6: + mov.f32 %f7, %f29; + mov.f32 %f35, %f24; + shl.b32 %r40, %r3, 2; + add.s32 %r41, %r17, %r40; + ld.global.f32 %f8, [%r41]; + fma.rn.f32 %f9, %f2, %f7, %f22; + ld.global.u32 %r42, [%r5]; + add.s32 %r43, %r42, %r1; + shl.b32 %r44, %r43, 2; + add.s32 %r45, %r15, %r44; + ld.global.u32 %r3, [%r45]; + add.s32 %r46, %r14, %r44; + ld.global.f32 %f10, [%r46]; + add.s32 %r10, %r4, -1; + add.s32 %r11, %r5, 4; + setp.ne.s32 %p4, %r10, 0; + mov.u32 %r5, %r11; + mov.u32 %r4, %r10; + mov.f32 %f24, %f10; + mov.f32 %f22, %f9; + mov.f32 %f2, %f8; + mov.f32 %f29, %f35; + mov.f32 %f32, %f10; + @%p4 bra BB0_6; + bra.uni BB0_7; +BB0_2: + mov.f32 %f20, 0f00000000; + mov.f32 %f37, %f1; + bra.uni BB0_8; +BB0_4: + mov.f32 %f22, 0f00000000; + mov.f32 %f32, %f3; + mov.f32 %f35, %f1; +BB0_7: + mov.f32 %f14, %f35; + mov.f32 %f37, %f32; + shl.b32 %r47, %r3, 2; + add.s32 %r48, %r17, %r47; + ld.global.f32 %f15, [%r48]; + fma.rn.f32 %f20, %f2, %f14, %f22; + mov.f32 %f2, %f15; +BB0_8: + fma.rn.f32 %f23, %f37, %f2, %f20; + shl.b32 %r49, %r1, 2; + add.s32 %r50, %r16, %r49; + ld.global.u32 %r51, [%r50]; + shl.b32 %r52, %r51, 2; + add.s32 %r53, %r13, %r52; + st.global.f32 [%r53], %f23; +BB0_9: + ret; +} + diff --git a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c index 661fc7a2c0..155dcb1575 100644 --- a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c +++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c @@ -18,253 +18,256 @@ #include "convert_dataset.h" -static int generate_vector(float *x_vector, int dim) -{ - srand(54321); - int i; - for(i=0;i<dim;i++) - { - x_vector[i] = (rand() / (float) RAND_MAX); - } - return 0; +static int generate_vector(float *x_vector, int dim) +{ + srand(54321); + int i; + for(i=0; i<dim; i++) + { + x_vector[i] = (rand() / (float) RAND_MAX); + } + return 0; } int main(int argc, char** argv) { - struct pb_TimerSet timers; - struct pb_Parameters *parameters; - - printf("OpenCL accelerated sparse matrix vector multiplication****\n"); - printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); - parameters = pb_ReadParameters(&argc, argv); - - if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) - { - fprintf(stderr, "Expecting one two filenames\n"); - exit(-1); - } - - pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - - cl_int clStatus; - cl_platform_id clPlatform; - clStatus = clGetPlatformIDs(1,&clPlatform,NULL); - CHECK_ERROR("clGetPlatformIDs") - - cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; - - cl_device_id clDevice; - clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); - CHECK_ERROR("clGetDeviceIDs") - - cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") - - cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") - - pb_SetOpenCL(&clContext, &clCommandQueue); - - const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") - - char clOptions[50]; - sprintf(clOptions,""); - clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus); - CHECK_ERROR("clCreateKernel") + struct pb_TimerSet timers; + struct pb_Parameters *parameters; + + printf("OpenCL accelerated sparse matrix vector multiplication****\n"); + printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); + parameters = pb_ReadParameters(&argc, argv); + + if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) + { + fprintf(stderr, "Expecting one two filenames\n"); + exit(-1); + } + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + + cl_int clStatus; + cl_platform_id clPlatform; + clStatus = clGetPlatformIDs(1,&clPlatform,NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + //CHECK_ERROR("clCreateProgramWithSource") + + //char clOptions[50]; + //sprintf(clOptions,""); + //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + //CHECK_ERROR("clBuildProgram") + + //cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus); + //CHECK_ERROR("clCreateKernel") + cl_kernel clKernel; + cl_program clProgram; + pb_CreateAndBuildKernelFromBinary("src/opencl_nvidia/kernel_offline.nvptx.s", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //parameters declaration - int len; - int depth; - int dim; - int pad=32; - int nzcnt_len; - - //host memory allocation - //matrix - float *h_data; - int *h_indices; - int *h_ptr; - int *h_perm; - int *h_nzcnt; - - //vector - float *h_Ax_vector; - float *h_x_vector; - - //device memory allocation - //matrix - cl_mem d_data; - cl_mem d_indices; - cl_mem d_perm; - cl_mem d_nzcnt; - cl_mem d_Ax_vector; - cl_mem d_x_vector; - - cl_mem jds_ptr_int; - cl_mem sh_zcnt_int; - - //load matrix from files - pb_SwitchToTimer(&timers, pb_TimerID_IO); - //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, - // &h_data, &h_indices, &h_ptr, - // &h_perm, &h_nzcnt); - int col_count; - - coo_to_jds( - parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx - 1, // row padding - pad, // warp size - 1, // pack size - 1, // is mirrored? - 0, // binary matrix - 1, // debug level [0:2] - &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, - &col_count, &dim, &len, &nzcnt_len, &depth - ); - - - h_Ax_vector=(float*)malloc(sizeof(float)*dim); - h_x_vector=(float*)malloc(sizeof(float)*dim); + //parameters declaration + int len; + int depth; + int dim; + int pad=32; + int nzcnt_len; + + //host memory allocation + //matrix + float *h_data; + int *h_indices; + int *h_ptr; + int *h_perm; + int *h_nzcnt; + + //vector + float *h_Ax_vector; + float *h_x_vector; + + //device memory allocation + //matrix + cl_mem d_data; + cl_mem d_indices; + cl_mem d_perm; + cl_mem d_nzcnt; + cl_mem d_Ax_vector; + cl_mem d_x_vector; + + cl_mem jds_ptr_int; + cl_mem sh_zcnt_int; + + //load matrix from files + pb_SwitchToTimer(&timers, pb_TimerID_IO); + //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, + // &h_data, &h_indices, &h_ptr, + // &h_perm, &h_nzcnt); + int col_count; + + coo_to_jds( + parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx + 1, // row padding + pad, // warp size + 1, // pack size + 1, // is mirrored? + 0, // binary matrix + 1, // debug level [0:2] + &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, + &col_count, &dim, &len, &nzcnt_len, &depth + ); + + + h_Ax_vector=(float*)malloc(sizeof(float)*dim); + h_x_vector=(float*)malloc(sizeof(float)*dim); input_vec( parameters->inpFiles[1],h_x_vector,dim); pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - OpenCLDeviceProp clDeviceProp; - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); - CHECK_ERROR("clGetDeviceInfo") - clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); - CHECK_ERROR("clGetDeviceInfo") - - -pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //memory allocation - d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); - CHECK_ERROR("clCreateBuffer") - - clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); - - //memory copy - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - size_t grid; - size_t block; - - compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); - CHECK_ERROR("clSetKernelArg") - - clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); - CHECK_ERROR("clSetKernelArg") - clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); - CHECK_ERROR("clSetKernelArg") - - //main execution - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - printf("grid = %lu, block = %lu\n", grid, block); - - int i; - for(i=0; i<50; i++) - { + OpenCLDeviceProp clDeviceProp; + clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); + CHECK_ERROR("clGetDeviceInfo") + clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); + CHECK_ERROR("clGetDeviceInfo") + + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + //memory allocation + d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + + jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + + clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); + + //memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + size_t grid; + size_t block; + + compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); + CHECK_ERROR("clSetKernelArg") + + clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); + CHECK_ERROR("clSetKernelArg") + clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); + CHECK_ERROR("clSetKernelArg") + + //main execution + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + printf("grid = %lu, block = %lu\n", grid, block); + + int i; + for(i=0; i<50; i++) + { pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") + clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - } + } + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") - clStatus = clFinish(clCommandQueue); - CHECK_ERROR("clFinish") + pb_SwitchToTimer(&timers, pb_TimerID_COPY); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - //HtoD memory copy - clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); - CHECK_ERROR("clEnqueueReadBuffer") + //HtoD memory copy + clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); + CHECK_ERROR("clEnqueueReadBuffer") pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - clStatus = clReleaseKernel(clKernel); - clStatus = clReleaseProgram(clProgram); - - clStatus = clReleaseMemObject(d_data); - clStatus = clReleaseMemObject(d_indices); - clStatus = clReleaseMemObject(d_perm); - clStatus = clReleaseMemObject(d_nzcnt); - clStatus = clReleaseMemObject(d_x_vector); - clStatus = clReleaseMemObject(d_Ax_vector); - CHECK_ERROR("clReleaseMemObject") - - clStatus = clReleaseCommandQueue(clCommandQueue); - clStatus = clReleaseContext(clContext); - - if (parameters->outFile) { - pb_SwitchToTimer(&timers, pb_TimerID_IO); - outputData(parameters->outFile,h_Ax_vector,dim); - - } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - - free((void*)clSource[0]); - - free (h_data); - free (h_indices); - free (h_ptr); - free (h_perm); - free (h_nzcnt); - free (h_Ax_vector); - free (h_x_vector); - - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - - pb_PrintTimerSet(&timers); - pb_FreeParameters(parameters); - - return 0; + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + + clStatus = clReleaseMemObject(d_data); + clStatus = clReleaseMemObject(d_indices); + clStatus = clReleaseMemObject(d_perm); + clStatus = clReleaseMemObject(d_nzcnt); + clStatus = clReleaseMemObject(d_x_vector); + clStatus = clReleaseMemObject(d_Ax_vector); + CHECK_ERROR("clReleaseMemObject") + + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + if (parameters->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + outputData(parameters->outFile,h_Ax_vector,dim); + + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + //free((void*)clSource[0]); + + free (h_data); + free (h_indices); + free (h_ptr); + free (h_perm); + free (h_nzcnt); + free (h_Ax_vector); + free (h_x_vector); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; } -- GitLab