Skip to content
Snippets Groups Projects
Commit bbffd075 authored by Prakalp Srivastava's avatar Prakalp Srivastava
Browse files

Modified spmv to use precompiled ptx binary for kernel

parent dbd5168a
No related branches found
No related tags found
No related merge requests found
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#define WARP_BITS 5
__kernel void spmv_jds(__global float *dst_vector, __global float *d_data,
__global int *d_index, __global int *d_perm,
__global float *x_vec, const int dim,
__constant int *jds_ptr_int,
__constant int *sh_zcnt_int)
{
int ix = get_global_id(0);
int warp_id=ix>>WARP_BITS;
if(ix<dim)
{
float sum=0.0f;
int bound=sh_zcnt_int[warp_id];
//prefetch 0
int j=jds_ptr_int[0]+ix;
float d = d_data[j];
int i = d_index[j];
float t = x_vec[i];
if (bound>1) //bound >=2
{
//prefetch 1
j=jds_ptr_int[1]+ix;
i = d_index[j];
int in;
float dn;
float tn;
for(int k=2;k<bound;k++ )
{
//prefetch k-1
dn = d_data[j];
//prefetch k
j=jds_ptr_int[k]+ix;
in = d_index[j];
//prefetch k-1
tn = x_vec[i];
//compute k-2
sum += d*t;
//sweep to k
i = in;
//sweep to k-1
d = dn;
t =tn;
}
//fetch last
dn = d_data[j];
tn = x_vec[i];
//compute last-1
sum += d*t;
//sweep to last
d=dn;
t=tn;
}
//compute last
sum += d*t; // 3 3
//write out data
dst_vector[d_perm[ix]]=sum;
}
}
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_20, texmode_independent
.address_size 32
// .globl spmv_jds
.entry spmv_jds(
.param .u32 .ptr .global .align 4 spmv_jds_param_0,
.param .u32 .ptr .global .align 4 spmv_jds_param_1,
.param .u32 .ptr .global .align 4 spmv_jds_param_2,
.param .u32 .ptr .global .align 4 spmv_jds_param_3,
.param .u32 .ptr .global .align 4 spmv_jds_param_4,
.param .u32 spmv_jds_param_5,
.param .u32 .ptr .global .align 4 spmv_jds_param_6,
.param .u32 .ptr .global .align 4 spmv_jds_param_7
)
{
.reg .pred %p<5>;
.reg .f32 %f<39>;
.reg .s32 %r<56>;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %ntid.x;
mov.u32 %r22, %tid.x;
mad.lo.s32 %r1, %r21, %r20, %r22;
ld.param.u32 %r23, [spmv_jds_param_5];
setp.ge.s32 %p1, %r1, %r23;
@%p1 bra BB0_9;
ld.param.u32 %r13, [spmv_jds_param_0];
ld.param.u32 %r14, [spmv_jds_param_1];
ld.param.u32 %r15, [spmv_jds_param_2];
ld.param.u32 %r16, [spmv_jds_param_3];
ld.param.u32 %r17, [spmv_jds_param_4];
ld.param.u32 %r18, [spmv_jds_param_6];
ld.param.u32 %r19, [spmv_jds_param_7];
shr.s32 %r24, %r1, 5;
shl.b32 %r25, %r24, 2;
add.s32 %r26, %r19, %r25;
ld.global.u32 %r2, [%r26];
ld.global.u32 %r27, [%r18];
add.s32 %r28, %r27, %r1;
shl.b32 %r29, %r28, 2;
add.s32 %r30, %r14, %r29;
ld.global.f32 %f1, [%r30];
add.s32 %r31, %r15, %r29;
ld.global.u32 %r32, [%r31];
shl.b32 %r33, %r32, 2;
add.s32 %r34, %r17, %r33;
ld.global.f32 %f2, [%r34];
setp.lt.s32 %p2, %r2, 2;
@%p2 bra BB0_2;
ld.global.u32 %r35, [%r18+4];
add.s32 %r36, %r35, %r1;
shl.b32 %r37, %r36, 2;
add.s32 %r38, %r15, %r37;
ld.global.u32 %r3, [%r38];
add.s32 %r39, %r14, %r37;
ld.global.f32 %f3, [%r39];
setp.lt.s32 %p3, %r2, 3;
@%p3 bra BB0_4;
add.s32 %r4, %r2, -2;
add.s32 %r5, %r18, 8;
mov.f32 %f22, 0f00000000;
mov.f32 %f24, %f3;
mov.f32 %f29, %f1;
BB0_6:
mov.f32 %f7, %f29;
mov.f32 %f35, %f24;
shl.b32 %r40, %r3, 2;
add.s32 %r41, %r17, %r40;
ld.global.f32 %f8, [%r41];
fma.rn.f32 %f9, %f2, %f7, %f22;
ld.global.u32 %r42, [%r5];
add.s32 %r43, %r42, %r1;
shl.b32 %r44, %r43, 2;
add.s32 %r45, %r15, %r44;
ld.global.u32 %r3, [%r45];
add.s32 %r46, %r14, %r44;
ld.global.f32 %f10, [%r46];
add.s32 %r10, %r4, -1;
add.s32 %r11, %r5, 4;
setp.ne.s32 %p4, %r10, 0;
mov.u32 %r5, %r11;
mov.u32 %r4, %r10;
mov.f32 %f24, %f10;
mov.f32 %f22, %f9;
mov.f32 %f2, %f8;
mov.f32 %f29, %f35;
mov.f32 %f32, %f10;
@%p4 bra BB0_6;
bra.uni BB0_7;
BB0_2:
mov.f32 %f20, 0f00000000;
mov.f32 %f37, %f1;
bra.uni BB0_8;
BB0_4:
mov.f32 %f22, 0f00000000;
mov.f32 %f32, %f3;
mov.f32 %f35, %f1;
BB0_7:
mov.f32 %f14, %f35;
mov.f32 %f37, %f32;
shl.b32 %r47, %r3, 2;
add.s32 %r48, %r17, %r47;
ld.global.f32 %f15, [%r48];
fma.rn.f32 %f20, %f2, %f14, %f22;
mov.f32 %f2, %f15;
BB0_8:
fma.rn.f32 %f23, %f37, %f2, %f20;
shl.b32 %r49, %r1, 2;
add.s32 %r50, %r16, %r49;
ld.global.u32 %r51, [%r50];
shl.b32 %r52, %r51, 2;
add.s32 %r53, %r13, %r52;
st.global.f32 [%r53], %f23;
BB0_9:
ret;
}
...@@ -18,253 +18,256 @@ ...@@ -18,253 +18,256 @@
#include "convert_dataset.h" #include "convert_dataset.h"
static int generate_vector(float *x_vector, int dim) static int generate_vector(float *x_vector, int dim)
{ {
srand(54321); srand(54321);
int i; int i;
for(i=0;i<dim;i++) for(i=0; i<dim; i++)
{ {
x_vector[i] = (rand() / (float) RAND_MAX); x_vector[i] = (rand() / (float) RAND_MAX);
} }
return 0; return 0;
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
struct pb_TimerSet timers; struct pb_TimerSet timers;
struct pb_Parameters *parameters; struct pb_Parameters *parameters;
printf("OpenCL accelerated sparse matrix vector multiplication****\n"); printf("OpenCL accelerated sparse matrix vector multiplication****\n");
printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n"); printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
parameters = pb_ReadParameters(&argc, argv); parameters = pb_ReadParameters(&argc, argv);
if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
{ {
fprintf(stderr, "Expecting one two filenames\n"); fprintf(stderr, "Expecting one two filenames\n");
exit(-1); exit(-1);
} }
pb_InitializeTimerSet(&timers); pb_InitializeTimerSet(&timers);
pb_SwitchToTimer(&timers, visc_TimerID_SETUP); pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
cl_int clStatus; cl_int clStatus;
cl_platform_id clPlatform; cl_platform_id clPlatform;
clStatus = clGetPlatformIDs(1,&clPlatform,NULL); clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
CHECK_ERROR("clGetPlatformIDs") CHECK_ERROR("clGetPlatformIDs")
cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0}; cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
cl_device_id clDevice; cl_device_id clDevice;
clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL); clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
CHECK_ERROR("clGetDeviceIDs") CHECK_ERROR("clGetDeviceIDs")
cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
CHECK_ERROR("clCreateContextFromType") CHECK_ERROR("clCreateContextFromType")
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
CHECK_ERROR("clCreateCommandQueue") CHECK_ERROR("clCreateCommandQueue")
pb_SetOpenCL(&clContext, &clCommandQueue); pb_SetOpenCL(&clContext, &clCommandQueue);
const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
CHECK_ERROR("clCreateProgramWithSource") //CHECK_ERROR("clCreateProgramWithSource")
char clOptions[50]; //char clOptions[50];
sprintf(clOptions,""); //sprintf(clOptions,"");
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
CHECK_ERROR("clBuildProgram") //CHECK_ERROR("clBuildProgram")
cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus); //cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus);
CHECK_ERROR("clCreateKernel") //CHECK_ERROR("clCreateKernel")
cl_kernel clKernel;
cl_program clProgram;
pb_CreateAndBuildKernelFromBinary("src/opencl_nvidia/kernel_offline.nvptx.s", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
//parameters declaration //parameters declaration
int len; int len;
int depth; int depth;
int dim; int dim;
int pad=32; int pad=32;
int nzcnt_len; int nzcnt_len;
//host memory allocation //host memory allocation
//matrix //matrix
float *h_data; float *h_data;
int *h_indices; int *h_indices;
int *h_ptr; int *h_ptr;
int *h_perm; int *h_perm;
int *h_nzcnt; int *h_nzcnt;
//vector //vector
float *h_Ax_vector; float *h_Ax_vector;
float *h_x_vector; float *h_x_vector;
//device memory allocation //device memory allocation
//matrix //matrix
cl_mem d_data; cl_mem d_data;
cl_mem d_indices; cl_mem d_indices;
cl_mem d_perm; cl_mem d_perm;
cl_mem d_nzcnt; cl_mem d_nzcnt;
cl_mem d_Ax_vector; cl_mem d_Ax_vector;
cl_mem d_x_vector; cl_mem d_x_vector;
cl_mem jds_ptr_int; cl_mem jds_ptr_int;
cl_mem sh_zcnt_int; cl_mem sh_zcnt_int;
//load matrix from files //load matrix from files
pb_SwitchToTimer(&timers, pb_TimerID_IO); pb_SwitchToTimer(&timers, pb_TimerID_IO);
//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
// &h_data, &h_indices, &h_ptr, // &h_data, &h_indices, &h_ptr,
// &h_perm, &h_nzcnt); // &h_perm, &h_nzcnt);
int col_count; int col_count;
coo_to_jds( coo_to_jds(
parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
1, // row padding 1, // row padding
pad, // warp size pad, // warp size
1, // pack size 1, // pack size
1, // is mirrored? 1, // is mirrored?
0, // binary matrix 0, // binary matrix
1, // debug level [0:2] 1, // debug level [0:2]
&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
&col_count, &dim, &len, &nzcnt_len, &depth &col_count, &dim, &len, &nzcnt_len, &depth
); );
h_Ax_vector=(float*)malloc(sizeof(float)*dim); h_Ax_vector=(float*)malloc(sizeof(float)*dim);
h_x_vector=(float*)malloc(sizeof(float)*dim); h_x_vector=(float*)malloc(sizeof(float)*dim);
input_vec( parameters->inpFiles[1],h_x_vector,dim); input_vec( parameters->inpFiles[1],h_x_vector,dim);
pb_SwitchToTimer(&timers, visc_TimerID_SETUP); pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
OpenCLDeviceProp clDeviceProp; OpenCLDeviceProp clDeviceProp;
clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL); clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
CHECK_ERROR("clGetDeviceInfo") CHECK_ERROR("clGetDeviceInfo")
clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL); clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
CHECK_ERROR("clGetDeviceInfo") CHECK_ERROR("clGetDeviceInfo")
clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL); clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
CHECK_ERROR("clGetDeviceInfo") CHECK_ERROR("clGetDeviceInfo")
pb_SwitchToTimer(&timers, pb_TimerID_COPY); pb_SwitchToTimer(&timers, pb_TimerID_COPY);
//memory allocation //memory allocation
d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus); d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer") CHECK_ERROR("clCreateBuffer")
d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus); d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer") CHECK_ERROR("clCreateBuffer")
d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus); d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer") CHECK_ERROR("clCreateBuffer")
d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus); d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer") CHECK_ERROR("clCreateBuffer")
d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus); d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer") CHECK_ERROR("clCreateBuffer")
jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer") CHECK_ERROR("clCreateBuffer")
sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus); sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer") CHECK_ERROR("clCreateBuffer")
clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float)); clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
//memory copy //memory copy
clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL); clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer") CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL); clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer") CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL); clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer") CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL); clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer") CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL); clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer") CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL); clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer") CHECK_ERROR("clEnqueueWriteBuffer")
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
size_t grid; size_t grid;
size_t block; size_t block;
compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount); compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
pb_SwitchToTimer(&timers, visc_TimerID_SETUP); pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector); clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
CHECK_ERROR("clSetKernelArg") CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data); clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
CHECK_ERROR("clSetKernelArg") CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices); clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
CHECK_ERROR("clSetKernelArg") CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm); clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
CHECK_ERROR("clSetKernelArg") CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector); clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
CHECK_ERROR("clSetKernelArg") CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim); clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
CHECK_ERROR("clSetKernelArg") CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int); clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
CHECK_ERROR("clSetKernelArg") CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int); clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
CHECK_ERROR("clSetKernelArg") CHECK_ERROR("clSetKernelArg")
//main execution //main execution
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
printf("grid = %lu, block = %lu\n", grid, block); printf("grid = %lu, block = %lu\n", grid, block);
int i; int i;
for(i=0; i<50; i++) for(i=0; i<50; i++)
{ {
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL); clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
CHECK_ERROR("clEnqueueNDRangeKernel") CHECK_ERROR("clEnqueueNDRangeKernel")
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
} }
clStatus = clFinish(clCommandQueue);
CHECK_ERROR("clFinish")
clStatus = clFinish(clCommandQueue); pb_SwitchToTimer(&timers, pb_TimerID_COPY);
CHECK_ERROR("clFinish")
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
//HtoD memory copy //HtoD memory copy
clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL); clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
CHECK_ERROR("clEnqueueReadBuffer") CHECK_ERROR("clEnqueueReadBuffer")
pb_SwitchToTimer(&timers, visc_TimerID_SETUP); pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
clStatus = clReleaseKernel(clKernel); clStatus = clReleaseKernel(clKernel);
clStatus = clReleaseProgram(clProgram); clStatus = clReleaseProgram(clProgram);
clStatus = clReleaseMemObject(d_data); clStatus = clReleaseMemObject(d_data);
clStatus = clReleaseMemObject(d_indices); clStatus = clReleaseMemObject(d_indices);
clStatus = clReleaseMemObject(d_perm); clStatus = clReleaseMemObject(d_perm);
clStatus = clReleaseMemObject(d_nzcnt); clStatus = clReleaseMemObject(d_nzcnt);
clStatus = clReleaseMemObject(d_x_vector); clStatus = clReleaseMemObject(d_x_vector);
clStatus = clReleaseMemObject(d_Ax_vector); clStatus = clReleaseMemObject(d_Ax_vector);
CHECK_ERROR("clReleaseMemObject") CHECK_ERROR("clReleaseMemObject")
clStatus = clReleaseCommandQueue(clCommandQueue); clStatus = clReleaseCommandQueue(clCommandQueue);
clStatus = clReleaseContext(clContext); clStatus = clReleaseContext(clContext);
if (parameters->outFile) { if (parameters->outFile) {
pb_SwitchToTimer(&timers, pb_TimerID_IO); pb_SwitchToTimer(&timers, pb_TimerID_IO);
outputData(parameters->outFile,h_Ax_vector,dim); outputData(parameters->outFile,h_Ax_vector,dim);
} }
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
free((void*)clSource[0]); //free((void*)clSource[0]);
free (h_data); free (h_data);
free (h_indices); free (h_indices);
free (h_ptr); free (h_ptr);
free (h_perm); free (h_perm);
free (h_nzcnt); free (h_nzcnt);
free (h_Ax_vector); free (h_Ax_vector);
free (h_x_vector); free (h_x_vector);
pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers); pb_PrintTimerSet(&timers);
pb_FreeParameters(parameters); pb_FreeParameters(parameters);
return 0; return 0;
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment