Skip to content
Snippets Groups Projects
Commit c988910f authored by Prakalp Srivastava's avatar Prakalp Srivastava
Browse files

moved spmv to c++. Required for dummy visc calls

parent 4740089d
No related branches found
No related tags found
No related merge requests found
......@@ -6,6 +6,6 @@ SRCDIR_OBJS=gpu_info.ll file.ll
VISC_OBJS=main.visc.ll
APP_CUDALDFLAGS=-lm
APP_CFLAGS=-ffast-math -O3 -I$(TOOLS_SRC)
APP_CXXFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3 -I$(TOOLS_SRC)
include $(TOOLS_SRC)/commontools.mk
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
//#include <CL/cl.h>
//#include <CL/cl_ext.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <parboil.h>
#include <visc.h>
#include "file.h"
#include "gpu_info.h"
#include "convert_dataset.h"
#define WARP_BITS 5
static int generate_vector(float *x_vector, int dim)
{
srand(54321);
int i;
for(i=0; i<dim; i++)
{
x_vector[i] = (rand() / (float) RAND_MAX);
}
return 0;
}
void spmv_jds(float *dst_vector, float *d_data,
int *d_index, int *d_perm,
float *x_vec, int dim,
int *jds_ptr_int,
int *sh_zcnt_int)
{
__visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
1, dst_vector);
int ix = get_global_id(0);
int warp_id=ix>>WARP_BITS;
if(ix<dim)
{
float sum=0.0f;
int bound=sh_zcnt_int[warp_id];
//prefetch 0
int j=jds_ptr_int[0]+ix;
float d = d_data[j];
int i = d_index[j];
float t = x_vec[i];
if (bound>1) //bound >=2
{
//prefetch 1
j=jds_ptr_int[1]+ix;
i = d_index[j];
int in;
float dn;
float tn;
for(int k=2; k<bound; k++ )
{
//prefetch k-1
dn = d_data[j];
//prefetch k
j=jds_ptr_int[k]+ix;
in = d_index[j];
//prefetch k-1
tn = x_vec[i];
//compute k-2
sum += d*t;
//sweep to k
i = in;
//sweep to k-1
d = dn;
t =tn;
}
//fetch last
dn = d_data[j];
tn = x_vec[i];
//compute last-1
sum += d*t;
//sweep to last
d=dn;
t=tn;
}
//compute last
sum += d*t; // 3 3
//write out data
dst_vector[d_perm[ix]]=sum;
}
}
int main(int argc, char** argv) {
struct pb_TimerSet timers;
struct pb_Parameters *parameters;
printf("OpenCL accelerated sparse matrix vector multiplication****\n");
printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
parameters = pb_ReadParameters(&argc, argv);
if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
{
fprintf(stderr, "Expecting one two filenames\n");
exit(-1);
}
/*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
//parameters declaration
int len;
int depth;
int dim;
int pad=32;
int nzcnt_len;
//host memory allocation
//matrix
float *h_data;
int *h_indices;
int *h_ptr;
int *h_perm;
int *h_nzcnt;
//vector
float *h_Ax_vector;
float *h_x_vector;
//load matrix from files
/*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
// &h_data, &h_indices, &h_ptr,
// &h_perm, &h_nzcnt);
int col_count;
coo_to_jds(
parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
1, // row padding
pad, // warp size
1, // pack size
1, // is mirrored?
0, // binary matrix
1, // debug level [0:2]
&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
&col_count, &dim, &len, &nzcnt_len, &depth
);
h_Ax_vector=(float*)malloc(sizeof(float)*dim);
h_x_vector=(float*)malloc(sizeof(float)*dim);
input_vec( parameters->inpFiles[1],h_x_vector,dim);
pb_InitializeTimerSet(&timers);
__visc__init();
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
memset(h_Ax_vector, 0, dim*sizeof(float));
size_t grid;
size_t block;
compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8);
pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
llvm_visc_track_mem(h_Ax_vector, dim*sizeof(float));
llvm_visc_track_mem(h_data, len*sizeof(float));
llvm_visc_track_mem(h_indices, len*sizeof(int));
llvm_visc_track_mem(h_perm, dim*sizeof(int));
llvm_visc_track_mem(h_x_vector, dim*sizeof(float));
llvm_visc_track_mem(h_ptr, depth*sizeof(int));
llvm_visc_track_mem(h_nzcnt, nzcnt_len*sizeof(int));
//main execution
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
int i;
for(i=0; i<50; i++)
{
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
unsigned spmvDFG = __visc__node(spmv_jds, 2, 1, (unsigned)block, (unsigned)(grid/block),
15,
h_Ax_vector,
dim*sizeof(float),
h_data,
len*sizeof(float),
h_indices,
len*sizeof(int),
h_perm,
dim*sizeof(int),
h_x_vector,
dim*sizeof(float),
dim,
h_ptr,
depth*sizeof(int),
h_nzcnt,
nzcnt_len*sizeof(int),
0);
__visc__wait(spmvDFG);
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
/******************************* Issues *******************************
* 1. Using OpenCL to compute grid and block dimensions
* (getting device info)
* We need to check the GPU version (major number) where this kernel
* executes to compare against opencl_nvidia version
* 2. Type of cl_mem buffer for d_x_vector is created with size of float,
but copied in through size of int.
Due to type of h_x_vector, I chose to use float
* (Minor)
* 3. Kernel initially used constant memory for last two arguments - removed
*/
}
//HtoD memory copy
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
llvm_visc_request_mem(h_Ax_vector, dim*sizeof(float));
pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
llvm_visc_untrack_mem(h_Ax_vector);
llvm_visc_untrack_mem(h_data);
llvm_visc_untrack_mem(h_indices);
llvm_visc_untrack_mem(h_perm);
llvm_visc_untrack_mem(h_x_vector);
llvm_visc_untrack_mem(h_ptr);
llvm_visc_untrack_mem(h_nzcnt);
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
__visc__cleanup();
if (parameters->outFile) {
/*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
outputData(parameters->outFile,h_Ax_vector,dim);
}
free (h_data);
free (h_indices);
free (h_ptr);
free (h_perm);
free (h_nzcnt);
free (h_Ax_vector);
free (h_x_vector);
pb_FreeParameters(parameters);
return 0;
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
//#include <CL/cl.h>
//#include <CL/cl_ext.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <parboil.h>
#include <visc.h>
#include "file.h"
#include "gpu_info.h"
#include "convert_dataset.h"
#define WARP_BITS 5
static int generate_vector(float *x_vector, int dim)
{
srand(54321);
int i;
for(i=0; i<dim; i++)
{
x_vector[i] = (rand() / (float) RAND_MAX);
}
return 0;
}
void spmv_jds(float *dst_vector, float *d_data,
int *d_index, int *d_perm,
float *x_vec, int dim,
int *jds_ptr_int,
int *sh_zcnt_int)
{
__visc__hint(visc::DEVICE);
__visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
1, dst_vector);
int ix = get_global_id(0);
int warp_id=ix>>WARP_BITS;
if(ix<dim)
{
float sum=0.0f;
int bound=sh_zcnt_int[warp_id];
//prefetch 0
int j=jds_ptr_int[0]+ix;
float d = d_data[j];
int i = d_index[j];
float t = x_vec[i];
if (bound>1) //bound >=2
{
//prefetch 1
j=jds_ptr_int[1]+ix;
i = d_index[j];
int in;
float dn;
float tn;
for(int k=2; k<bound; k++ )
{
//prefetch k-1
dn = d_data[j];
//prefetch k
j=jds_ptr_int[k]+ix;
in = d_index[j];
//prefetch k-1
tn = x_vec[i];
//compute k-2
sum += d*t;
//sweep to k
i = in;
//sweep to k-1
d = dn;
t =tn;
}
//fetch last
dn = d_data[j];
tn = x_vec[i];
//compute last-1
sum += d*t;
//sweep to last
d=dn;
t=tn;
}
//compute last
sum += d*t; // 3 3
//write out data
dst_vector[d_perm[ix]]=sum;
}
}
int main(int argc, char** argv) {
struct pb_TimerSet timers;
struct pb_Parameters *parameters;
printf("OpenCL accelerated sparse matrix vector multiplication****\n");
printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
parameters = pb_ReadParameters(&argc, argv);
if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
{
fprintf(stderr, "Expecting one two filenames\n");
exit(-1);
}
/*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
//parameters declaration
int len;
int depth;
int dim;
int pad=32;
int nzcnt_len;
//host memory allocation
//matrix
float *h_data;
int *h_indices;
int *h_ptr;
int *h_perm;
int *h_nzcnt;
//vector
float *h_Ax_vector;
float *h_x_vector;
//load matrix from files
/*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
// &h_data, &h_indices, &h_ptr,
// &h_perm, &h_nzcnt);
int col_count;
coo_to_jds(
parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
1, // row padding
pad, // warp size
1, // pack size
1, // is mirrored?
0, // binary matrix
1, // debug level [0:2]
&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
&col_count, &dim, &len, &nzcnt_len, &depth
);
h_Ax_vector=(float*)malloc(sizeof(float)*dim);
h_x_vector=(float*)malloc(sizeof(float)*dim);
input_vec( parameters->inpFiles[1],h_x_vector,dim);
pb_InitializeTimerSet(&timers);
__visc__init();
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
memset(h_Ax_vector, 0, dim*sizeof(float));
size_t grid;
size_t block;
compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8);
pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
llvm_visc_track_mem(h_Ax_vector, dim*sizeof(float));
llvm_visc_track_mem(h_data, len*sizeof(float));
llvm_visc_track_mem(h_indices, len*sizeof(int));
llvm_visc_track_mem(h_perm, dim*sizeof(int));
llvm_visc_track_mem(h_x_vector, dim*sizeof(float));
llvm_visc_track_mem(h_ptr, depth*sizeof(int));
llvm_visc_track_mem(h_nzcnt, nzcnt_len*sizeof(int));
//main execution
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
int i;
for(i=0; i<50; i++)
{
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
void* spmvDFG = __visc__node(spmv_jds, 2, 1, (unsigned)block, (unsigned)(grid/block),
15,
h_Ax_vector,
dim*sizeof(float),
h_data,
len*sizeof(float),
h_indices,
len*sizeof(int),
h_perm,
dim*sizeof(int),
h_x_vector,
dim*sizeof(float),
dim,
h_ptr,
depth*sizeof(int),
h_nzcnt,
nzcnt_len*sizeof(int),
0);
__visc__wait(spmvDFG);
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
/******************************* Issues *******************************
* 1. Using OpenCL to compute grid and block dimensions
* (getting device info)
* We need to check the GPU version (major number) where this kernel
* executes to compare against opencl_nvidia version
* 2. Type of cl_mem buffer for d_x_vector is created with size of float,
but copied in through size of int.
Due to type of h_x_vector, I chose to use float
* (Minor)
* 3. Kernel initially used constant memory for last two arguments - removed
*/
}
//HtoD memory copy
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
llvm_visc_request_mem(h_Ax_vector, dim*sizeof(float));
pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
llvm_visc_untrack_mem(h_Ax_vector);
llvm_visc_untrack_mem(h_data);
llvm_visc_untrack_mem(h_indices);
llvm_visc_untrack_mem(h_perm);
llvm_visc_untrack_mem(h_x_vector);
llvm_visc_untrack_mem(h_ptr);
llvm_visc_untrack_mem(h_nzcnt);
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
__visc__cleanup();
if (parameters->outFile) {
/*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
outputData(parameters->outFile,h_Ax_vector,dim);
}
free (h_data);
free (h_indices);
free (h_ptr);
free (h_perm);
free (h_nzcnt);
free (h_Ax_vector);
free (h_x_vector);
pb_FreeParameters(parameters);
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment