Skip to content
Snippets Groups Projects
Commit e10ea014 authored by Prakalp Srivastava's avatar Prakalp Srivastava
Browse files

added spmv cpu vector version

parent 5f739cd8
No related branches found
No related tags found
No related merge requests found
Showing
with 554 additions and 2 deletions
......@@ -5,4 +5,3 @@ SRCDIR_OBJS=main.o io.o #compute_gold.o
APP_CUDALDFLAGS=-lm -lstdc++
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
KERNEL_OBJS=kernel_offline.nvptx.s
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=opencl
TOOLS_SRC=common_src/convert-dataset
SRCDIR_OBJS=main.o gpu_info.o file.o ocl.o
APP_CUDALDFLAGS=-lm
APP_CFLAGS=-ffast-math -g3 -O3 -I$(TOOLS_SRC)
APP_CXXFLAGS=-ffast-math -g3 -O3
include $(TOOLS_SRC)/commontools.mk
\ No newline at end of file
/***************************************************************************
*cr
*cr (C) Copyright 2007 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <endian.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <inttypes.h>
#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif
void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
float** h_data, int** h_indices, int** h_ptr,
int** h_perm, int** h_nzcnt)
{
FILE* fid = fopen(fName, "rb");
if (fid == NULL)
{
fprintf(stderr, "Cannot open input file\n");
exit(-1);
}
fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
int _len=len[0];
int _depth=depth[0];
int _dim=dim[0];
int _pad=pad[0];
int _nzcnt_len=nzcnt_len[0];
*h_data = (float *) malloc(_len * sizeof (float));
fread (*h_data, sizeof (float), _len, fid);
*h_indices = (int *) malloc(_len * sizeof (int));
fread (*h_indices, sizeof (int), _len, fid);
*h_ptr = (int *) malloc(_depth * sizeof (int));
fread (*h_ptr, sizeof (int), _depth, fid);
*h_perm = (int *) malloc(_dim * sizeof (int));
fread (*h_perm, sizeof (int), _dim, fid);
*h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
fclose (fid);
}
void input_vec(char *fName,float *h_vec,int dim)
{
FILE* fid = fopen(fName, "rb");
fread (h_vec, sizeof (float), dim, fid);
fclose(fid);
}
void outputData(char* fName, float *h_Ax_vector,int dim)
{
FILE* fid = fopen(fName, "w");
uint32_t tmp32;
if (fid == NULL)
{
fprintf(stderr, "Cannot open output file\n");
exit(-1);
}
tmp32 = dim;
fwrite(&tmp32, sizeof(uint32_t), 1, fid);
fwrite(h_Ax_vector, sizeof(float), dim, fid);
fclose (fid);
}
/***************************************************************************
*cr
*cr (C) Copyright 2007 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef __FILEH__
#define __FILEH__
void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
float** h_data, int** h_indices, int** h_ptr,
int** h_perm, int** h_nzcnt);
void input_vec(char* fNanme, float *h_vec,int dim);
void outputData(char* fName, float *h_Ax_vector,int dim);
#endif
\ No newline at end of file
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <endian.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <inttypes.h>
#include "gpu_info.h"
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
int pad,
int major,
int minor,
int sm)
{
int max_thread;
int max_block=8;
if(major==1)
{
if(minor>=2)
max_thread=1024;
else
max_thread=768;
}
else if(major==2)
max_thread=1536;
else
//newer GPU //keep using 2.0
max_thread=1536;
int _grid;
int _thread;
if(task*pad>sm*max_thread)
{
_thread=max_thread/max_block;
_grid = ((task*pad+_thread-1)/_thread)*_thread;
}
else
{
_thread=pad;
_grid=task*pad;
}
thread[0]=_thread;
grid[0]=_grid;
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef __GPUINFOH__
#define __GPUINFOH__
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
int pad,
int major,
int minor,
int sm);
#endif
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
__kernel void spmv_jds_naive(__global float *dst_vector, __global float *d_data,
__global int *d_index, __global int *d_perm,
__global float *x_vec, const int dim,
__constant int *jds_ptr_int,
__constant int *sh_zcnt_int)
{
int ix = get_global_id(0);
if (ix < dim) {
float sum = 0.0f;
// 32 is warp size
int bound=sh_zcnt_int[ix/32];
for(int k=0;k<bound;k++)
{
int j = jds_ptr_int[k] + ix;
int in = d_index[j];
float d = d_data[j];
float t = x_vec[in];
sum += d*t;
}
dst_vector[d_perm[ix]] = sum;
}
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <stdio.h>
#include <stdlib.h>
#include <parboil.h>
#include "file.h"
#include "gpu_info.h"
#include "ocl.h"
#include "convert_dataset.h"
static int generate_vector(float *x_vector, int dim)
{
srand(54321);
int i;
for(i=0;i<dim;i++)
{
x_vector[i] = (rand() / (float) RAND_MAX);
}
return 0;
}
int main(int argc, char** argv) {
struct pb_TimerSet timers;
struct pb_Parameters *parameters;
printf("CUDA accelerated sparse matrix vector multiplication****\n");
printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
printf("This version maintained by Chris Rodrigues ***********\n");
parameters = pb_ReadParameters(&argc, argv);
if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
{
fprintf(stderr, "Expecting one input filename\n");
exit(-1);
}
pb_InitializeTimerSet(&timers);
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
//parameters declaration
cl_int clStatus;
cl_uint numPlatforms;
clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
CHECK_ERROR("clGetPlatformIDs")
cl_platform_id clPlatform[numPlatforms];
clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
CHECK_ERROR("clGetPlatformIDs")
cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
cl_device_id clDevice;
clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
CHECK_ERROR("clGetDeviceIDs")
cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
CHECK_ERROR("clCreateContextFromType")
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
CHECK_ERROR("clCreateCommandQueue")
pb_SetOpenCL(&clContext, &clCommandQueue);
const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
CHECK_ERROR("clCreateProgramWithSource")
char clOptions[50];
sprintf(clOptions,"");
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
CHECK_ERROR("clBuildProgram")
cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);
CHECK_ERROR("clCreateKernel")
int len;
int depth;
int dim;
int pad=32;
int nzcnt_len;
//host memory allocation
//matrix
float *h_data;
int *h_indices;
int *h_ptr;
int *h_perm;
int *h_nzcnt;
//vector
float *h_Ax_vector;
float *h_x_vector;
//device memory allocation
//matrix
cl_mem d_data;
cl_mem d_indices;
cl_mem d_ptr;
cl_mem d_perm;
cl_mem d_nzcnt;
//vector
cl_mem d_Ax_vector;
cl_mem d_x_vector;
cl_mem jds_ptr_int;
cl_mem sh_zcnt_int;
//load matrix from files
pb_SwitchToTimer(&timers, pb_TimerID_IO);
//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
// &h_data, &h_indices, &h_ptr,
// &h_perm, &h_nzcnt);
int col_count;
coo_to_jds(
parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
1, // row padding
pad, // warp size
1, // pack size
1, // is mirrored?
0, // binary matrix
1, // debug level [0:2]
&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
&col_count, &dim, &len, &nzcnt_len, &depth
);
// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
h_Ax_vector=(float*)malloc(sizeof(float)*dim);
h_x_vector=(float*)malloc(sizeof(float)*dim);
input_vec( parameters->inpFiles[1],h_x_vector,dim);
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
OpenCLDeviceProp clDeviceProp;
// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
//CHECK_ERROR("clGetDeviceInfo")
// clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
// CHECK_ERROR("clGetDeviceInfo")
clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
CHECK_ERROR("clGetDeviceInfo")
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
//memory allocation
d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
//memory copy
clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
size_t grid;
size_t block;
compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block);
// printf("!!! dim is %d\n",dim);
clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
CHECK_ERROR("clSetKernelArg")
clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
CHECK_ERROR("clSetKernelArg")
//main execution
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
int i;
for (i=0; i<50; i++)
{
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
CHECK_ERROR("clEnqueueNDRangeKernel")
}
clStatus = clFinish(clCommandQueue);
CHECK_ERROR("clFinish")
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
//HtoD memory copy
clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
CHECK_ERROR("clEnqueueReadBuffer")
clStatus = clReleaseKernel(clKernel);
clStatus = clReleaseProgram(clProgram);
clStatus = clReleaseMemObject(d_data);
clStatus = clReleaseMemObject(d_indices);
clStatus = clReleaseMemObject(d_perm);
clStatus = clReleaseMemObject(d_nzcnt);
clStatus = clReleaseMemObject(d_x_vector);
clStatus = clReleaseMemObject(d_Ax_vector);
CHECK_ERROR("clReleaseMemObject")
clStatus = clReleaseCommandQueue(clCommandQueue);
clStatus = clReleaseContext(clContext);
if (parameters->outFile) {
pb_SwitchToTimer(&timers, pb_TimerID_IO);
outputData(parameters->outFile,h_Ax_vector,dim);
}
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
free((void*)clSource[0]);
free (h_data);
free (h_indices);
free (h_ptr);
free (h_perm);
free (h_nzcnt);
free (h_Ax_vector);
free (h_x_vector);
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
pb_FreeParameters(parameters);
return 0;
}
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include "ocl.h"
char* readFile(const char* fileName)
{
FILE* fp;
fp = fopen(fileName,"r");
if(fp == NULL)
{
printf("Error 1!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
long size = ftell(fp);
rewind(fp);
char* buffer = (char*)malloc(sizeof(char)*(size+1));
if(buffer == NULL)
{
printf("Error 2!\n");
fclose(fp);
exit(1);
}
size_t res = fread(buffer,1,size,fp);
if(res != size)
{
printf("Error 3!\n");
fclose(fp);
exit(1);
}
buffer[size] = 0;
fclose(fp);
return buffer;
}
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
{
cl_int clStatus;
char* temp = (char*)malloc(size);
memset(temp,val,size);
clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
free(temp);
}
#ifndef __OCLH__
#define __OCLH__
typedef struct {
cl_uint major;
cl_uint minor;
cl_uint multiProcessorCount;
} OpenCLDeviceProp;
void clMemSet(cl_command_queue, cl_mem, int, size_t);
char* readFile(const char*);
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
#endif
......@@ -5,4 +5,3 @@ SRCDIR_OBJS=main.o file.o
APP_CUDALDFLAGS=-lm
APP_CFLAGS=-ffast-math -g3 -O3
APP_CXXFLAGS=-ffast-math -g3 -O3
KERNEL_OBJS=kernel_offline.nvptx.s
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment