VISC version of SPMV parboil benchmark.

95d31417 · Maria Kotsifakou · b9a8b0b0 · 95d31417 · 95d31417 · 95d31417
Commit 95d31417 authored 10 years ago by Maria Kotsifakou
--- a/llvm/test/VISC/parboil/benchmarks/spmv/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/Makefile
+PARBOIL_ROOT = /home/kotsifa2/visc/llvm/test/VISC/parboil
+APP = spmv
+
+# Default compile visc
+ifeq ($(VERSION),)
+  VERSION = visc
+endif
+
+# Default use small test case
+ifeq ($(TEST),)
+  TEST = small
+endif
+
+BIN = $(addsuffix -$(VERSION), $(APP))
+
+SRCDIR = src/$(VERSION)
+BUILDDIR = build/$(VERSION)
+DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
+
+ifeq ($(TEST),small)
+  INPUT1 = $(DATASET_DIR)/small/input/1138_bus.mtx
+  INPUT2 = $(DATASET_DIR)/small/input/vector.bin
+  REF_OUTPUT = $(DATASET_DIR)/small/output/1138_bus.mtx.out
+  RUNDIR = run/$(VERSION)/small
+  OUTPUT = $(RUNDIR)/1138_bus.mtx.out
+else ifeq ($(TEST),medium)
+  INPUT1 = $(DATASET_DIR)/medium/input/bcsstk18.mtx
+  INPUT2 = $(DATASET_DIR)/medium/input/vector.bin
+  REF_OUTPUT = $(DATASET_DIR)/medium/output/bcsstk18.mtx.out
+  RUNDIR = run/$(VERSION)/medium
+  OUTPUT = $(RUNDIR)/bcsstk18.mtx.out
+else
+  INPUT1 = $(DATASET_DIR)/large/input/Dubcova3.mtx.bin
+  INPUT2 = $(DATASET_DIR)/large/input/vector.bin
+  REF_OUTPUT = $(DATASET_DIR)/large/output/Dubcova3.mtx.out
+  RUNDIR = run/$(VERSION)/large
+  OUTPUT = $(RUNDIR)/Dubcova3.mtx.out
+endif
+
+ARGS = -i $(INPUT1) $(INPUT2) -o $(OUTPUT)
+TOOL = tools/compare-output
+
+include $(PARBOIL_ROOT)/common/mk/Makefile
--- a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/Makefile
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=visc
+TOOLS_SRC=common_src/convert-dataset
+SRCDIR_OBJS=main.visc.ll gpu_info.ll file.ll
+APP_CUDALDFLAGS=-lm
+APP_CFLAGS=-ffast-math -g3 -O3 -I$(TOOLS_SRC)
+APP_CXXFLAGS=-ffast-math -g3 -O3
+
+include $(TOOLS_SRC)/commontools.mk
--- a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/file.c
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/file.c
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2007 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include <endian.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#if __BYTE_ORDER != __LITTLE_ENDIAN
+# error "File I/O is not implemented for this system: wrong endianness."
+#endif
+
+void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
+               float** h_data, int** h_indices, int** h_ptr,
+               int** h_perm, int** h_nzcnt)
+{
+  FILE* fid = fopen(fName, "rb");
+
+  if (fid == NULL)
+    {
+      fprintf(stderr, "Cannot open input file\n");
+      exit(-1);
+    }
+
+  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
+  int _len=len[0];
+  int _depth=depth[0];
+  int _dim=dim[0];
+  int _pad=pad[0];
+  int _nzcnt_len=nzcnt_len[0];
+  
+  *h_data = (float *) malloc(_len * sizeof (float));
+  fread (*h_data, sizeof (float), _len, fid);
+  
+  *h_indices = (int *) malloc(_len * sizeof (int));
+  fread (*h_indices, sizeof (int), _len, fid);
+  
+  *h_ptr = (int *) malloc(_depth * sizeof (int));
+  fread (*h_ptr, sizeof (int), _depth, fid);
+  
+  *h_perm = (int *) malloc(_dim * sizeof (int));
+  fread (*h_perm, sizeof (int), _dim, fid);
+  
+  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
+  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+
+  fclose (fid); 
+}
+
+void input_vec(char *fName,float *h_vec,int dim)
+{
+  FILE* fid = fopen(fName, "rb");
+  fread (h_vec, sizeof (float), dim, fid);
+  fclose(fid);
+  
+}
+
+void outputData(char* fName, float *h_Ax_vector,int dim)
+{
+  FILE* fid = fopen(fName, "w");
+  uint32_t tmp32;
+  if (fid == NULL)
+    {
+      fprintf(stderr, "Cannot open output file\n");
+      exit(-1);
+    }
+  tmp32 = dim;
+  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
+  fwrite(h_Ax_vector, sizeof(float), dim, fid);
+
+  fclose (fid);
+}
--- a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/file.h
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/file.h
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2007 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+#ifndef __FILEH__
+#define __FILEH__
+
+void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
+               float** h_data, int** h_indices, int** h_ptr,
+               int** h_perm, int** h_nzcnt);
+
+void input_vec(char* fNanme, float *h_vec,int dim);
+void outputData(char* fName, float *h_Ax_vector,int dim);
+
+#endif
\ No newline at end of file
--- a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/gpu_info.c
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/gpu_info.c
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+#include <endian.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "gpu_info.h"
+
+void compute_active_thread(size_t *thread,
+			   size_t *grid,
+			   int task,
+			   int pad,
+			   int major,
+			   int minor,
+			   int sm)
+{
+	int max_thread;
+	int max_block=8;
+	if(major==1)
+	{
+		if(minor>=2)
+			max_thread=1024;
+		else
+			max_thread=768;
+	}
+	else if(major==2)
+		max_thread=1536;
+	else
+		//newer GPU  //keep using 2.0
+		max_thread=1536;
+	
+	int _grid;
+	int _thread;
+	
+	if(task*pad>sm*max_thread)
+	{
+		_thread=max_thread/max_block;
+		_grid = ((task*pad+_thread-1)/_thread)*_thread;
+	}
+	else
+	{
+		_thread=pad;
+		_grid=task*pad;
+	}
+
+	thread[0]=_thread;
+	grid[0]=_grid;
+}
--- a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/gpu_info.h
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/gpu_info.h
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef __GPUINFOH__
+#define __GPUINFOH__
+
+void compute_active_thread(size_t *thread,
+			   size_t *grid,
+			   int task,
+			   int pad,
+			   int major,
+			   int minor,
+			   int sm);
+
+#endif
--- a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/kernel.cl
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/kernel.cl
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+#define WARP_BITS 5
+
+__kernel void spmv_jds(__global float *dst_vector, __global float *d_data,
+		       __global int *d_index, __global int *d_perm,
+		       __global float *x_vec, const int dim, 
+		       __constant int *jds_ptr_int,
+		       __constant int *sh_zcnt_int)
+{
+	int ix = get_global_id(0);
+	int warp_id=ix>>WARP_BITS;
+
+	if(ix<dim)
+	{
+		float sum=0.0f;
+		int bound=sh_zcnt_int[warp_id];
+		//prefetch 0
+		int j=jds_ptr_int[0]+ix;  
+		float d = d_data[j]; 
+		int i = d_index[j];  
+		float t = x_vec[i];
+		
+		if (bound>1)  //bound >=2
+		{
+			//prefetch 1
+			j=jds_ptr_int[1]+ix;    
+			i =  d_index[j];  
+			int in;
+			float dn;
+			float tn;
+			for(int k=2;k<bound;k++ )
+			{	
+				//prefetch k-1
+				dn = d_data[j]; 
+				//prefetch k
+				j=jds_ptr_int[k]+ix;    
+				in = d_index[j]; 
+				//prefetch k-1
+				tn = x_vec[i];
+				
+				//compute k-2
+				sum += d*t; 
+				//sweep to k
+				i = in;  
+				//sweep to k-1
+				d = dn;
+				t =tn; 
+			}	
+		
+			//fetch last
+			dn = d_data[j];
+			tn = x_vec[i];
+	
+			//compute last-1
+			sum += d*t; 
+			//sweep to last
+			d=dn;
+			t=tn;
+		}
+		//compute last
+		sum += d*t;  // 3 3
+		
+		//write out data
+		dst_vector[d_perm[ix]]=sum; 
+	}
+}
+
--- a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+//#include <CL/cl.h>
+//#include <CL/cl_ext.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <parboil.h>
+
+#include "file.h"
+#include "gpu_info.h"
+#include "convert_dataset.h"
+
+extern "C" void __visc__attributes(...);
+extern "C" void __visc__wait(...);
+extern "C" unsigned __visc__node(...);
+extern "C" int get_global_id(int);
+
+static int generate_vector(float *x_vector, int dim) 
+{	
+	srand(54321);
+	int i;
+	for(i=0;i<dim;i++)
+	{
+		x_vector[i] = (rand() / (float) RAND_MAX);
+	}
+	return 0;
+}
+
+__kernel void spmv_jds(float *dst_vector, float *d_data,
+		               int *d_index, int *d_perm,
+		               float *x_vec, int dim, 
+		               int *jds_ptr_int,
+		               int *sh_zcnt_int)
+{
+    __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
+                       1, dst_vector);
+
+	int ix = get_global_id(0);
+	int warp_id=ix>>WARP_BITS;
+
+	if(ix<dim)
+	{
+		float sum=0.0f;
+		int bound=sh_zcnt_int[warp_id];
+		//prefetch 0
+		int j=jds_ptr_int[0]+ix;  
+		float d = d_data[j]; 
+		int i = d_index[j];  
+		float t = x_vec[i];
+		
+		if (bound>1)  //bound >=2
+		{
+			//prefetch 1
+			j=jds_ptr_int[1]+ix;    
+			i =  d_index[j];  
+			int in;
+			float dn;
+			float tn;
+			for(int k=2;k<bound;k++ )
+			{	
+				//prefetch k-1
+				dn = d_data[j]; 
+				//prefetch k
+				j=jds_ptr_int[k]+ix;    
+				in = d_index[j]; 
+				//prefetch k-1
+				tn = x_vec[i];
+				
+				//compute k-2
+				sum += d*t; 
+				//sweep to k
+				i = in;  
+				//sweep to k-1
+				d = dn;
+				t =tn; 
+			}	
+		
+			//fetch last
+			dn = d_data[j];
+			tn = x_vec[i];
+	
+			//compute last-1
+			sum += d*t; 
+			//sweep to last
+			d=dn;
+			t=tn;
+		}
+		//compute last
+		sum += d*t;  // 3 3
+		
+		//write out data
+		dst_vector[d_perm[ix]]=sum; 
+	}
+}
+
+int main(int argc, char** argv) {
+	struct pb_TimerSet timers;
+	struct pb_Parameters *parameters;
+	
+	printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+	printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
+	parameters = pb_ReadParameters(&argc, argv);
+	
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
+  {
+      		fprintf(stderr, "Expecting one two filenames\n");
+      		exit(-1);
+    	}
+
+	pb_InitializeTimerSet(&timers);
+	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+	//parameters declaration
+	int len;
+	int depth;
+	int dim;
+	int pad=32;
+	int nzcnt_len;
+	
+	//host memory allocation
+	//matrix
+	float *h_data;
+	int *h_indices;
+	int *h_ptr;
+	int *h_perm;
+	int *h_nzcnt;
+
+	//vector
+	float *h_Ax_vector;
+	float *h_x_vector;
+		
+    	//load matrix from files
+	pb_SwitchToTimer(&timers, pb_TimerID_IO);
+	//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+	//    &h_data, &h_indices, &h_ptr,
+	//    &h_perm, &h_nzcnt);
+	int col_count;
+
+	coo_to_jds(
+		parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+		1, // row padding
+		pad, // warp size
+		1, // pack size
+		1, // is mirrored?
+		0, // binary matrix
+		1, // debug level [0:2]
+		&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
+		&col_count, &dim, &len, &nzcnt_len, &depth
+	);
+ 
+  
+    h_Ax_vector=(float*)malloc(sizeof(float)*dim);	
+    h_x_vector=(float*)malloc(sizeof(float)*dim);
+    input_vec( parameters->inpFiles[1],h_x_vector,dim);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+	
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+    memset(h_Ax_vector, 0, dim*sizeof(float));
+	
+	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+	
+	size_t grid;
+	size_t block;
+    	
+	compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8);
+
+	//main execution
+	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+
+	int i;
+	for(i=0; i<50; i++)
+	{
+	unsigned spmvDFG = __visc__node(spmv_jds, 2, 1, block, grid,
+                                    15,
+                                    h_Ax_vector,
+                                    dim*sizeof(float),
+                                    h_data,
+                                    len*sizeof(float),
+                                    h_indices,
+                                    len*sizeof(int),
+                                    h_perm,
+                                    dim*sizeof(int),
+                                    h_x_vector,
+                                    dim*sizeof(float),
+                                    dim,
+                                    h_ptr,
+                                    depth*sizeof(int),
+                                    h_nzcnt,
+                                    nzcnt_len*sizeof(int),
+                                    0);
+    __visc__wait(spmvDFG);
+
+    /******************************* Issues *******************************
+     * 1. Using OpenCL to compute grid and block dimensions
+     *    (getting device info)
+     *    We need to check the GPU version (major number) where this kernel
+     *    executes to compare against opencl_nvidia version
+     * 2. Type of cl_mem buffer for d_x_vector is created with size of float,
+          but copied in through size of int.
+          Due to type of h_x_vector, I chose to use float
+     *    (Minor)
+     * 3. Kernel initially used constant memory for last two arguments - removed
+     */
+	}
+
+	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+	//HtoD memory copy
+	 
+	if (parameters->outFile) {
+		pb_SwitchToTimer(&timers, pb_TimerID_IO);
+		outputData(parameters->outFile,h_Ax_vector,dim);
+		
+	}
+	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+	
+	free (h_data);
+	free (h_indices);
+	free (h_ptr);
+	free (h_perm);
+	free (h_nzcnt);
+	free (h_Ax_vector);
+	free (h_x_vector);
+	
+	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+	pb_PrintTimerSet(&timers);
+	pb_FreeParameters(parameters);
+
+	return 0;
+}