Skip to content
Snippets Groups Projects
Commit ab566e34 authored by Prakalp Srivastava's avatar Prakalp Srivastava
Browse files

Ported bfs to cpu baseline

parent 954a12f1
No related branches found
No related tags found
No related merge requests found
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=opencl
SRCDIR_OBJS=main.o OpenCL_common.o
#include "OpenCL_common.h"
#include <string.h>
// -1 for NO suitable device found, 0 if an appropriate device was found
int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
// Supported Device Requests (anything that returns cl_bool)
// CL_DEVICE_IMAGE_SUPPORT
// CL_DEVICE_HOST_UNIFIED_MEMORY
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
// CL_DEVICE_AVAILABLE
// CL_DEVICE_COMPILER_AVAILABLE
cl_uint numEntries = 16;
cl_platform_id clPlatforms[numEntries];
cl_uint numPlatforms;
cl_device_id clDevices[numEntries];
cl_uint numDevices;
OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
bool needDevice = true;
for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
cl_platform_id clPlatform = clPlatforms[ip];
OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, numDevices);
for (int id = 0; (id < numDevices) && needDevice ; ++id) {
cl_device_id clDevice = clDevices[id];
cl_device_type clDeviceType;
bool canSatisfy = true;
if (reqDeviceType != NULL) {
OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
if (*reqDeviceType != clDeviceType) {
canSatisfy = false;
}
}
}
va_list paramList;
va_start(paramList, numRequests);
for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
cl_device_info devReq = va_arg( paramList, cl_device_info );
cl_bool clInfoBool;
size_t infoRetSize = sizeof(cl_bool);
OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
if (clInfoBool != true) {
canSatisfy = false;
}
}
va_end(paramList);
if (canSatisfy) {
*device = clDevice;
*platform = clPlatform;
needDevice = false;
fprintf(stderr, "Chose Device Type: %s\n",
(clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other"
);
if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
*reqDeviceType = clDeviceType;
}
}
} // End checking all devices for a platform
} // End checking all platforms
int retVal = -1;
if (needDevice) {
retVal = -1;
} else {
retVal = 0;
}
return retVal;
}
const char* oclErrorString(cl_int error)
{
// From NVIDIA SDK
static const char* errorString[] = {
"CL_SUCCESS",
"CL_DEVICE_NOT_FOUND",
"CL_DEVICE_NOT_AVAILABLE",
"CL_COMPILER_NOT_AVAILABLE",
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
"CL_OUT_OF_RESOURCES",
"CL_OUT_OF_HOST_MEMORY",
"CL_PROFILING_INFO_NOT_AVAILABLE",
"CL_MEM_COPY_OVERLAP",
"CL_IMAGE_FORMAT_MISMATCH",
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
"CL_BUILD_PROGRAM_FAILURE",
"CL_MAP_FAILURE",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"CL_INVALID_VALUE",
"CL_INVALID_DEVICE_TYPE",
"CL_INVALID_PLATFORM",
"CL_INVALID_DEVICE",
"CL_INVALID_CONTEXT",
"CL_INVALID_QUEUE_PROPERTIES",
"CL_INVALID_COMMAND_QUEUE",
"CL_INVALID_HOST_PTR",
"CL_INVALID_MEM_OBJECT",
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
"CL_INVALID_IMAGE_SIZE",
"CL_INVALID_SAMPLER",
"CL_INVALID_BINARY",
"CL_INVALID_BUILD_OPTIONS",
"CL_INVALID_PROGRAM",
"CL_INVALID_PROGRAM_EXECUTABLE",
"CL_INVALID_KERNEL_NAME",
"CL_INVALID_KERNEL_DEFINITION",
"CL_INVALID_KERNEL",
"CL_INVALID_ARG_INDEX",
"CL_INVALID_ARG_VALUE",
"CL_INVALID_ARG_SIZE",
"CL_INVALID_KERNEL_ARGS",
"CL_INVALID_WORK_DIMENSION",
"CL_INVALID_WORK_GROUP_SIZE",
"CL_INVALID_WORK_ITEM_SIZE",
"CL_INVALID_GLOBAL_OFFSET",
"CL_INVALID_EVENT_WAIT_LIST",
"CL_INVALID_EVENT",
"CL_INVALID_OPERATION",
"CL_INVALID_GL_OBJECT",
"CL_INVALID_BUFFER_SIZE",
"CL_INVALID_MIP_LEVEL",
"CL_INVALID_GLOBAL_WORK_SIZE",
};
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
const int index = -error;
return (index >= 0 && index < errorCount) ? errorString[index] : "";
}
const char* oclDebugErrString(cl_int error, cl_device_id device)
{
// From NVIDIA SDK
static const char* errorString[] = {
"CL_SUCCESS",
"CL_DEVICE_NOT_FOUND",
"CL_DEVICE_NOT_AVAILABLE",
"CL_COMPILER_NOT_AVAILABLE",
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
"CL_OUT_OF_RESOURCES",
"CL_OUT_OF_HOST_MEMORY",
"CL_PROFILING_INFO_NOT_AVAILABLE",
"CL_MEM_COPY_OVERLAP",
"CL_IMAGE_FORMAT_MISMATCH",
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
"CL_BUILD_PROGRAM_FAILURE",
"CL_MAP_FAILURE",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"CL_INVALID_VALUE",
"CL_INVALID_DEVICE_TYPE",
"CL_INVALID_PLATFORM",
"CL_INVALID_DEVICE",
"CL_INVALID_CONTEXT",
"CL_INVALID_QUEUE_PROPERTIES",
"CL_INVALID_COMMAND_QUEUE",
"CL_INVALID_HOST_PTR",
"CL_INVALID_MEM_OBJECT",
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
"CL_INVALID_IMAGE_SIZE",
"CL_INVALID_SAMPLER",
"CL_INVALID_BINARY",
"CL_INVALID_BUILD_OPTIONS",
"CL_INVALID_PROGRAM",
"CL_INVALID_PROGRAM_EXECUTABLE",
"CL_INVALID_KERNEL_NAME",
"CL_INVALID_KERNEL_DEFINITION",
"CL_INVALID_KERNEL",
"CL_INVALID_ARG_INDEX",
"CL_INVALID_ARG_VALUE",
"CL_INVALID_ARG_SIZE",
"CL_INVALID_KERNEL_ARGS",
"CL_INVALID_WORK_DIMENSION",
"CL_INVALID_WORK_GROUP_SIZE",
"CL_INVALID_WORK_ITEM_SIZE",
"CL_INVALID_GLOBAL_OFFSET",
"CL_INVALID_EVENT_WAIT_LIST",
"CL_INVALID_EVENT",
"CL_INVALID_OPERATION",
"CL_INVALID_GL_OBJECT",
"CL_INVALID_BUFFER_SIZE",
"CL_INVALID_MIP_LEVEL",
"CL_INVALID_GLOBAL_WORK_SIZE",
};
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
const int index = -error;
if (index == 4) {
cl_uint maxMemAlloc = 0;
OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
fprintf(stderr, " Device Maximum block allocation size: %lu\n", maxMemAlloc);
}
return (index >= 0 && index < errorCount) ? errorString[index] : "";
}
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
// locals
FILE* pFileStream = NULL;
size_t szSourceLength;
// open the OpenCL source code file
#ifdef _WIN32 // Windows version
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
{
return NULL;
}
#else // Linux version
pFileStream = fopen(cFilename, "rb");
if(pFileStream == 0)
{
return NULL;
}
#endif
size_t szPreambleLength = strlen(cPreamble);
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
szSourceLength = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
memcpy(cSourceString, cPreamble, szPreambleLength);
if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
{
fclose(pFileStream);
free(cSourceString);
return 0;
}
// close the file and return the total length of the combined (preamble + source) string
fclose(pFileStream);
if(szFinalLength != 0)
{
*szFinalLength = szSourceLength + szPreambleLength;
}
cSourceString[szSourceLength + szPreambleLength] = '\0';
return cSourceString;
}
#ifndef __OPENCL_COMMON_H_
#define __OPENCL_COMMON_H_
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <CL/cl.h>
int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
const char* oclErrorString(cl_int error);
const char* oclDebugErrString(cl_int error, cl_device_id device);
#define OCL_ERRCK_VAR(var) \
{ if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }
#define OCL_ERRCK_RETVAL(s) \
{ cl_int clerr = (s);\
if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); }
#define OCL_SIMPLE_ERRCK_RETVAL(s) \
{ cl_int clerr = (s);\
if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
#endif
#define MAX_THREADS_PER_BLOCK 256
#define LOCAL_MEM_SIZE 1600 //This needs to be adjusted for certain graphs with high degrees
#define INF 2147483647//2^31-1
#define UP_LIMIT 16677216//2^24
#define WHITE 16677217
#define GRAY 16677218
#define GRAY0 16677219
#define GRAY1 16677220
#define BLACK 16677221
struct Node {
int x;
int y;
};
struct Edge {
int x;
int y;
};
; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/bfs/src/opencl_base/kernel.cl'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
target triple = "spir64-unknown-unknown"
%struct.Node = type { i32, i32 }
%struct.Edge = type { i32, i32 }
define cc76 void @BFS_kernel(i32 addrspace(1)* nocapture %q1, i32 addrspace(1)* nocapture %q2, %struct.Node addrspace(1)* nocapture %g_graph_nodes, %struct.Edge addrspace(1)* nocapture %g_graph_edges, i32 addrspace(1)* %g_color, i32 addrspace(1)* %g_cost, i32 addrspace(1)* %tail, i32 %no_of_nodes, i32 %gray_shade, i32 %k, i32 addrspace(3)* %local_q_tail, i32 addrspace(3)* nocapture %local_q, i32 addrspace(3)* nocapture %shift) nounwind {
%1 = tail call cc75 i64 @_Z12get_local_idj(i32 0) nounwind readnone
%2 = icmp eq i64 %1, 0
br i1 %2, label %3, label %4
; <label>:3 ; preds = %0
store i32 0, i32 addrspace(3)* %local_q_tail, align 4, !tbaa !9
br label %4
; <label>:4 ; preds = %3, %0
tail call cc75 void @_Z7barrierj(i32 1) nounwind
%5 = tail call cc75 i64 @_Z13get_global_idj(i32 0) nounwind readnone
%6 = trunc i64 %5 to i32
%7 = icmp slt i32 %6, %no_of_nodes
br i1 %7, label %8, label %.loopexit
; <label>:8 ; preds = %4
%9 = sext i32 %6 to i64
%10 = getelementptr inbounds i32 addrspace(1)* %q1, i64 %9
%11 = load i32 addrspace(1)* %10, align 4, !tbaa !9
%12 = sext i32 %11 to i64
%13 = getelementptr inbounds i32 addrspace(1)* %g_color, i64 %12
store i32 16677221, i32 addrspace(1)* %13, align 4, !tbaa !9
%14 = getelementptr inbounds i32 addrspace(1)* %g_cost, i64 %12
%15 = load i32 addrspace(1)* %14, align 4, !tbaa !9
%16 = getelementptr inbounds %struct.Node addrspace(1)* %g_graph_nodes, i64 %12
%17 = bitcast %struct.Node addrspace(1)* %16 to i8 addrspace(1)*
%cur_node.sroa.0.0..cast = bitcast i8 addrspace(1)* %17 to i32*
%cur_node.sroa.0.0.copyload = load i32* %cur_node.sroa.0.0..cast, align 4
%cur_node.sroa.1.4..idx = getelementptr inbounds %struct.Node addrspace(1)* %g_graph_nodes, i64 %12, i32 1
%18 = bitcast i32 addrspace(1)* %cur_node.sroa.1.4..idx to i8 addrspace(1)*
%cur_node.sroa.1.4..cast = bitcast i8 addrspace(1)* %18 to i32*
%cur_node.sroa.1.4.copyload = load i32* %cur_node.sroa.1.4..cast, align 4
%19 = add nsw i32 %cur_node.sroa.1.4.copyload, %cur_node.sroa.0.0.copyload
%20 = icmp sgt i32 %cur_node.sroa.1.4.copyload, 0
br i1 %20, label %.lr.ph42, label %.loopexit
.lr.ph42: ; preds = %8, %41
%i.040 = phi i32 [ %42, %41 ], [ %cur_node.sroa.0.0.copyload, %8 ]
%21 = sext i32 %i.040 to i64
%22 = getelementptr inbounds %struct.Edge addrspace(1)* %g_graph_edges, i64 %21
%23 = bitcast %struct.Edge addrspace(1)* %22 to i8 addrspace(1)*
%cur_edge.sroa.0.0..cast = bitcast i8 addrspace(1)* %23 to i32*
%cur_edge.sroa.0.0.copyload = load i32* %cur_edge.sroa.0.0..cast, align 4
%cur_edge.sroa.1.4..idx = getelementptr inbounds %struct.Edge addrspace(1)* %g_graph_edges, i64 %21, i32 1
%24 = bitcast i32 addrspace(1)* %cur_edge.sroa.1.4..idx to i8 addrspace(1)*
%cur_edge.sroa.1.4..cast = bitcast i8 addrspace(1)* %24 to i32*
%cur_edge.sroa.1.4.copyload = load i32* %cur_edge.sroa.1.4..cast, align 4
%25 = add nsw i32 %cur_edge.sroa.1.4.copyload, %15
%26 = sext i32 %cur_edge.sroa.0.0.copyload to i64
%27 = getelementptr inbounds i32 addrspace(1)* %g_cost, i64 %26
%28 = tail call cc75 i32 @_Z8atom_minPU3AS1ii(i32 addrspace(1)* %27, i32 %25) nounwind
%29 = icmp sgt i32 %28, %25
br i1 %29, label %30, label %41
; <label>:30 ; preds = %.lr.ph42
%31 = getelementptr inbounds i32 addrspace(1)* %g_color, i64 %26
%32 = load i32 addrspace(1)* %31, align 4, !tbaa !9
%33 = icmp sgt i32 %32, 16677216
br i1 %33, label %34, label %41
; <label>:34 ; preds = %30
%35 = tail call cc75 i32 @_Z9atom_xchgPU3AS1ii(i32 addrspace(1)* %31, i32 %gray_shade) nounwind
%36 = icmp eq i32 %35, %gray_shade
br i1 %36, label %41, label %37
; <label>:37 ; preds = %34
%38 = tail call cc75 i32 @_Z8atom_addPU3AS3ii(i32 addrspace(3)* %local_q_tail, i32 1) nounwind
%39 = sext i32 %38 to i64
%40 = getelementptr inbounds i32 addrspace(3)* %local_q, i64 %39
store i32 %cur_edge.sroa.0.0.copyload, i32 addrspace(3)* %40, align 4, !tbaa !9
br label %41
; <label>:41 ; preds = %34, %.lr.ph42, %37, %30
%42 = add nsw i32 %i.040, 1
%43 = icmp slt i32 %42, %19
br i1 %43, label %.lr.ph42, label %.loopexit
.loopexit: ; preds = %8, %41, %4
tail call cc75 void @_Z7barrierj(i32 1) nounwind
br i1 %2, label %44, label %47
; <label>:44 ; preds = %.loopexit
%45 = load i32 addrspace(3)* %local_q_tail, align 4, !tbaa !9
%46 = tail call cc75 i32 @_Z8atom_addPU3AS1ii(i32 addrspace(1)* %tail, i32 %45) nounwind
store i32 %46, i32 addrspace(3)* %shift, align 4, !tbaa !9
br label %47
; <label>:47 ; preds = %44, %.loopexit
tail call cc75 void @_Z7barrierj(i32 1) nounwind
%extract.t38 = trunc i64 %1 to i32
%48 = load i32 addrspace(3)* %local_q_tail, align 4, !tbaa !9
%49 = icmp slt i32 %extract.t38, %48
br i1 %49, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %47
%50 = tail call cc75 i64 @_Z14get_local_sizej(i32 0) nounwind readnone
br label %51
; <label>:51 ; preds = %.lr.ph, %51
%extract.t39 = phi i32 [ %extract.t38, %.lr.ph ], [ %extract.t, %51 ]
%52 = sext i32 %extract.t39 to i64
%53 = getelementptr inbounds i32 addrspace(3)* %local_q, i64 %52
%54 = load i32 addrspace(3)* %53, align 4, !tbaa !9
%55 = load i32 addrspace(3)* %shift, align 4, !tbaa !9
%56 = add nsw i32 %55, %extract.t39
%57 = sext i32 %56 to i64
%58 = getelementptr inbounds i32 addrspace(1)* %q2, i64 %57
store i32 %54, i32 addrspace(1)* %58, align 4, !tbaa !9
%59 = add i64 %50, %52
%extract.t = trunc i64 %59 to i32
%60 = load i32 addrspace(3)* %local_q_tail, align 4, !tbaa !9
%61 = icmp slt i32 %extract.t, %60
br i1 %61, label %51, label %._crit_edge
._crit_edge: ; preds = %51, %47
ret void
}
declare cc75 i64 @_Z12get_local_idj(i32) nounwind readnone
declare cc75 void @_Z7barrierj(i32)
declare cc75 i64 @_Z13get_global_idj(i32) nounwind readnone
declare cc75 i32 @_Z8atom_minPU3AS1ii(i32 addrspace(1)*, i32)
declare cc75 i32 @_Z9atom_xchgPU3AS1ii(i32 addrspace(1)*, i32)
declare cc75 i32 @_Z8atom_addPU3AS3ii(i32 addrspace(3)*, i32)
declare cc75 i32 @_Z8atom_addPU3AS1ii(i32 addrspace(1)*, i32)
declare cc75 i64 @_Z14get_local_sizej(i32) nounwind readnone
!opencl.kernels = !{!0}
!opencl.enable.FP_CONTRACT = !{}
!opencl.spir.version = !{!7}
!opencl.ocl.version = !{!7}
!opencl.used.extensions = !{!8}
!opencl.used.optional.core.features = !{!8}
!opencl.compiler.options = !{!8}
!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*, %struct.Node addrspace(1)*, %struct.Edge addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32, i32, i32, i32 addrspace(3)*, i32 addrspace(3)*, i32 addrspace(3)*)* @BFS_kernel, metadata !1, metadata !2, metadata !3, metadata !4, metadata !5, metadata !6}
!1 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0, i32 3, i32 3, i32 3}
!2 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none"}
!3 = metadata !{metadata !"kernel_arg_type", metadata !"int*", metadata !"int*", metadata !"struct Node*", metadata !"struct Edge*", metadata !"int*", metadata !"int*", metadata !"int*", metadata !"int", metadata !"int", metadata !"int", metadata !"int*", metadata !"int*", metadata !"int*"}
!4 = metadata !{metadata !"kernel_arg_type_qual", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !""}
!5 = metadata !{metadata !"kernel_arg_base_type", metadata !"int*", metadata !"int*", metadata !"struct Node*", metadata !"struct Edge*", metadata !"int*", metadata !"int*", metadata !"int*", metadata !"int", metadata !"int", metadata !"int", metadata !"int*", metadata !"int*", metadata !"int*"}
!6 = metadata !{metadata !"kernel_arg_name", metadata !"q1", metadata !"q2", metadata !"g_graph_nodes", metadata !"g_graph_edges", metadata !"g_color", metadata !"g_cost", metadata !"tail", metadata !"no_of_nodes", metadata !"gray_shade", metadata !"k", metadata !"local_q_tail", metadata !"local_q", metadata !"shift"}
!7 = metadata !{i32 1, i32 2}
!8 = metadata !{}
!9 = metadata !{metadata !"int", metadata !10}
!10 = metadata !{metadata !"omnipotent char", metadata !11}
!11 = metadata !{metadata !"Simple C/C++ TBAA"}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
Implementing Breadth first search on CUDA using algorithm given in DAC'10
paper "An Effective GPU Implementation of Breadth-First Search"
Copyright (c) 2010 University of Illinois at Urbana-Champaign.
All rights reserved.
Permission to use, copy, modify and distribute this software and its documentation for
educational purpose is hereby granted without fee, provided that the above copyright
notice and this permission notice appear in all copies of this software and that you do
not sell the software.
THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR
OTHERWISE.
Author: Lijiuan Luo (lluo3@uiuc.edu)
Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu)
*/
/**********
Define colors for BFS
1) the definition of White, gray and black comes from the text book "Introduction to Algorithms"
2) For path search problems, people may choose to use different colors to record the found paths.
Therefore we reserve numbers (0-16677216) for this purpose. Only nodes with colors bigger than
UP_LIMIT are free to visit
3) We define two gray shades to differentiate between the new frontier nodes and the old frontier nodes that
have not been marked BLACK
*************/
#include "config.h"
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
/*****************************************************************************
This is the most general version of BFS kernel, i.e. no assumption about #block in the grid
\param q1: the array to hold the current frontier
\param q2: the array to hold the new frontier
\param g_graph_nodes: the nodes in the input graph
\param g_graph_edges: the edges i nthe input graph
\param g_color: the colors of nodes
\param g_cost: the costs of nodes
\param no_of_nodes: the number of nodes in the current frontier
\param tail: pointer to the location of the tail of the new frontier. *tail is the size of the new frontier
\param gray_shade: the shade of the gray in current BFS propagation. See GRAY0, GRAY1 macro definitions for more details
\param k: the level of current propagation in the BFS tree. k= 0 for the first propagation.
***********************************************************************/
__kernel void
BFS_kernel(__global int *q1,
__global int *q2,
__global struct Node *g_graph_nodes,
__global struct Edge *g_graph_edges,
__global int *g_color,
__global int *g_cost,
__global int *tail,
int no_of_nodes,
int gray_shade,
int k ,
__local int *local_q_tail,
__local int *local_q,
__local int *shift)
{
if(get_local_id(0) == 0){
*local_q_tail = 0;//initialize the tail of w-queue
}
barrier(CLK_LOCAL_MEM_FENCE);
//first, propagate and add the new frontier elements into w-queues
//int tid = get_group_id(0)*MAX_THREADS_PER_BLOCK + get_local_id(0);
int tid = get_global_id(0);
if( tid<no_of_nodes)
{
int pid = q1[tid]; //the current frontier node, or the parent node of the new frontier nodes
g_color[pid] = BLACK;
int cur_cost = g_cost[pid];
//into
struct Node cur_node = g_graph_nodes[pid];
for(int i=cur_node.x; i<cur_node.y + cur_node.x; i++)//visit each neighbor of the
//current frontier node.
{
struct Edge cur_edge = g_graph_edges[i];
int id = cur_edge.x;
int cost = cur_edge.y;
cost += cur_cost;
int orig_cost = atom_min (&g_cost[id],cost);
if(orig_cost > cost){//the node should be visited
if(g_color[id] > UP_LIMIT){
int old_color = atom_xchg (&g_color[id],gray_shade);
//this guarantees that only one thread will push this node
//into a queue
if(old_color != gray_shade) {
//atomic operation guarantees the correctness
//even if multiple warps are executing simultaneously
int index = atom_add (local_q_tail,1);
local_q[index] = id;
}
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0) == 0){
int tot_sum = *local_q_tail;
//the offset or "shift" of the block-level queue within the grid-level queue
//is determined by atomic operation
*shift = atom_add (tail,tot_sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
//shift within a w-queue
int local_shift = get_local_id(0);
while(local_shift < *local_q_tail){
q2[*shift + local_shift] = local_q[local_shift];
//multiple threads are copying elements at the same time,
//so we shift by multiple elements for next iteration
local_shift += get_local_size(0);
}
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
Implementing Breadth first search on CUDA using algorithm given in DAC'10
paper "An Effective GPU Implementation of Breadth-First Search"
Copyright (c) 2010 University of Illinois at Urbana-Champaign.
All rights reserved.
Permission to use, copy, modify and distribute this software and its documentation for
educational purpose is hereby granted without fee, provided that the above copyright
notice and this permission notice appear in all copies of this software and that you do
not sell the software.
THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR
OTHERWISE.
Author: Lijiuan Luo (lluo3@uiuc.edu)
Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu)
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <CL/cl.h>
#include "parboil.h"
#include "OpenCL_common.h"
#include "config.h"
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
FILE *fp;
char* readFile(const char* fileName)
{
FILE* fp;
fp = fopen(fileName,"r");
if(fp == NULL)
{
printf("Error 1!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
long size = ftell(fp);
rewind(fp);
char* buffer = (char*)malloc(sizeof(char)*size);
if(buffer == NULL)
{
printf("Error 2!\n");
fclose(fp);
exit(1);
}
size_t res = fread(buffer,1,size,fp);
if(res != size)
{
printf("Error 3!\n");
fclose(fp);
exit(1);
}
fclose(fp);
return buffer;
}
const int h_top = 1;
const int zero = 0;
void runGPU(int argc, char** argv);
////////////////////////////////////////////////////////////////////////////////
// Main Program
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
//the number of nodes in the graph
int num_of_nodes = 0;
//the number of edges in the graph
int num_of_edges = 0;
struct pb_Parameters *params;
struct pb_TimerSet timers;
params = pb_ReadParameters(&argc, argv);
if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
{
fprintf(stderr, "Expecting one input filename\n");
exit(-1);
}
//pb_SwitchToTimer(&timers, pb_TimerID_IO);
//Read in Graph from a file
fp = fopen(params->inpFiles[0],"r");
if(!fp)
{
printf("Error Reading graph file\n");
return 0;
}
int source;
fscanf(fp,"%d",&num_of_nodes);
// allocate host memory
struct Node* h_graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes);
int *color = (int*) malloc(sizeof(int)*num_of_nodes);
int start, edgeno;
// initalize the memory
int i;
for( i = 0; i < num_of_nodes; i++)
{
fscanf(fp,"%d %d",&start,&edgeno);
h_graph_nodes[i].x = start;
h_graph_nodes[i].y = edgeno;
color[i]=WHITE;
}
//read the source node from the file
fscanf(fp,"%d",&source);
fscanf(fp,"%d",&num_of_edges);
int id,cost;
struct Edge* h_graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges);
for(i=0; i < num_of_edges ; i++)
{
fscanf(fp,"%d",&id);
fscanf(fp,"%d",&cost);
h_graph_edges[i].x = id;
h_graph_edges[i].y = cost;
}
if(fp)
fclose(fp);
pb_InitializeTimerSet(&timers);
// allocate mem for the result on host side
int* h_cost = (int*) malloc( sizeof(int)*num_of_nodes);
for(i = 0; i < num_of_nodes; i++){
h_cost[i] = INF;
}
h_cost[source] = 0;
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
cl_int clStatus;
cl_device_id clDevice;
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
cl_uint numPlatforms;
clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id clPlatform[numPlatforms];
clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
CHECK_ERROR("clGetPlatformIDs")
clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
CHECK_ERROR("clGetDeviceIDs")
cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
cl_context clContext = clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
CHECK_ERROR("clCreateContextFromType")
OCL_ERRCK_VAR(clStatus);
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
OCL_ERRCK_VAR(clStatus);
pb_SetOpenCL(&clContext, &clCommandQueue);
char *clSource;
size_t program_length;
const char *clSource_path = "src/opencl_cpu_baseline/kernel.cl";
clSource = oclLoadProgSource(clSource_path, "", &program_length);
//printf("Program Source:\n%s\n", clSource);
cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&clSource, &program_length, &clStatus);
OCL_ERRCK_VAR(clStatus);
char clOptions[50];
sprintf(clOptions,"-I src/opencl_base");
OCL_ERRCK_RETVAL(clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL));
// Uncomment to view build log from compiler for debugging
/*
char *build_log;
size_t ret_val_size;
clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = (char *)malloc(ret_val_size+1);
clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
// there's no information in the reference whether the string is 0 terminated or not
build_log[ret_val_size] = '\0';
printf("%s\n", build_log );
*/
cl_kernel BFS_kernel = clCreateKernel(clProgram,"BFS_kernel",&clStatus);
OCL_ERRCK_VAR(clStatus);
//Copy the Node list to device memory
cl_mem d_graph_nodes;
d_graph_nodes = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_nodes*sizeof(struct Node),NULL,&clStatus);
OCL_ERRCK_VAR(clStatus);
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_nodes,CL_TRUE,0,num_of_nodes*sizeof(struct Node),h_graph_nodes,0,NULL,NULL));
//Copy the Edge List to device Memory
cl_mem d_graph_edges;
d_graph_edges = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_edges*sizeof(struct Edge),NULL,&clStatus);
OCL_ERRCK_VAR(clStatus);
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_edges,CL_TRUE,0,num_of_edges*sizeof(struct Edge),h_graph_edges,0,NULL,NULL));
cl_mem d_color, d_cost, d_q1, d_q2, tail;
d_color = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
d_cost = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
d_q1 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
d_q2 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
tail = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
OCL_ERRCK_VAR(clStatus);
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
printf("Starting GPU kernel\n");
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
int num_of_blocks;
int num_of_threads_per_block;
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&h_top,0,NULL,NULL));
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,sizeof(int),&source,0,NULL,NULL));
int num_t;//number of threads
int k=0;//BFS level index
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,2,sizeof(cl_mem),(void*)&d_graph_nodes));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,3,sizeof(cl_mem),(void*)&d_graph_edges));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,4,sizeof(cl_mem),(void*)&d_color));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,5,sizeof(cl_mem),(void*)&d_cost));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,6,sizeof(cl_mem),(void*)&tail));
do
{
OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL));
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
if(num_t == 0){//frontier is empty
break;
}
num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t;
size_t grid[1] = {num_of_blocks*num_of_threads_per_block};
size_t block[1] = {num_of_threads_per_block};
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,7,sizeof(int),(void*)&num_t));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,9,sizeof(int),(void*)&k));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,10,sizeof(int),NULL));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,11,LOCAL_MEM_SIZE*sizeof(int),NULL));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,12,sizeof(int),NULL));
if(k%2 == 0){
int gray = GRAY0;
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q1));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q2));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray));
}
else{
int gray = GRAY1;
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q2));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q1));
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray));
}
OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel,1,0,grid,block,0,0,0));
OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
k++;
} while(1);
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
printf("GPU kernel done\n");
// copy result from device to host
OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_nodes));
OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_edges));
OCL_ERRCK_RETVAL(clReleaseMemObject(d_color));
OCL_ERRCK_RETVAL(clReleaseMemObject(d_cost));
OCL_ERRCK_RETVAL(clReleaseMemObject(tail));
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
//Store the result into a file
//pb_SwitchToTimer(&timers, pb_TimerID_IO);
FILE *fp = fopen(params->outFile,"w");
fprintf(fp, "%d\n", num_of_nodes);
int j = 0;
for(j=0;j<num_of_nodes;j++)
fprintf(fp,"%d %d\n",j,h_cost[j]);
fclose(fp);
// cleanup memory
free(h_graph_nodes);
free(h_graph_edges);
free(color);
free(h_cost);
pb_FreeParameters(params);
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment