diff --git a/README.md b/README.md index 4970eccbacb5ecd9fb30f2dcde065d43b672d6db..3f054d92b3965ec7cecec1af391c179608e1e719 100644 --- a/README.md +++ b/README.md @@ -7,24 +7,42 @@ This repository contains miscellaneous supporting materals for HPVM. [PPoPP'18 paper](http://rsim.cs.illinois.edu/Pubs/17-PPOPP-HPVM.pdf) ## Dependencies -You would need to download and install the following components for using AVX and NVIDIA GPUs to speed up your programs +You would need to download and install the following components for using NVIDIA GPUs to speed up your programs * Intel OpenCL SDK for Linux from [software.intel.com/sdk/opencl](software.intel.com/sdk/opencl). Follow the installation instructions (no special requirements). * CUDA -## Build +## Getting source code and building HPVM -Switch to hpvm-reorg-9 +Checkout HPVM: ```shell -git checkout hpvm-reorg-9 +git clone https://gitlab.engr.illinois.edu/llvm/hpvm.git +git checkout hpvm-reorg-9 (this step may not be needed once code is mirrored on Github) ``` -Build hpvm +HPVM installer script can be used to download, configure and build HPVM along with LLVM and other subprojects including Clang. ```shell bash install.sh ``` +Specifically, the HPVM installer downloads the LLVM, Clang, compiler-rt, libcxxabi and lld, copies HPVM source into +llvm/tools and builds the entire tree. LLVM C-Backend is also built as a part of HPVM and is currently used to perform +code generation in OpenCL for GPUs. -Build hpvm runtime +Alternatively, CMake can be run manually. +```shell +cd hpvm/build +cmake ../llvm [options] +``` +Some common options that can be used with CMake are: + +* -DCMAKE_INSTALL_PREFIX=directory --- Specify for directory the full pathname of where you want the HPVM tools and libraries to be installed. + +* -DCMAKE_BUILD_TYPE=type --- Valid options for type are Debug, Release, RelWithDebInfo, and MinSizeRel. Default is Debug. 
+ +* -DLLVM_ENABLE_ASSERTIONS=On --- Compile with assertion checks enabled (default is Yes for Debug builds, No for all other build types). + +## Building hpvm runtime +HPVM also includes a runtime library which comprises low-level, target-specific wrappers required by HPVM's code generation. ```shell cd projects/visc-rt make @@ -40,14 +58,3 @@ export LLVM_SRC_ROOT=<full path to hpvm>/llvm Benchmark suites have been migrated to the LLVM 4.0 build. They are located in [VISC](/llvm/test/VISC/parboil/benchmarks). -### Running an example (sgemm in parboil) -```shell -cd llvm/test/VISC/parboil/benchmarks/sgemm -make -make run -``` - -### Other Old Components - -Search this repository for "visc", case-insensitive. - diff --git a/hpvm/test/parboil/benchmarks/bfs/Makefile b/hpvm/test/parboil/benchmarks/bfs/Makefile index 8261dab47b1466863105d5f1aafd18a538661540..cc6db678298c4c66312248cc4f7a2df0bd134d3f 100644 --- a/hpvm/test/parboil/benchmarks/bfs/Makefile +++ b/hpvm/test/parboil/benchmarks/bfs/Makefile @@ -1,4 +1,4 @@ -PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil +PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = bfs # Default compile visc diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp index 1eec80bb2c78cb86efe3c71c83d2280e95104f5a..6227ef498f10eb82e685f4dab518caf17e7757ac 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp @@ -309,7 +309,45 @@ void BFS_Root(int *q1, size_t bytesq1, } +void BFS_Wrapper( + int *q1, size_t bytesq1, // 0, 1 + int *q2, size_t bytesq2, // 2, 3 + struct Node *g_graph_nodes, size_t bytesg_graph_nodes, // 4, 5 + struct Edge *g_graph_edges, size_t bytesg_graph_edges, // 6, 7 + int *g_color, size_t bytesg_color, // 8, 9 + int *g_cost, size_t bytesg_cost, // 10, 11 + int *tail, size_t bytestail, // 12, 13 + int no_of_nodes, int gray_shade, // 14, 15 + int k, long block, long grid // 16 - 18 +) 
{ + __visc__hint(visc::CPU_TARGET); + __visc__attributes( + 6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + 4, q2, g_color, g_cost, tail + ); + void* BlockingBFSNode = __visc__createNodeND(0, BFS_Root); + // Bind edges + __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 + __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 + __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 + __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 + __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes + __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes + __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges + __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges + __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color + __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color + __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost + __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost + __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail + __visc__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail + __visc__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes + __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade + __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k + __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block + __visc__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid +} FILE *fp; char* readFile(const char* fileName) @@ -509,7 +547,7 @@ int main( int argc, char** argv) //gray, //k, //0); - void* bfsDFG = __visc__launch(0, BFS_Root, (void*) args); + void* bfsDFG = __visc__launch(0, BFS_Wrapper, (void*) args); __visc__wait(bfsDFG); // Swap q1 and q2 // Swap q1 and q2 diff --git a/hpvm/test/parboil/benchmarks/cutcp/Makefile b/hpvm/test/parboil/benchmarks/cutcp/Makefile index d00d743bb59e5dd22abc713cad1483e316d4acb0..5e56793360aa479f604883f63b41a3ab8bb0cc58 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/Makefile +++ 
b/hpvm/test/parboil/benchmarks/cutcp/Makefile @@ -1,4 +1,4 @@ -PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil +PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = cutcp # Default compile visc diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp index 33dd2d68ca4c98b5ef877cfcfff3f243176c0010..c26621737c4c5979d863ccb7b42a8d4132f1b5c1 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp @@ -50,6 +50,8 @@ typedef struct __attribute__((__packed__)) __attribute__((aligned(16))){ float w; } float4; +extern float rsqrt(float x); + void Allocation(long block) { // Memory shared between threadblocks size_t bytes_AtomBinCache = sizeof(float)*BIN_CACHE_MAXLEN * BIN_DEPTH * 4; @@ -121,9 +123,9 @@ void CUTCPLeaf( int numbins; /* bin number determined by center of region */ - myBinIndex[0] = (int) __visc__floor((8 * xRegionIndex + 4) * h * BIN_INVLEN); - myBinIndex[1] = (int) __visc__floor((8 * yRegionIndex + 4) * h * BIN_INVLEN); - myBinIndex[2] = (int) __visc__floor((8 * zRegionIndex + 4) * h * BIN_INVLEN); + myBinIndex[0] = (int) floor((8 * xRegionIndex + 4) * h * BIN_INVLEN); + myBinIndex[1] = (int) floor((8 * yRegionIndex + 4) * h * BIN_INVLEN); + myBinIndex[2] = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN); /* first neighbor in list for me to cache */ nbrid = (tid >> 4); @@ -194,7 +196,7 @@ void CUTCPLeaf( if (r2 < cutoff2) { float s = (1.f - r2 * inv_cutoff2); - energy0 += aq * __visc__rsqrt(r2) * s * s; + energy0 += aq * rsqrt(r2) * s * s; //energy0 += aq * (1.0/__visc__sqrt(r2)) * s * s; } #else @@ -207,7 +209,7 @@ void CUTCPLeaf( if (r2 < cutoff2) { float s = (1.f - r2 * inv_cutoff2); - energy1 += aq * __visc__rsqrt(r2) * s * s; + energy1 += aq * rsqrt(r2) * s * s; //energy1 += aq * (1.0/__visc__sqrt(r2)) * s * s; } #else @@ -219,7 +221,7 @@ void CUTCPLeaf( if (r2 < cutoff2) { float s = (1.f - r2 * inv_cutoff2); 
- energy2 += aq * __visc__rsqrt(r2) * s * s; + energy2 += aq * rsqrt(r2) * s * s; //energy2 += aq * (1.0/__visc__sqrt(r2)) * s * s; } #else @@ -231,8 +233,8 @@ void CUTCPLeaf( if (r2 < cutoff2) { float s = (1.f - r2 * inv_cutoff2); - energy3 += aq * __visc__rsqrt(r2) * s * s; - //energy3 += aq * (1.0/__visc__rsqrt(r2)) * s * s; + energy3 += aq * rsqrt(r2) * s * s; + //energy3 += aq * (1.0/rsqrt(r2)) * s * s; } #else energy3 += (r2 < cutoff2); @@ -418,6 +420,55 @@ void CUTCPRoot( } +void CUTCPWrapper( + int binDim_x, + int binDim_y, + float4 *binBaseAddr, size_t bytes_binBaseAddr, + int offset, + float h, /* lattice spacing */ + float cutoff2, /* square of cutoff distance */ + float inv_cutoff2, + ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */ + int zRegionIndex, + // constant memory arguments the next two + int *NbrListLen, size_t bytes_NbrListLen, + xyz *NbrList, size_t bytes_NbrList, + long blockx, + long blocky, + long blockz, + long gridx, + long gridy, + long gridz +) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); + + void* BlockingCUTCPNode = __visc__createNodeND(0, CUTCPRoot); + + // Bind Inputs + __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x + __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y + __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr + __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr + __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset + __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h + __visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 + __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 + __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex + 
__visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen + __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen + __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList + __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList + __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx + __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky + __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz + __visc__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx + __visc__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy + __visc__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz +} + // ==================== Host Code ============================== int gpu_compute_cutoff_potential_lattice6overlap( @@ -938,7 +989,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( args->zRegionIndex = zRegionIndex; - CUTCP_DFG = __visc__launch(0, CUTCPRoot, (void*)args); + CUTCP_DFG = __visc__launch(0, CUTCPWrapper, (void*)args); __visc__wait(CUTCP_DFG); //llvm_visc_request_mem(regionZeroAddr, lnall*sizeof(ener_t)); } diff --git a/hpvm/test/parboil/benchmarks/hpvm-cava/src/main.c b/hpvm/test/parboil/benchmarks/hpvm-cava/src/main.c index e43bbb4f25c4c97c9907ebae37251c854860c3b5..c1c0130b4c2c0ec6ec7e792c72323b03a4d508a5 100644 --- a/hpvm/test/parboil/benchmarks/hpvm-cava/src/main.c +++ b/hpvm/test/parboil/benchmarks/hpvm-cava/src/main.c @@ -17,6 +17,11 @@ int NUM_CLASSES; int INPUT_DIM; int NUM_WORKER_THREADS; +// Type of struct holding the return value from the last node. 
+struct RetStruct { + size_t bytesRet; +}; + // Type of struct that is used to pass arguments to the HPVM dataflow graph // using the hpvm launch operation typedef struct __attribute__((__packed__)) { @@ -34,7 +39,8 @@ typedef struct __attribute__((__packed__)) { float*coefs; size_t bytes_coefs; float *l2_dist; size_t bytes_l2_dist; float *tone_map; size_t bytes_tone_map; - size_t row_size; size_t col_size; + int row_size; int col_size; + struct RetStruct ret; // Instance of RetStruct holding the return value. } RootIn; @@ -807,6 +813,10 @@ int main(int argc, char* argv[]) { __visc__wait(camPipeDFG); printf("\n\nPipeline execution completed!\n"); + printf( + "Pipeline final stage returned %lu; should be %lu\n", + rootArgs->ret.bytesRet, bytes_image + ); printf("\n\nRequesting memory!\n"); // Request data from graph. diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc index 161ec4505707e050ed8700a700e44c9a882049e7..16f2341a2203e3510b9c00a91eedd3ac53d296d4 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc @@ -187,19 +187,13 @@ void SgemmTB(float *A, size_t bytesA, } // Root node for sgemm - Creates work group node -void SgemmRoot(float *A, size_t bytesA, - int lda, - float *B, size_t bytesB, - int ldb, - float *C, size_t bytesC, - int ldc, - int k, - float alpha, - float beta, - long block_x, - long block_y, - long grid_x, - long grid_y) { +void SgemmRoot( + float *A, size_t bytesA, int lda, // 0-2 + float *B, size_t bytesB, int ldb, // 3-5 + float *C, size_t bytesC, int ldc, // 6-8 + int k, float alpha, float beta, // 9-11 + long block_x, long block_y, long grid_x, long grid_y // 12-15 +) { __visc__hint(visc::CPU_TARGET); __visc__attributes(3, A, B, C, 1, C); void* SgemmTBNode = __visc__createNodeND(2, SgemmTB, grid_x, grid_y); @@ -222,6 +216,36 @@ void SgemmRoot(float *A, size_t bytesA, } +void SgemmWrapper( + float 
*A, size_t bytesA, int lda, // 0-2 + float *B, size_t bytesB, int ldb, // 3-5 + float *C, size_t bytesC, int ldc, // 6-8 + int k, float alpha, float beta, // 9-11 + long block_x, long block_y, long grid_x, long grid_y // 12-15 +) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void* SgemmRootNode = __visc__createNodeND(0, SgemmRoot); + + // Bind edges + __visc__bindIn(SgemmRootNode, 0, 0, 0); // Bind A + __visc__bindIn(SgemmRootNode, 1, 1, 0); // Bind bytesA + __visc__bindIn(SgemmRootNode, 2, 2, 0); // Bind lda + __visc__bindIn(SgemmRootNode, 3, 3, 0); // Bind B + __visc__bindIn(SgemmRootNode, 4, 4, 0); // Bind bytesB + __visc__bindIn(SgemmRootNode, 5, 5, 0); // Bind ldb + __visc__bindIn(SgemmRootNode, 6, 6, 0); // Bind C + __visc__bindIn(SgemmRootNode, 7, 7, 0); // Bind bytesC + __visc__bindIn(SgemmRootNode, 8, 8, 0); // Bind ldc + __visc__bindIn(SgemmRootNode, 9, 9, 0); // Bind k + __visc__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha + __visc__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta + __visc__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x + __visc__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y + __visc__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x + __visc__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y +} + // Creates root node for sgemm __attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc ) { @@ -269,7 +293,7 @@ __attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char trans ); pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION ); - void* sgemmDFG = __visc__launch(0, SgemmRoot, (void*) args); + void* sgemmDFG = __visc__launch(0, SgemmWrapper, (void*) args); __visc__wait(sgemmDFG); pb_SwitchToTimer( timers, pb_TimerID_COMPUTE ); diff --git a/hpvm/test/parboil/benchmarks/tpacf/Makefile 
b/hpvm/test/parboil/benchmarks/tpacf/Makefile index 0325de5b15be8dc682e8ef472d2b9a84e9a3729e..6140acd5ac3a196c8750b997c2e5904ba9585839 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/Makefile +++ b/hpvm/test/parboil/benchmarks/tpacf/Makefile @@ -1,4 +1,4 @@ -PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil +PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = tpacf # Default compile visc diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc index be44d864409cb3f3b8f4799df0387441ca89785d..d1482d732947aefc2f3eafb380f584680e692f7f 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc @@ -232,10 +232,7 @@ void BlockingTPACF(hist_t* histograms, size_t bytes_histograms, // next arg is read-only constant float* binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, - long block, - // shared memory args - struct cartesian* data_s, size_t bytes_data_s, - unsigned int* warp_hists, size_t bytes_warp_hists) { + long block) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, all_x_data, binb, 1, histograms); @@ -286,6 +283,32 @@ void TPACFRoot(hist_t* histograms, size_t bytes_histograms, } +void TPACFWrapper( + hist_t* histograms, size_t bytes_histograms, + float* all_x_data, size_t bytes_all_data, + // next arg is read-only constant + float* binb, size_t bytes_binb, + int NUM_SETS, int NUM_ELEMENTS, + long block, long grid +) { + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, all_x_data, binb, 1, histograms); + + void* BlockingTPACFNode = __visc__createNodeND(0, TPACFRoot); + + // Bind Inputs + __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms + __visc__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms + __visc__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data + __visc__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data + __visc__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb + 
__visc__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb + __visc__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS + __visc__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS + __visc__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block + __visc__bindIn(BlockingTPACFNode, 9, 9, 0); // Bind grid +} + // **===-----------------------------------------------------------===** int