From c2ce9e53ed31306d95e6ae08c44a588684a5de06 Mon Sep 17 00:00:00 2001 From: Yifan Zhao <yifanz16@illinois.edu> Date: Wed, 22 Jan 2020 14:02:25 -0600 Subject: [PATCH] Revert "Merge branch 'hpvm-devisc' into 'hpvm-reorg-9'" This reverts merge request !9 --- .gitignore | 2 +- hpvm/include/BuildDFG/BuildDFG.h | 10 +- .../{GenHPVM/GenHPVM.h => GenVISC/GenVISC.h} | 22 +- hpvm/include/SupportHPVM/HPVMTimer.h | 151 - .../{SupportHPVM => SupportVISC}/DFG2LLVM.h | 118 +- .../DFGTreeTraversal.h | 0 .../{SupportHPVM => SupportVISC}/DFGraph.h | 94 +- .../HPVMHint.h => SupportVISC/VISCHint.h} | 10 +- hpvm/include/SupportVISC/VISCTimer.h | 151 + .../HPVMUtils.h => SupportVISC/VISCUtils.h} | 148 +- hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp | 68 +- hpvm/lib/Transforms/CMakeLists.txt | 2 +- hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp | 10 +- .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp | 3139 ++++++++--------- .../Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp | 378 +- hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp | 894 ----- .../{GenHPVM => GenVISC}/CMakeLists.txt | 4 +- hpvm/lib/Transforms/GenVISC/GenVISC.cpp | 866 +++++ .../GenVISC.exports} | 0 .../{GenHPVM => GenVISC}/LLVMBuild.txt | 4 +- hpvm/lib/Transforms/LocalMem/LocalMem.cpp | 22 +- hpvm/llvm_installer/llvm_installer.sh | 8 +- hpvm/llvm_patches/apply_patch.sh | 2 +- hpvm/llvm_patches/include/IR/Attributes.td | 2 +- hpvm/llvm_patches/include/IR/Intrinsics.td | 2 +- .../llvm_patches/include/IR/IntrinsicsHPVM.td | 208 -- .../llvm_patches/include/IR/IntrinsicsVISC.td | 208 ++ hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp | 2 +- hpvm/llvm_patches/lib/AsmParser/LLParser.cpp | 6 +- hpvm/llvm_patches/lib/AsmParser/LLToken.h | 2 +- .../lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- .../lib/Bitcode/Writer/BitcodeWriter.cpp | 2 +- hpvm/llvm_patches/lib/IR/Attributes.cpp | 2 +- hpvm/projects/hpvm-rt/CMakeLists.txt | 22 - hpvm/projects/visc-rt/CMakeLists.txt | 22 + .../deviceStatusSwitchIntervals.txt | 0 .../{hpvm-rt => visc-rt}/device_abstraction.h | 2 +- 
hpvm/projects/{hpvm-rt => visc-rt}/makefile | 2 +- hpvm/projects/{hpvm-rt => visc-rt}/policy.h | 0 .../hpvm-rt.cpp => visc-rt/visc-rt.cpp} | 402 +-- .../{hpvm-rt/hpvm-rt.h => visc-rt/visc-rt.h} | 100 +- hpvm/test/CTestSuite/Makefile | 8 +- hpvm/test/CTestSuite/RUN.script | 2 +- hpvm/test/CTestSuite/gemm.c | 12 +- hpvm/test/CTestSuite/gemm_2.c | 12 +- hpvm/test/hpvm-cava/.gitignore | 2 +- hpvm/test/hpvm-cava/Makefile | 38 +- hpvm/test/hpvm-cava/Makefile.config.example | 18 +- hpvm/test/hpvm-cava/README.md | 2 +- hpvm/test/hpvm-cava/src/main.c | 1452 ++++---- hpvm/test/hpvm-cava/src/pipe_stages.c | 340 +- hpvm/test/include/hpvm.h | 73 - hpvm/test/include/visc.h | 73 + hpvm/test/parboil/RUN.parboil.script | 2 +- hpvm/test/parboil/benchmarks/bfs/Makefile | 4 +- .../src/opencl_cpu_baseline/kernel-spir64.ll | 2 +- .../bfs/src/opencl_cpu_baseline/main.cpp | 4 +- .../benchmarks/bfs/src/opencl_nvidia/main.cpp | 10 +- .../bfs/src/{hpvm => visc}/Makefile | 4 +- .../bfs/src/{hpvm => visc}/config.h | 0 .../bfs/src/{hpvm => visc}/main.cpp | 226 +- hpvm/test/parboil/benchmarks/cutcp/Makefile | 4 +- .../src/opencl_cpu_baseline/cutoff6overlap.c | 2 +- .../{kernel_hpvm.cl => kernel_visc.cl} | 0 ...{kernel_hpvm_x64.ll => kernel_visc_x64.ll} | 2 +- ...nel_hpvm_x64.spir => kernel_visc_x64.spir} | Bin .../src/opencl_cpu_baseline/kernel_x64.ll | 2 +- .../cutcp/src/opencl_nvidia/cutoff6overlap.c | 2 +- .../cutcp/src/{hpvm => visc}/Makefile | 4 +- .../cutcp/src/{hpvm => visc}/atom.h | 0 .../cutcp/src/{hpvm => visc}/cutcpu.c | 0 .../cutcp/src/{hpvm => visc}/cutoff.h | 0 .../cutcp/src/{hpvm => visc}/cutoff6overlap.c | 0 .../cutcp/src/{hpvm => visc}/excl.c | 0 .../cutcp/src/{hpvm => visc}/kernel.cl | 0 .../cutcp/src/{hpvm => visc}/macros.h | 0 .../cutcp/src/{hpvm => visc}/main.cpp | 210 +- .../benchmarks/cutcp/src/{hpvm => visc}/ocl.c | 0 .../benchmarks/cutcp/src/{hpvm => visc}/ocl.h | 0 .../cutcp/src/{hpvm => visc}/output.c | 0 .../cutcp/src/{hpvm => visc}/output.h | 0 .../cutcp/src/{hpvm => 
visc}/readatom.c | 0 hpvm/test/parboil/benchmarks/lbm/Makefile | 4 +- .../lbm/src/opencl_cpu_baseline/main.c | 4 +- .../benchmarks/lbm/src/opencl_cpu_long/main.c | 10 +- .../lbm/src/opencl_cpu_short/main.c | 10 +- .../benchmarks/lbm/src/opencl_nvidia/main.c | 2 +- .../lbm/src/opencl_nvidia_long/main.c | 10 +- .../lbm/src/opencl_nvidia_short/main.c | 10 +- .../lbm/src/{hpvm => visc}/Makefile | 4 +- .../lbm/src/{hpvm => visc}/layout_config.h | 0 .../benchmarks/lbm/src/{hpvm => visc}/lbm.cpp | 0 .../benchmarks/lbm/src/{hpvm => visc}/lbm.h | 0 .../lbm/src/{hpvm => visc}/lbm_macros.h | 0 .../lbm/src/{hpvm => visc}/main.cpp | 88 +- .../benchmarks/lbm/src/{hpvm => visc}/main.h | 0 hpvm/test/parboil/benchmarks/sgemm/Makefile | 4 +- .../sgemm/src/hpvm_vec_opt/Makefile | 8 - .../benchmarks/sgemm/src/opencl_base/main.cc | 4 +- .../sgemm/src/opencl_base_opt/main.cc | 4 +- .../sgemm/src/opencl_base_vec/main.cc | 4 +- .../benchmarks/sgemm/src/opencl_cpu/main.cc | 4 +- .../sgemm/src/opencl_cpu_4K/main.cc | 6 +- .../sgemm/src/opencl_cpu_baseline/main.cc | 6 +- .../sgemm/src/opencl_cpu_medium/main.cc | 6 +- .../sgemm/src/opencl_cpu_sm/kernel-spir64.ll | 2 +- .../sgemm/src/opencl_cpu_sm/main.cc | 2 +- .../sgemm/src/opencl_cpu_sm/test-spir64.ll | 2 +- .../sgemm/src/opencl_nvidia/main.cc | 2 +- .../benchmarks/sgemm/src/opencl_opt_8/main.cc | 4 +- .../sgemm/src/opencl_opt_8_4K/main.cc | 6 +- .../sgemm/src/opencl_opt_8_medium/main.cc | 6 +- .../sgemm/src/opencl_opt_8_vec/main.cc | 4 +- .../sgemm/src/{hpvm => visc}/Makefile | 4 +- .../benchmarks/sgemm/src/{hpvm => visc}/io.cc | 0 .../sgemm/src/{hpvm => visc}/kernel.cl | 0 .../sgemm/src/{hpvm => visc}/main.cc | 150 +- .../sgemm/src/{hpvm_tc => visc_opt}/Makefile | 4 +- .../sgemm/src/{hpvm_opt => visc_opt}/io.cc | 0 .../src/{hpvm_opt => visc_opt}/kernel.cl | 0 .../sgemm/src/{hpvm_opt => visc_opt}/main.cc | 30 +- .../sgemm/src/{hpvm_sh => visc_sh}/Makefile | 4 +- .../sgemm/src/{hpvm_sh => visc_sh}/io.cc | 0 .../sgemm/src/{hpvm_sh => 
visc_sh}/main.cc | 170 +- .../sgemm/src/{hpvm_vec => visc_tc}/Makefile | 4 +- .../sgemm/src/{hpvm_tc => visc_tc}/io.cc | 0 .../sgemm/src/{hpvm_tc => visc_tc}/main.cc | 28 +- .../src/{hpvm_tc_vec => visc_tc_vec}/Makefile | 4 +- .../src/{hpvm_tc_vec => visc_tc_vec}/io.cc | 0 .../src/{hpvm_tc_vec => visc_tc_vec}/main.cc | 28 +- .../sgemm/src/visc_tc_vec/main.visc.ll | 894 +++++ .../sgemm/src/{hpvm_opt => visc_vec}/Makefile | 4 +- .../sgemm/src/{hpvm_vec => visc_vec}/io.cc | 0 .../src/{hpvm_vec => visc_vec}/kernel.cl | 0 .../sgemm/src/{hpvm_vec => visc_vec}/main.cc | 30 +- .../sgemm/src/visc_vec/main.visc.ll | 869 +++++ .../sgemm/src/visc_vec_opt/Makefile | 8 + .../src/{hpvm_vec_opt => visc_vec_opt}/io.cc | 0 .../{hpvm_vec_opt => visc_vec_opt}/kernel.cl | 0 .../{hpvm_vec_opt => visc_vec_opt}/main.cc | 32 +- .../sgemm/src/visc_vec_opt/main.visc.ll | 889 +++++ hpvm/test/parboil/benchmarks/spmv/Makefile | 4 +- .../benchmarks/spmv/src/opencl_cpu/main.c | 10 +- .../spmv/src/opencl_cpu_baseline/main.c | 2 +- .../spmv/src/opencl_cpu_huge/main.c | 10 +- .../spmv/src/opencl_cpu_large/main.c | 10 +- .../benchmarks/spmv/src/opencl_nvidia/main.c | 2 +- .../spmv/src/opencl_nvidia_huge/main.c | 10 +- .../spmv/src/opencl_nvidia_large/main.c | 10 +- .../spmv/src/{hpvm => visc}/Makefile | 4 +- .../spmv/src/{hpvm => visc}/file.cpp | 0 .../benchmarks/spmv/src/{hpvm => visc}/file.h | 0 .../spmv/src/{hpvm => visc}/gpu_info.cpp | 0 .../spmv/src/{hpvm => visc}/gpu_info.h | 0 .../spmv/src/{hpvm => visc}/kernel.cl | 0 .../spmv/src/{hpvm => visc}/main.cpp | 172 +- .../spmv/src/visc/main.visc.ll.kernels.bc | Bin 0 -> 2464 bytes .../spmv/src/visc/main.visc.ll.kernels.ll | 138 + hpvm/test/parboil/benchmarks/stencil/Makefile | 4 +- .../benchmarks/stencil/src/opencl_base/main.c | 2 +- .../stencil/src/opencl_base_default/main.c | 10 +- .../stencil/src/opencl_base_large/main.c | 10 +- .../stencil/src/opencl_base_strided/main.c | 10 +- .../stencil/src/opencl_base_vec/main.c | 10 +- 
.../benchmarks/stencil/src/opencl_cpu/main.c | 10 +- .../stencil/src/opencl_cpu_baseline/main.c | 6 +- .../src/opencl_cpu_default/kernel-spir64.ll | 2 +- .../stencil/src/opencl_cpu_default/main.c | 12 +- .../src/opencl_cpu_large/kernel-spir64.ll | 2 +- .../stencil/src/opencl_cpu_large/main.c | 12 +- .../stencil/src/{hpvm => visc}/Makefile | 4 +- .../stencil/src/{hpvm => visc}/common.h | 0 .../stencil/src/{hpvm => visc}/file.cc | 0 .../stencil/src/{hpvm => visc}/file.h | 0 .../stencil/src/{hpvm => visc}/kernel.cl | 0 .../stencil/src/{hpvm => visc}/stencil.cpp | 140 +- .../src/{hpvm_vec => visc_vec}/common.h | 0 .../src/{hpvm_vec => visc_vec}/stencil.c | 26 +- .../stencil/src/visc_vec/stencil.visc.ll | 673 ++++ hpvm/test/parboil/benchmarks/tpacf/Makefile | 4 +- .../benchmarks/tpacf/src/opencl_base/main.cc | 2 +- .../tpacf/src/opencl_cpu_base/main.cc | 2 +- .../tpacf/src/{hpvm => visc}/Makefile | 4 +- .../tpacf/src/{hpvm => visc}/args.cc | 0 .../tpacf/src/{hpvm => visc}/args.h | 0 .../tpacf/src/{hpvm => visc}/main.cc | 128 +- .../tpacf/src/{hpvm => visc}/model.cc | 0 .../tpacf/src/{hpvm => visc}/model.h | 0 hpvm/test/parboil/common/include/parboil.h | 32 +- .../parboil/common/mk/{hpvm.mk => visc.mk} | 58 +- .../{hpvm.default.mk => visc.default.mk} | 18 +- hpvm/test/parboil/driver/options.py | 40 +- hpvm/test/parboil/parboilParser.py | 16 +- hpvm/test/pipeline/Makefile | 34 +- hpvm/test/pipeline/copyToVersions.sh | 6 +- ...adient.visc.merged.experiments.notimer.ll} | 200 +- ...lacian.visc.merged.experiments.notimer.ll} | 204 +- hpvm/test/pipeline/run.sh | 2 +- hpvm/test/pipeline/runscript.sh | 18 +- hpvm/test/pipeline/src/Makefile | 4 +- hpvm/test/pipeline/src/main.cc | 452 +-- hpvm/test/unitTests/CreateNodeAndEdge.c | 38 +- hpvm/test/unitTests/Makefile | 4 +- hpvm/test/unitTests/MallocIntrinsic.c | 18 +- hpvm/test/unitTests/PipelineIntrinsics.c | 16 +- .../unitTests/PipelineIntrinsics.malloc.c | 16 +- hpvm/test/unitTests/temp/3level.ll | 50 +- 
hpvm/test/unitTests/temp/Makefile | 4 +- hpvm/test/unitTests/temp/query2D.ll | 52 +- hpvm/test/unitTests/temp/query3D.ll | 56 +- hpvm/test/unitTests/temp/queryNodeInst.ll | 44 +- hpvm/test/unitTests/temp/queryNumDim.ll | 46 +- hpvm/test/unitTests/temp/queryNumNodeInst.ll | 50 +- hpvm/test/unitTests/temp/singleNode.ll | 30 +- hpvm/test/unitTests/temp/singleNodeStream.ll | 56 +- hpvm/test/unitTests/temp/twoLaunch.ll | 38 +- hpvm/test/unitTests/temp/twoNode.ll | 34 +- hpvm/test/unitTests/temp/twoNodeConnect.ll | 36 +- hpvm/test/unitTests/temp/twoNodeQuery.ll | 46 +- hpvm/test/unitTests/temp/twoNodeStream.ll | 50 +- 220 files changed, 9761 insertions(+), 6419 deletions(-) rename hpvm/include/{GenHPVM/GenHPVM.h => GenVISC/GenVISC.h} (67%) delete mode 100644 hpvm/include/SupportHPVM/HPVMTimer.h rename hpvm/include/{SupportHPVM => SupportVISC}/DFG2LLVM.h (82%) rename hpvm/include/{SupportHPVM => SupportVISC}/DFGTreeTraversal.h (100%) rename hpvm/include/{SupportHPVM => SupportVISC}/DFGraph.h (94%) rename hpvm/include/{SupportHPVM/HPVMHint.h => SupportVISC/VISCHint.h} (78%) create mode 100644 hpvm/include/SupportVISC/VISCTimer.h rename hpvm/include/{SupportHPVM/HPVMUtils.h => SupportVISC/VISCUtils.h} (84%) delete mode 100644 hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp rename hpvm/lib/Transforms/{GenHPVM => GenVISC}/CMakeLists.txt (74%) create mode 100644 hpvm/lib/Transforms/GenVISC/GenVISC.cpp rename hpvm/lib/Transforms/{GenHPVM/GenHPVM.exports => GenVISC/GenVISC.exports} (100%) rename hpvm/lib/Transforms/{GenHPVM => GenVISC}/LLVMBuild.txt (88%) delete mode 100644 hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td create mode 100644 hpvm/llvm_patches/include/IR/IntrinsicsVISC.td delete mode 100644 hpvm/projects/hpvm-rt/CMakeLists.txt create mode 100644 hpvm/projects/visc-rt/CMakeLists.txt rename hpvm/projects/{hpvm-rt => visc-rt}/deviceStatusSwitchIntervals.txt (100%) rename hpvm/projects/{hpvm-rt => visc-rt}/device_abstraction.h (96%) rename hpvm/projects/{hpvm-rt => 
visc-rt}/makefile (97%) rename hpvm/projects/{hpvm-rt => visc-rt}/policy.h (100%) rename hpvm/projects/{hpvm-rt/hpvm-rt.cpp => visc-rt/visc-rt.cpp} (82%) rename hpvm/projects/{hpvm-rt/hpvm-rt.h => visc-rt/visc-rt.h} (72%) delete mode 100644 hpvm/test/include/hpvm.h create mode 100644 hpvm/test/include/visc.h rename hpvm/test/parboil/benchmarks/bfs/src/{hpvm => visc}/Makefile (81%) rename hpvm/test/parboil/benchmarks/bfs/src/{hpvm => visc}/config.h (100%) rename hpvm/test/parboil/benchmarks/bfs/src/{hpvm => visc}/main.cpp (70%) rename hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/{kernel_hpvm.cl => kernel_visc.cl} (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/{kernel_hpvm_x64.ll => kernel_visc_x64.ll} (99%) rename hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/{kernel_hpvm_x64.spir => kernel_visc_x64.spir} (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/Makefile (85%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/atom.h (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/cutcpu.c (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/cutoff.h (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/cutoff6overlap.c (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/excl.c (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/kernel.cl (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/macros.h (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/main.cpp (82%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/ocl.c (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/ocl.h (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/output.c (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/output.h (100%) rename hpvm/test/parboil/benchmarks/cutcp/src/{hpvm => visc}/readatom.c (100%) rename 
hpvm/test/parboil/benchmarks/lbm/src/{hpvm => visc}/Makefile (85%) rename hpvm/test/parboil/benchmarks/lbm/src/{hpvm => visc}/layout_config.h (100%) rename hpvm/test/parboil/benchmarks/lbm/src/{hpvm => visc}/lbm.cpp (100%) rename hpvm/test/parboil/benchmarks/lbm/src/{hpvm => visc}/lbm.h (100%) rename hpvm/test/parboil/benchmarks/lbm/src/{hpvm => visc}/lbm_macros.h (100%) rename hpvm/test/parboil/benchmarks/lbm/src/{hpvm => visc}/main.cpp (86%) rename hpvm/test/parboil/benchmarks/lbm/src/{hpvm => visc}/main.h (100%) delete mode 100644 hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm => visc}/Makefile (83%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm => visc}/io.cc (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm => visc}/kernel.cl (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm => visc}/main.cc (69%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_tc => visc_opt}/Makefile (83%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_opt => visc_opt}/io.cc (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_opt => visc_opt}/kernel.cl (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_opt => visc_opt}/main.cc (90%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_sh => visc_sh}/Makefile (86%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_sh => visc_sh}/io.cc (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_sh => visc_sh}/main.cc (65%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_vec => visc_tc}/Makefile (83%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_tc => visc_tc}/io.cc (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_tc => visc_tc}/main.cc (90%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_tc_vec => visc_tc_vec}/Makefile (83%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_tc_vec => visc_tc_vec}/io.cc (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_tc_vec => visc_tc_vec}/main.cc (90%) create 
mode 100644 hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_opt => visc_vec}/Makefile (83%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_vec => visc_vec}/io.cc (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_vec => visc_vec}/kernel.cl (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_vec => visc_vec}/main.cc (90%) create mode 100644 hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll create mode 100644 hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_vec_opt => visc_vec_opt}/io.cc (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_vec_opt => visc_vec_opt}/kernel.cl (100%) rename hpvm/test/parboil/benchmarks/sgemm/src/{hpvm_vec_opt => visc_vec_opt}/main.cc (91%) create mode 100644 hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll rename hpvm/test/parboil/benchmarks/spmv/src/{hpvm => visc}/Makefile (88%) rename hpvm/test/parboil/benchmarks/spmv/src/{hpvm => visc}/file.cpp (100%) rename hpvm/test/parboil/benchmarks/spmv/src/{hpvm => visc}/file.h (100%) rename hpvm/test/parboil/benchmarks/spmv/src/{hpvm => visc}/gpu_info.cpp (100%) rename hpvm/test/parboil/benchmarks/spmv/src/{hpvm => visc}/gpu_info.h (100%) rename hpvm/test/parboil/benchmarks/spmv/src/{hpvm => visc}/kernel.cl (100%) rename hpvm/test/parboil/benchmarks/spmv/src/{hpvm => visc}/main.cpp (68%) create mode 100644 hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc create mode 100644 hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll rename hpvm/test/parboil/benchmarks/stencil/src/{hpvm => visc}/Makefile (80%) rename hpvm/test/parboil/benchmarks/stencil/src/{hpvm => visc}/common.h (100%) rename hpvm/test/parboil/benchmarks/stencil/src/{hpvm => visc}/file.cc (100%) rename hpvm/test/parboil/benchmarks/stencil/src/{hpvm => visc}/file.h (100%) rename 
hpvm/test/parboil/benchmarks/stencil/src/{hpvm => visc}/kernel.cl (100%) rename hpvm/test/parboil/benchmarks/stencil/src/{hpvm => visc}/stencil.cpp (66%) rename hpvm/test/parboil/benchmarks/stencil/src/{hpvm_vec => visc_vec}/common.h (100%) rename hpvm/test/parboil/benchmarks/stencil/src/{hpvm_vec => visc_vec}/stencil.c (90%) create mode 100644 hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll rename hpvm/test/parboil/benchmarks/tpacf/src/{hpvm => visc}/Makefile (82%) rename hpvm/test/parboil/benchmarks/tpacf/src/{hpvm => visc}/args.cc (100%) rename hpvm/test/parboil/benchmarks/tpacf/src/{hpvm => visc}/args.h (100%) rename hpvm/test/parboil/benchmarks/tpacf/src/{hpvm => visc}/main.cc (76%) rename hpvm/test/parboil/benchmarks/tpacf/src/{hpvm => visc}/model.cc (100%) rename hpvm/test/parboil/benchmarks/tpacf/src/{hpvm => visc}/model.h (100%) rename hpvm/test/parboil/common/mk/{hpvm.mk => visc.mk} (81%) rename hpvm/test/parboil/common/platform/{hpvm.default.mk => visc.default.mk} (61%) rename hpvm/test/pipeline/{gradient.hpvm.merged.experiments.notimer.ll => gradient.visc.merged.experiments.notimer.ll} (95%) rename hpvm/test/pipeline/{laplacian.hpvm.merged.experiments.notimer.ll => laplacian.visc.merged.experiments.notimer.ll} (95%) diff --git a/.gitignore b/.gitignore index 0da6a36714..a17e2716a5 100644 --- a/.gitignore +++ b/.gitignore @@ -34,5 +34,5 @@ hpvm/install/ hpvm/llvm/ hpvm/llvm-*.src.tar.xz hpvm/llvm-*.src/ -hpvm/projects/hpvm-rt/hpvm-rt.ll +hpvm/projects/visc-rt/visc-rt.ll hpvm/test/**/build/ diff --git a/hpvm/include/BuildDFG/BuildDFG.h b/hpvm/include/BuildDFG/BuildDFG.h index ca4c616da5..28230e135b 100644 --- a/hpvm/include/BuildDFG/BuildDFG.h +++ b/hpvm/include/BuildDFG/BuildDFG.h @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "SupportHPVM/DFGraph.h" +#include "SupportVISC/DFGraph.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include 
"llvm/IR/IntrinsicInst.h" @@ -58,10 +58,10 @@ public: // Functions virtual bool runOnModule(Module &M); - static bool isHPVMLaunchIntrinsic(Instruction *I); - static bool isHPVMGraphIntrinsic(Instruction *I); - static bool isHPVMQueryIntrinsic(Instruction *I); - static bool isHPVMIntrinsic(Instruction *I); + static bool isViscLaunchIntrinsic(Instruction *I); + static bool isViscGraphIntrinsic(Instruction *I); + static bool isViscQueryIntrinsic(Instruction *I); + static bool isViscIntrinsic(Instruction *I); static bool isTypeCongruent(Type *L, Type *R); // TODO: Maybe make these fields const diff --git a/hpvm/include/GenHPVM/GenHPVM.h b/hpvm/include/GenVISC/GenVISC.h similarity index 67% rename from hpvm/include/GenHPVM/GenHPVM.h rename to hpvm/include/GenVISC/GenVISC.h index 24798bc274..1db9929be7 100644 --- a/hpvm/include/GenHPVM/GenHPVM.h +++ b/hpvm/include/GenVISC/GenVISC.h @@ -1,4 +1,4 @@ -//== GenHPVM.h - Header file for "LLVM IR to HPVM IR Pass" =// +//== GenVISC.h - Header file for "LLVM IR to VISC IR Pass" =// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "SupportHPVM/HPVMTimer.h" +#include "SupportVISC/VISCTimer.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" @@ -18,24 +18,24 @@ using namespace llvm; -namespace genhpvm { -// GenHPVM - The first implementation. -struct GenHPVM : public ModulePass { +namespace genvisc { +// GenVISC - The first implementation. 
+struct GenVISC : public ModulePass { static char ID; // Pass identification, replacement for typeid - GenHPVM() : ModulePass(ID) {} + GenVISC() : ModulePass(ID) {} private: // Member variables Module *M; - FunctionCallee llvm_hpvm_initializeTimerSet; - FunctionCallee llvm_hpvm_switchToTimer; - FunctionCallee llvm_hpvm_printTimerSet; + FunctionCallee llvm_visc_initializeTimerSet; + FunctionCallee llvm_visc_switchToTimer; + FunctionCallee llvm_visc_printTimerSet; GlobalVariable *TimerSet; // Functions void initializeTimerSet(Instruction *); - void switchToTimer(enum hpvm_TimerID, Instruction *); + void switchToTimer(enum visc_TimerID, Instruction *); void printTimerSet(Instruction *); Value *getStringPointer(const Twine &S, Instruction *InsertBefore, const Twine &Name = ""); @@ -45,4 +45,4 @@ public: virtual bool runOnModule(Module &M); }; -} // namespace genhpvm +} // namespace genvisc diff --git a/hpvm/include/SupportHPVM/HPVMTimer.h b/hpvm/include/SupportHPVM/HPVMTimer.h deleted file mode 100644 index 05b24d41d6..0000000000 --- a/hpvm/include/SupportHPVM/HPVMTimer.h +++ /dev/null @@ -1,151 +0,0 @@ -//===------------ HPVMTimer.h - Header file for "HPVM Timer API" ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef HPVM_TIMER_HEADER -#define HPVM_TIMER_HEADER - -/************************** Timer Routines ***************************/ -extern "C" { - -/* A time or duration. 
*/ -//#if _POSIX_VERSION >= 200112L -typedef unsigned long long hpvm_Timestamp; /* time in microseconds */ -//#else -//# error "Timestamps not implemented" -//#endif - -enum hpvm_TimerState { - hpvm_Timer_STOPPED, - hpvm_Timer_RUNNING, -}; - -struct hpvm_Timer { - enum hpvm_TimerState state; - hpvm_Timestamp elapsed; /* Amount of time elapsed so far */ - hpvm_Timestamp init; /* Beginning of the current time interval, - * if state is RUNNING. End of the last - * recorded time interfal otherwise. */ -}; - -/* Reset a timer. - * Use this to initialize a timer or to clear - * its elapsed time. The reset timer is stopped. - */ -void hpvm_ResetTimer(struct hpvm_Timer *timer); - -/* Start a timer. The timer is set to RUNNING mode and - * time elapsed while the timer is running is added to - * the timer. - * The timer should not already be running. - */ -void hpvm_StartTimer(struct hpvm_Timer *timer); - -/* Stop a timer. - * This stops adding elapsed time to the timer. - * The timer should not already be stopped. - */ -void hpvm_StopTimer(struct hpvm_Timer *timer); - -/* Get the elapsed time in seconds. */ -double hpvm_GetElapsedTime(struct hpvm_Timer *timer); - -/* Execution time is assigned to one of these categories. 
*/ -enum hpvm_TimerID { - hpvm_TimerID_NONE = 0, - hpvm_TimerID_IO, /* Time spent in input/output */ - hpvm_TimerID_KERNEL, /* Time spent computing on the device, - * recorded asynchronously */ - hpvm_TimerID_COPY, /* Time spent synchronously moving data - * to/from device and allocating/freeing - * memory on the device */ - hpvm_TimerID_DRIVER, /* Time spent in the host interacting with the - * driver, primarily for recording the time - * spent queueing asynchronous operations */ - hpvm_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ - hpvm_TimerID_COMPUTE, /* Time for all program execution other - * than parsing command line arguments, - * I/O, kernel, and copy */ - hpvm_TimerID_OVERLAP, /* Time double-counted in asynchronous and - * host activity: automatically filled in, - * not intended for direct usage */ - // GPU FUNCTION - hpvm_TimerID_INIT_CTX, - hpvm_TimerID_CLEAR_CTX, - hpvm_TimerID_COPY_SCALAR, - hpvm_TimerID_COPY_PTR, - hpvm_TimerID_MEM_FREE, - hpvm_TimerID_READ_OUTPUT, - hpvm_TimerID_SETUP, - hpvm_TimerID_MEM_TRACK, - hpvm_TimerID_MEM_UNTRACK, - hpvm_TimerID_MISC, - // LAUNCH FUNCTION - hpvm_TimerID_PTHREAD_CREATE, - hpvm_TimerID_ARG_PACK, - hpvm_TimerID_ARG_UNPACK, - hpvm_TimerID_COMPUTATION, - hpvm_TimerID_OUTPUT_PACK, - hpvm_TimerID_OUTPUT_UNPACK, - - hpvm_TimerID_LAST /* Number of timer IDs */ -}; - -/* Dynamic list of asynchronously tracked times between events */ -struct hpvm_async_time_marker_list { - char *label; // actually just a pointer to a string - enum hpvm_TimerID timerID; /* The ID to which the interval beginning - * with this marker should be attributed */ - void *marker; - // cudaEvent_t marker; /* The driver event for this marker */ - struct hpvm_async_time_marker_list *next; -}; - -struct hpvm_SubTimer { - char *label; - struct hpvm_Timer timer; - struct hpvm_SubTimer *next; -}; - -struct hpvm_SubTimerList { - struct hpvm_SubTimer *current; - struct hpvm_SubTimer *subtimer_list; -}; - -/* A set of timers for 
recording execution times. */ -struct hpvm_TimerSet { - enum hpvm_TimerID current; - struct hpvm_async_time_marker_list *async_markers; - hpvm_Timestamp async_begin; - hpvm_Timestamp wall_begin; - struct hpvm_Timer timers[hpvm_TimerID_LAST]; - struct hpvm_SubTimerList *sub_timer_list[hpvm_TimerID_LAST]; -}; - -/* Reset all timers in the set. */ -void hpvm_InitializeTimerSet(struct hpvm_TimerSet *timers); - -void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label, - enum hpvm_TimerID hpvm_Category); - -/* Select which timer the next interval of time should be accounted - * to. The selected timer is started and other timers are stopped. - * Using hpvm_TimerID_NONE stops all timers. */ -inline void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, - enum hpvm_TimerID timer); - -void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, - enum hpvm_TimerID category); - -/* Print timer values to standard output. */ -void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers); - -/* Release timer resources */ -void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers); -} -#endif // HPVM_RT_HEADER diff --git a/hpvm/include/SupportHPVM/DFG2LLVM.h b/hpvm/include/SupportVISC/DFG2LLVM.h similarity index 82% rename from hpvm/include/SupportHPVM/DFG2LLVM.h rename to hpvm/include/SupportVISC/DFG2LLVM.h index 07147c6d90..b9e4cc4158 100644 --- a/hpvm/include/SupportHPVM/DFG2LLVM.h +++ b/hpvm/include/SupportVISC/DFG2LLVM.h @@ -1,7 +1,7 @@ #ifndef __DFG2LLVM_H__ #define __DFG2LLVM_H__ -//===---- DFG2LLVM.h - Header file for "HPVM Dataflow Graph to Target" ----===// +//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===// // // The LLVM Compiler Infrastructure // @@ -11,9 +11,9 @@ //===----------------------------------------------------------------------===// #include "BuildDFG/BuildDFG.h" -#include "SupportHPVM/HPVMHint.h" -#include "SupportHPVM/HPVMTimer.h" -#include "SupportHPVM/HPVMUtils.h" +#include "SupportVISC/VISCHint.h" +#include 
"SupportVISC/VISCTimer.h" +#include "SupportVISC/VISCUtils.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -26,7 +26,7 @@ using namespace builddfg; #define TIMER(X) \ do { \ - if (HPVMTimer) { \ + if (VISCTimer) { \ X; \ } \ } while (0) @@ -37,8 +37,8 @@ using namespace builddfg; namespace dfg2llvm { // Helper Functions -static inline ConstantInt *getTimerID(Module &, enum hpvm_TimerID); -static inline ConstantInt *getTimerID(Module &, enum hpvm::Target); +static inline ConstantInt *getTimerID(Module &, enum visc_TimerID); +static inline ConstantInt *getTimerID(Module &, enum visc::Target); bool hasAttribute(Function *, unsigned, Attribute::AttrKind); @@ -69,7 +69,7 @@ protected: // Member variables Module &M; BuildDFG &DFG; - bool HPVMTimer = false; + bool VISCTimer = false; std::string TargetName = "None"; // Map from Old function associated with DFNode to new cloned function with @@ -78,12 +78,12 @@ protected: // "Have we visited this function before?") DenseMap<DFNode *, Value *> OutputMap; - // HPVM Runtime API + // VISC Runtime API std::unique_ptr<Module> runtimeModule; - FunctionCallee llvm_hpvm_initializeTimerSet; - FunctionCallee llvm_hpvm_switchToTimer; - FunctionCallee llvm_hpvm_printTimerSet; + FunctionCallee llvm_visc_initializeTimerSet; + FunctionCallee llvm_visc_switchToTimer; + FunctionCallee llvm_visc_printTimerSet; GlobalVariable *TimerSet; GlobalVariable *GraphIDAddr; Instruction *InitCall; @@ -109,7 +109,7 @@ protected: // Virtual Functions virtual void initializeTimerSet(Instruction *); - virtual void switchToTimer(enum hpvm_TimerID, Instruction *); + virtual void switchToTimer(enum visc_TimerID, Instruction *); virtual void printTimerSet(Instruction *); virtual ~CodeGenTraversal() {} @@ -118,9 +118,9 @@ public: // Constructor CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {} - static bool checkPreferredTarget(DFNode *N, hpvm::Target T); - static bool 
preferredTargetIncludes(DFNode *N, hpvm::Target T); - hpvm::Target getPreferredTarget(DFNode *N); + static bool checkPreferredTarget(DFNode *N, visc::Target T); + static bool preferredTargetIncludes(DFNode *N, visc::Target T); + visc::Target getPreferredTarget(DFNode *N); virtual void visit(DFInternalNode *N) { // If code has already been generated for this internal node, skip the @@ -157,25 +157,25 @@ public: // -------------- CodeGenTraversal Implementation ----------------- -bool CodeGenTraversal::checkPreferredTarget(DFNode *N, hpvm::Target T) { +bool CodeGenTraversal::checkPreferredTarget(DFNode *N, visc::Target T) { Function *F = N->getFuncPointer(); Module *M = F->getParent(); NamedMDNode *HintNode; switch (T) { - case hpvm::GPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu"); + case visc::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); break; - case hpvm::SPIR_TARGET: - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_spir"); + case visc::SPIR_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); break; - case hpvm::CUDNN_TARGET: - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cudnn"); + case visc::CUDNN_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn"); break; - case hpvm::PROMISE_TARGET: - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_promise"); + case visc::PROMISE_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_promise"); break; - case hpvm::CPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu"); + case visc::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); break; default: llvm_unreachable("Target Not supported yet!"); @@ -190,37 +190,37 @@ bool CodeGenTraversal::checkPreferredTarget(DFNode *N, hpvm::Target T) { return false; } -hpvm::Target CodeGenTraversal::getPreferredTarget(DFNode *N) { - return hpvmUtils::getPreferredTarget(N->getFuncPointer()); +visc::Target CodeGenTraversal::getPreferredTarget(DFNode *N) { + return 
viscUtils::getPreferredTarget(N->getFuncPointer()); } -bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, hpvm::Target T) { +bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, visc::Target T) { Function *F = N->getFuncPointer(); Module *M = F->getParent(); std::vector<NamedMDNode *> HintNode; switch (T) { - case hpvm::GPU_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_gpu")); - HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu")); + case visc::GPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); break; - case hpvm::SPIR_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_spir")); - HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_spir")); + case visc::SPIR_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); break; - case hpvm::CPU_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu")); - HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu")); - HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_spir")); + case visc::CPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); break; - case hpvm::CUDNN_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cudnn")); + case visc::CUDNN_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn")); break; - case hpvm::PROMISE_TARGET: - HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_promise")); + case visc::PROMISE_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise")); break; - case hpvm::CPU_OR_GPU_TARGET: - case hpvm::CPU_OR_SPIR_TARGET: + case visc::CPU_OR_GPU_TARGET: + case 
visc::CPU_OR_SPIR_TARGET: assert(false && "Target should be one of CPU/GPU/SPIR\n"); break; default: @@ -308,11 +308,11 @@ Function *CodeGenTraversal::addArgument(Function *F, Type *Ty, Function *newF = Function::Create(FTy, F->getLinkage(), F->getName() + "_cloned", F->getParent()); renameNewArgument(newF, name); - newF = hpvmUtils::cloneFunction(F, newF, false); + newF = viscUtils::cloneFunction(F, newF, false); // Check if the function is used by a metadata node if (F->isUsedByMetadata()) { - hpvmUtils::fixHintMetadata(*F->getParent(), F, newF); + viscUtils::fixHintMetadata(*F->getParent(), F, newF); } return newF; @@ -396,32 +396,32 @@ Argument *CodeGenTraversal::getArgumentAt(Function *F, unsigned offset) { } void CodeGenTraversal::initTimerAPI() { - DECLARE(llvm_hpvm_initializeTimerSet); - DECLARE(llvm_hpvm_switchToTimer); - DECLARE(llvm_hpvm_printTimerSet); + DECLARE(llvm_visc_initializeTimerSet); + DECLARE(llvm_visc_switchToTimer); + DECLARE(llvm_visc_printTimerSet); } // Timer Routines // Initialize the timer set void CodeGenTraversal::initializeTimerSet(Instruction *InsertBefore) { - // DEBUG(errs() << "Inserting call to: " << *llvm_hpvm_initializeTimerSet << + // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << // "\n"); TIMER(TimerSet = new GlobalVariable( M, Type::getInt8PtrTy(M.getContext()), false, GlobalValue::CommonLinkage, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), - Twine("hpvmTimerSet_") + TargetName); + Twine("viscTimerSet_") + TargetName); DEBUG(errs() << "New global variable: " << *TimerSet << "\n"); - Value *TimerSetAddr = CallInst::Create(llvm_hpvm_initializeTimerSet, + Value *TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, None, "", InsertBefore); new StoreInst(TimerSetAddr, TimerSet, InsertBefore);); } -void CodeGenTraversal::switchToTimer(enum hpvm_TimerID timer, +void CodeGenTraversal::switchToTimer(enum visc_TimerID timer, Instruction *InsertBefore) { Value *switchArgs[] = 
{TimerSet, getTimerID(M, timer)}; - TIMER(CallInst::Create(llvm_hpvm_switchToTimer, + TIMER(CallInst::Create(llvm_visc_switchToTimer, ArrayRef<Value *>(switchArgs, 2), "", InsertBefore)); } @@ -430,16 +430,16 @@ void CodeGenTraversal::printTimerSet(Instruction *InsertBefore) { TIMER(TimerName = getStringPointer(TargetName + Twine("_Timer"), InsertBefore)); Value *printArgs[] = {TimerSet, TimerName}; - TIMER(CallInst::Create(llvm_hpvm_printTimerSet, + TIMER(CallInst::Create(llvm_visc_printTimerSet, ArrayRef<Value *>(printArgs, 2), "", InsertBefore)); } // Implementation of Helper Functions -static inline ConstantInt *getTimerID(Module &M, enum hpvm_TimerID timer) { +static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) { return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer); } -static inline ConstantInt *getTargetID(Module &M, enum hpvm::Target T) { +static inline ConstantInt *getTargetID(Module &M, enum visc::Target T) { return ConstantInt::get(Type::getInt32Ty(M.getContext()), T); } diff --git a/hpvm/include/SupportHPVM/DFGTreeTraversal.h b/hpvm/include/SupportVISC/DFGTreeTraversal.h similarity index 100% rename from hpvm/include/SupportHPVM/DFGTreeTraversal.h rename to hpvm/include/SupportVISC/DFGTreeTraversal.h diff --git a/hpvm/include/SupportHPVM/DFGraph.h b/hpvm/include/SupportVISC/DFGraph.h similarity index 94% rename from hpvm/include/SupportHPVM/DFGraph.h rename to hpvm/include/SupportVISC/DFGraph.h index d904e2401d..0c224a344c 100644 --- a/hpvm/include/SupportHPVM/DFGraph.h +++ b/hpvm/include/SupportVISC/DFGraph.h @@ -20,8 +20,8 @@ #ifndef LLVM_IR_DFGRAPH_H #define LLVM_IR_DFGRAPH_H -#include "SupportHPVM/HPVMHint.h" -#include "SupportHPVM/HPVMUtils.h" +#include "SupportVISC/VISCHint.h" +#include "SupportVISC/VISCUtils.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -158,7 +158,7 @@ public: } }; -// DFNode represents a single HPVM Dataflow Node in LLVM. 
+// DFNode represents a single VISC Dataflow Node in LLVM. // // A Dataflow Node basically consists of // 1. Pointer to a function describing this dataflow node @@ -210,8 +210,8 @@ private: ///< hierarchy unsigned Rank; ///< Ordering based on toplogical sort const DFNodeKind Kind; ///< Kind of Node Internal/Leaf - hpvm::Target Tag; ///< Code Generated for which backend - hpvm::Target Hint; ///< To store preferred backend + visc::Target Tag; ///< Code Generated for which backend + visc::Target Hint; ///< To store preferred backend public: virtual ~DFNode() { @@ -287,13 +287,13 @@ public: DFNodeKind getKind() const { return Kind; } - DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint, + DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint, DFInternalNode *_Parent, unsigned _NumOfDim, std::vector<Value *> _DimLimits, DFNodeKind _K); bool isRoot() const { // It is a root node is it was created from a launch intrinsic - if (II->getCalledFunction()->getName().equals("llvm.hpvm.launch")) { + if (II->getCalledFunction()->getName().equals("llvm.visc.launch")) { assert(Level == 0 && "Root node's level is zero."); return true; } @@ -326,9 +326,9 @@ public: unsigned getRank() const { return Rank; } - void setTag(hpvm::Target T) { Tag = T; } + void setTag(visc::Target T) { Tag = T; } - hpvm::Target getTag() const { return Tag; } + visc::Target getTag() const { return Tag; } void *getProperty(PropertyKind PType) { assert(PropertyList.count(PType) == 1 && @@ -342,24 +342,24 @@ public: PropertyList[PType] = PValue; } - void setGenFunc(Function *F, hpvm::Target T) { + void setGenFunc(Function *F, visc::Target T) { GenFunc = F; Tag = T; } Function *getGenFunc() const { return GenFunc; } - void setHasX86FuncForTarget(hpvm::Target T, bool isX86Func) { + void setHasX86FuncForTarget(visc::Target T, bool isX86Func) { switch (T) { - case hpvm::None: + case visc::None: return; // Do nothing. 
- case hpvm::CPU_TARGET: + case visc::CPU_TARGET: GenFuncInfo.cpu_hasX86Func = isX86Func; break; - case hpvm::GPU_TARGET: + case visc::GPU_TARGET: GenFuncInfo.gpu_hasX86Func = isX86Func; break; - case hpvm::CPU_OR_GPU_TARGET: + case visc::CPU_OR_GPU_TARGET: break; default: assert(false && "Unknown target\n"); @@ -368,15 +368,15 @@ public: return; } - bool hasX86GenFuncForTarget(hpvm::Target T) const { + bool hasX86GenFuncForTarget(visc::Target T) const { switch (T) { - case hpvm::None: + case visc::None: return false; - case hpvm::CPU_TARGET: + case visc::CPU_TARGET: return GenFuncInfo.cpu_hasX86Func; - case hpvm::GPU_TARGET: + case visc::GPU_TARGET: return GenFuncInfo.gpu_hasX86Func; - case hpvm::CPU_OR_GPU_TARGET: + case visc::CPU_OR_GPU_TARGET: assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n"); default: assert(false && "Unknown target\n"); @@ -384,10 +384,10 @@ public: return false; } - void addGenFunc(Function *F, hpvm::Target T, bool isX86Func) { + void addGenFunc(Function *F, visc::Target T, bool isX86Func) { switch (T) { - case hpvm::CPU_TARGET: + case visc::CPU_TARGET: if (GenFuncs.CPUGenFunc != NULL) { DEBUG(errs() << "Warning: Second generated CPU function for node " << FuncPointer->getName() << "\n"); @@ -395,7 +395,7 @@ public: GenFuncs.CPUGenFunc = F; GenFuncInfo.cpu_hasX86Func = isX86Func; break; - case hpvm::GPU_TARGET: + case visc::GPU_TARGET: if (GenFuncs.GPUGenFunc != NULL) { DEBUG(errs() << "Warning: Second generated GPU function for node " << FuncPointer->getName() << "\n"); @@ -403,25 +403,25 @@ public: GenFuncs.GPUGenFunc = F; GenFuncInfo.gpu_hasX86Func = isX86Func; break; - case hpvm::CPU_OR_GPU_TARGET: + case visc::CPU_OR_GPU_TARGET: assert(false && "A node function should be set with a tag specifying its \ type, not the node hint itself\n"); default: assert(false && "Unknown target for generated function\n"); } - Tag = hpvmUtils::getUpdatedTag(Tag, T); + Tag = viscUtils::getUpdatedTag(Tag, T); } - Function 
*getGenFuncForTarget(hpvm::Target T) const { + Function *getGenFuncForTarget(visc::Target T) const { switch (T) { - case hpvm::None: + case visc::None: return NULL; - case hpvm::CPU_TARGET: + case visc::CPU_TARGET: return GenFuncs.CPUGenFunc; - case hpvm::GPU_TARGET: + case visc::GPU_TARGET: return GenFuncs.GPUGenFunc; - case hpvm::CPU_OR_GPU_TARGET: + case visc::CPU_OR_GPU_TARGET: assert(false && "Requesting genarated node function with dual tag instead of \ CPU/GPU/SPIR/CUDNN/PROMISE\n"); @@ -431,19 +431,19 @@ public: return NULL; } - void removeGenFuncForTarget(hpvm::Target T) { + void removeGenFuncForTarget(visc::Target T) { switch (T) { - case hpvm::None: + case visc::None: return; - case hpvm::CPU_TARGET: + case visc::CPU_TARGET: GenFuncs.CPUGenFunc = NULL; GenFuncInfo.cpu_hasX86Func = false; break; - case hpvm::GPU_TARGET: + case visc::GPU_TARGET: GenFuncs.GPUGenFunc = NULL; GenFuncInfo.gpu_hasX86Func = false; break; - case hpvm::CPU_OR_GPU_TARGET: + case visc::CPU_OR_GPU_TARGET: assert(false && "Removing genarated node function with dual tag instead of \ CPU/GPU/SPIR/CUDNN/PROMISE\n"); @@ -453,9 +453,9 @@ public: return; } - void setTargetHint(hpvm::Target T) { Hint = T; } + void setTargetHint(visc::Target T) { Hint = T; } - hpvm::Target getTargetHint() const { return Hint; } + visc::Target getTargetHint() const { return Hint; } bool isDummyNode() const { return isEntryNode() || isExitNode(); } @@ -496,7 +496,7 @@ private: DFGraph *childGraph; ///< Pointer to dataflow graph // Constructor - DFInternalNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint, + DFInternalNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint, DFInternalNode *Parent, int NumOfDim, std::vector<Value *> DimLimits) : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, @@ -508,7 +508,7 @@ private: public: static DFInternalNode * Create(IntrinsicInst *II, Function *FuncPointer, - hpvm::Target Hint = hpvm::CPU_TARGET, DFInternalNode *Parent = NULL, + 
visc::Target Hint = visc::CPU_TARGET, DFInternalNode *Parent = NULL, int NumOfDim = 0, std::vector<Value *> DimLimits = std::vector<Value *>()) { @@ -539,14 +539,14 @@ class DFLeafNode : public DFNode { private: // Constructor - DFLeafNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint, + DFLeafNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint, DFInternalNode *Parent, int NumOfDim = 0, std::vector<Value *> DimLimits = std::vector<Value *>()) : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode) {} public: static DFLeafNode * - Create(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint, + Create(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint, DFInternalNode *Parent, int NumOfDim = 0, std::vector<Value *> DimLimits = std::vector<Value *>()) { return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits); @@ -558,7 +558,7 @@ public: // void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ }; -// DFEdge represents a single HPVM Dataflow Edge in LLVM. +// DFEdge represents a single VISC Dataflow Edge in LLVM. // // A Dataflow Edge basically consists of // 1. 
Pointer to the dataflow node that is the source of this edge @@ -634,8 +634,8 @@ DFGraph::DFGraph(DFInternalNode *P) { Parent = P; // Create dummy entry and exit nodes and add them to the graph Entry = - DFLeafNode::Create(NULL, Parent->getFuncPointer(), hpvm::None, Parent); - Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), hpvm::None, Parent); + DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent); + Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent); addChildDFNode(Entry); addChildDFNode(Exit); } @@ -655,7 +655,7 @@ bool DFGraph::isStreaming() { } //===--------------------- DFNode Outlined Functions --------------===// -DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint, +DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint, DFInternalNode *_Parent, unsigned _NumOfDim, std::vector<Value *> _DimLimits, DFNodeKind _K) : II(_II), FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim), @@ -663,7 +663,7 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint, Type *Ty = FuncPointer->getFunctionType()->getReturnType(); - // Allow the return type to be void too, in the hHPVM IR. If return type is + // Allow the return type to be void too, in the hVISC IR. If return type is // void, create an empty struct type and keep that as the return type of the // node. if (Ty->isVoidTy()) @@ -683,7 +683,7 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint, Level = (_Parent) ? 
_Parent->getLevel() + 1 : 0; Rank = 0; - Tag = hpvm::None; + Tag = visc::None; GenFuncs.CPUGenFunc = NULL; GenFuncs.GPUGenFunc = NULL; GenFuncs.SPIRGenFunc = NULL; diff --git a/hpvm/include/SupportHPVM/HPVMHint.h b/hpvm/include/SupportVISC/VISCHint.h similarity index 78% rename from hpvm/include/SupportHPVM/HPVMHint.h rename to hpvm/include/SupportVISC/VISCHint.h index 1ef4c6eb3b..99266b0718 100644 --- a/hpvm/include/SupportHPVM/HPVMHint.h +++ b/hpvm/include/SupportVISC/VISCHint.h @@ -1,4 +1,4 @@ -//===------------ HPVMTimer.h - Header file for "HPVM Timer API" ----------===// +//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===// // // The LLVM Compiler Infrastructure // @@ -7,12 +7,12 @@ // //===----------------------------------------------------------------------===// -#ifndef HPVM_HINT_HEADER -#define HPVM_HINT_HEADER +#ifndef VISC_HINT_HEADER +#define VISC_HINT_HEADER /************************** Hint Routines ***************************/ #ifdef __cplusplus -namespace hpvm { +namespace visc { #endif enum Target { @@ -32,4 +32,4 @@ enum Target { } #endif -#endif // HPVM_HINT_HEADER +#endif // VISC_HINT_HEADER diff --git a/hpvm/include/SupportVISC/VISCTimer.h b/hpvm/include/SupportVISC/VISCTimer.h new file mode 100644 index 0000000000..ce3dc8a5e0 --- /dev/null +++ b/hpvm/include/SupportVISC/VISCTimer.h @@ -0,0 +1,151 @@ +//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef VISC_TIMER_HEADER +#define VISC_TIMER_HEADER + +/************************** Timer Routines ***************************/ +extern "C" { + +/* A time or duration. 
*/ +//#if _POSIX_VERSION >= 200112L +typedef unsigned long long visc_Timestamp; /* time in microseconds */ +//#else +//# error "Timestamps not implemented" +//#endif + +enum visc_TimerState { + visc_Timer_STOPPED, + visc_Timer_RUNNING, +}; + +struct visc_Timer { + enum visc_TimerState state; + visc_Timestamp elapsed; /* Amount of time elapsed so far */ + visc_Timestamp init; /* Beginning of the current time interval, + * if state is RUNNING. End of the last + * recorded time interfal otherwise. */ +}; + +/* Reset a timer. + * Use this to initialize a timer or to clear + * its elapsed time. The reset timer is stopped. + */ +void visc_ResetTimer(struct visc_Timer *timer); + +/* Start a timer. The timer is set to RUNNING mode and + * time elapsed while the timer is running is added to + * the timer. + * The timer should not already be running. + */ +void visc_StartTimer(struct visc_Timer *timer); + +/* Stop a timer. + * This stops adding elapsed time to the timer. + * The timer should not already be stopped. + */ +void visc_StopTimer(struct visc_Timer *timer); + +/* Get the elapsed time in seconds. */ +double visc_GetElapsedTime(struct visc_Timer *timer); + +/* Execution time is assigned to one of these categories. 
*/ +enum visc_TimerID { + visc_TimerID_NONE = 0, + visc_TimerID_IO, /* Time spent in input/output */ + visc_TimerID_KERNEL, /* Time spent computing on the device, + * recorded asynchronously */ + visc_TimerID_COPY, /* Time spent synchronously moving data + * to/from device and allocating/freeing + * memory on the device */ + visc_TimerID_DRIVER, /* Time spent in the host interacting with the + * driver, primarily for recording the time + * spent queueing asynchronous operations */ + visc_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ + visc_TimerID_COMPUTE, /* Time for all program execution other + * than parsing command line arguments, + * I/O, kernel, and copy */ + visc_TimerID_OVERLAP, /* Time double-counted in asynchronous and + * host activity: automatically filled in, + * not intended for direct usage */ + // GPU FUNCTION + visc_TimerID_INIT_CTX, + visc_TimerID_CLEAR_CTX, + visc_TimerID_COPY_SCALAR, + visc_TimerID_COPY_PTR, + visc_TimerID_MEM_FREE, + visc_TimerID_READ_OUTPUT, + visc_TimerID_SETUP, + visc_TimerID_MEM_TRACK, + visc_TimerID_MEM_UNTRACK, + visc_TimerID_MISC, + // LAUNCH FUNCTION + visc_TimerID_PTHREAD_CREATE, + visc_TimerID_ARG_PACK, + visc_TimerID_ARG_UNPACK, + visc_TimerID_COMPUTATION, + visc_TimerID_OUTPUT_PACK, + visc_TimerID_OUTPUT_UNPACK, + + visc_TimerID_LAST /* Number of timer IDs */ +}; + +/* Dynamic list of asynchronously tracked times between events */ +struct visc_async_time_marker_list { + char *label; // actually just a pointer to a string + enum visc_TimerID timerID; /* The ID to which the interval beginning + * with this marker should be attributed */ + void *marker; + // cudaEvent_t marker; /* The driver event for this marker */ + struct visc_async_time_marker_list *next; +}; + +struct visc_SubTimer { + char *label; + struct visc_Timer timer; + struct visc_SubTimer *next; +}; + +struct visc_SubTimerList { + struct visc_SubTimer *current; + struct visc_SubTimer *subtimer_list; +}; + +/* A set of timers for 
recording execution times. */ +struct visc_TimerSet { + enum visc_TimerID current; + struct visc_async_time_marker_list *async_markers; + visc_Timestamp async_begin; + visc_Timestamp wall_begin; + struct visc_Timer timers[visc_TimerID_LAST]; + struct visc_SubTimerList *sub_timer_list[visc_TimerID_LAST]; +}; + +/* Reset all timers in the set. */ +void visc_InitializeTimerSet(struct visc_TimerSet *timers); + +void visc_AddSubTimer(struct visc_TimerSet *timers, char *label, + enum visc_TimerID visc_Category); + +/* Select which timer the next interval of time should be accounted + * to. The selected timer is started and other timers are stopped. + * Using visc_TimerID_NONE stops all timers. */ +inline void visc_SwitchToTimer(struct visc_TimerSet *timers, + enum visc_TimerID timer); + +void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, + enum visc_TimerID category); + +/* Print timer values to standard output. */ +void visc_PrintTimerSet(struct visc_TimerSet *timers); + +/* Release timer resources */ +void visc_DestroyTimerSet(struct visc_TimerSet *timers); +} +#endif // VISC_RT_HEADER diff --git a/hpvm/include/SupportHPVM/HPVMUtils.h b/hpvm/include/SupportVISC/VISCUtils.h similarity index 84% rename from hpvm/include/SupportHPVM/HPVMUtils.h rename to hpvm/include/SupportVISC/VISCUtils.h index 25b9880180..0efd20b5b5 100644 --- a/hpvm/include/SupportHPVM/HPVMUtils.h +++ b/hpvm/include/SupportVISC/VISCUtils.h @@ -1,5 +1,5 @@ // -//===---- DFG2LLVM.h - Header file for "HPVM Dataflow Graph to Target" ----===// +//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===// // // The LLVM Compiler Infrastructure // @@ -8,12 +8,12 @@ // //===----------------------------------------------------------------------===// -#ifndef HPVM_UTILS_HEADER -#define HPVM_UTILS_HEADER +#ifndef VISC_UTILS_HEADER +#define VISC_UTILS_HEADER #include <assert.h> -#include "SupportHPVM/HPVMHint.h" +#include "SupportVISC/VISCHint.h" #include 
"llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -29,31 +29,31 @@ using namespace llvm; -namespace hpvmUtils { +namespace viscUtils { // Helper Functions -static bool isHPVMCreateNodeIntrinsic(Instruction *I) { +static bool isViscCreateNodeIntrinsic(Instruction *I) { if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); return (II->getCalledFunction()->getName()) - .startswith("llvm.hpvm.createNode"); + .startswith("llvm.visc.createNode"); } -static bool isHPVMCreateNodeCall(Instruction *I) { +static bool isViscCreateNodeCall(Instruction *I) { if (!isa<CallInst>(I)) return false; CallInst *CI = cast<CallInst>(I); return (CI->getCalledValue()->stripPointerCasts()->getName()) - .startswith("__hpvm__createNode"); + .startswith("__visc__createNode"); } -static bool isHPVMLaunchCall(Instruction *I) { +static bool isViscLaunchCall(Instruction *I) { if (!isa<CallInst>(I)) return false; CallInst *CI = cast<CallInst>(I); return (CI->getCalledValue()->stripPointerCasts()->getName()) - .startswith("__hpvm__launch"); + .startswith("__visc__launch"); } // Creates a new createNode intrinsic, similar to II but with different // associated function F instead @@ -69,22 +69,22 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F, ArrayRef<Value *> CreateNodeArgs; switch (II->getIntrinsicID()) { - case Intrinsic::hpvm_createNode: { + case Intrinsic::visc_createNode: { CreateNodeArgs = ArrayRef<Value *>(Fp); break; } - case Intrinsic::hpvm_createNode1D: { + case Intrinsic::visc_createNode1D: { Value *CreateNode1DArgs[] = {Fp, II->getArgOperand(1)}; CreateNodeArgs = ArrayRef<Value *>(CreateNode1DArgs, 2); break; } - case Intrinsic::hpvm_createNode2D: { + case Intrinsic::visc_createNode2D: { Value *CreateNode2DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2)}; CreateNodeArgs = ArrayRef<Value *>(CreateNode2DArgs, 3); break; } - case Intrinsic::hpvm_createNode3D: { + case 
Intrinsic::visc_createNode3D: { Value *CreateNode3DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2), II->getArgOperand(3)}; CreateNodeArgs = ArrayRef<Value *>(CreateNode3DArgs, 4); @@ -101,7 +101,7 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F, return CreateNodeII; } -// Fix HPVM hints for this function +// Fix VISC hints for this function void fixHintMetadata(Module &M, Function *F, Function *G) { Metadata *MD_F = ValueAsMetadata::getIfExists(F); MDTuple *MDT_F = @@ -119,9 +119,9 @@ void fixHintMetadata(Module &M, Function *F, Function *G) { } }; - FixHint("hpvm_hint_gpu"); - FixHint("hpvm_hint_cpu"); - FixHint("hpvm_hint_cpu_gpu"); + FixHint("visc_hint_gpu"); + FixHint("visc_hint_cpu"); + FixHint("visc_hint_cpu_gpu"); } // Assuming that the changed function is a node function, it is only used as a @@ -138,7 +138,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { ++i) { Instruction *I = &*i; // Grab pointer to Instruction - if (isHPVMCreateNodeIntrinsic(I)) { + if (isViscCreateNodeIntrinsic(I)) { IntrinsicInst *II = cast<IntrinsicInst>(I); // The found createNode is not associated with the changed function if (II->getArgOperand(0) != F) @@ -150,7 +150,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II); II->replaceAllUsesWith(CreateNodeII); toBeErased.push_back(II); - } else if (isHPVMCreateNodeCall(I)) { + } else if (isViscCreateNodeCall(I)) { CallInst *CI = cast<CallInst>(I); // The found createNode is not associated with the changed function if (CI->getArgOperand(1) != F) @@ -161,7 +161,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { // Replace use of F with use of G CI->setArgOperand(1, G); DEBUG(errs() << "Fixed use: " << *CI << "\n"); - } else if (isHPVMLaunchCall(I)) { + } else if (isViscLaunchCall(I)) { CallInst *CI = cast<CallInst>(I); // The found launch call is not associated with the changed 
function if (CI->getArgOperand(1)->stripPointerCasts() != F) @@ -370,21 +370,21 @@ Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg, //------------------- Helper Functions For Handling Hints -------------------// // Return true if 1st arg (tag) contains 2nd (target) -bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) { +bool tagIncludesTarget(visc::Target Tag, visc::Target T) { switch (Tag) { - case hpvm::None: + case visc::None: return false; - case hpvm::CPU_TARGET: - if (T == hpvm::CPU_TARGET) + case visc::CPU_TARGET: + if (T == visc::CPU_TARGET) return true; return false; - case hpvm::GPU_TARGET: - if (T == hpvm::GPU_TARGET) + case visc::GPU_TARGET: + if (T == visc::GPU_TARGET) return true; return false; - case hpvm::CPU_OR_GPU_TARGET: - if ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET) || - (T == hpvm::CPU_OR_GPU_TARGET)) + case visc::CPU_OR_GPU_TARGET: + if ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET) || + (T == visc::CPU_OR_GPU_TARGET)) return true; return false; default: @@ -392,41 +392,41 @@ bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) { } } -bool isSingleTargetTag(hpvm::Target T) { - return ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET)); +bool isSingleTargetTag(visc::Target T) { + return ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET)); } // Add the specified target to the given tag -hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) { - assert(((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET)) && +visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) { + assert(((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET)) && "The target is only allowed to be a single target: CPU, GPU, SPIR, " "CUDNN, PROMISE\n"); switch (Tag) { - case hpvm::None: + case visc::None: return T; - case hpvm::CPU_TARGET: - if (T == hpvm::CPU_TARGET) - return hpvm::CPU_TARGET; - if (T == hpvm::GPU_TARGET) - return hpvm::CPU_OR_GPU_TARGET; + case visc::CPU_TARGET: + if (T == visc::CPU_TARGET) + return 
visc::CPU_TARGET; + if (T == visc::GPU_TARGET) + return visc::CPU_OR_GPU_TARGET; return T; - case hpvm::GPU_TARGET: - if (T == hpvm::CPU_TARGET) - return hpvm::CPU_OR_GPU_TARGET; - if (T == hpvm::GPU_TARGET) - return hpvm::GPU_TARGET; + case visc::GPU_TARGET: + if (T == visc::CPU_TARGET) + return visc::CPU_OR_GPU_TARGET; + if (T == visc::GPU_TARGET) + return visc::GPU_TARGET; return T; - case hpvm::CPU_OR_GPU_TARGET: - return hpvm::CPU_OR_GPU_TARGET; + case visc::CPU_OR_GPU_TARGET: + return visc::CPU_OR_GPU_TARGET; default: assert(false && "Unknown Target\n"); } return T; } -// This functions add the hint as metadata in hpvm code -void addHint(Function *F, hpvm::Target T) { +// This functions add the hint as metadata in visc code +void addHint(Function *F, visc::Target T) { // Get Module Module *M = F->getParent(); DEBUG(errs() << "Set preferred target for " << F->getName() << ": "); @@ -434,17 +434,17 @@ void addHint(Function *F, hpvm::Target T) { // Based on the hint, get the hint metadata NamedMDNode *HintNode; switch (T) { - case hpvm::GPU_TARGET: + case visc::GPU_TARGET: DEBUG(errs() << "GPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); break; - case hpvm::CPU_TARGET: + case visc::CPU_TARGET: DEBUG(errs() << "CPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); break; - case hpvm::CPU_OR_GPU_TARGET: + case visc::CPU_OR_GPU_TARGET: DEBUG(errs() << "CPU or GPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); break; default: llvm_unreachable("Unsupported Target Hint!"); @@ -457,8 +457,8 @@ void addHint(Function *F, hpvm::Target T) { HintNode->addOperand(N); } -// This function removes the hint as metadata in hpvm code -void removeHint(Function *F, hpvm::Target T) { +// This function removes the hint as metadata in 
visc code +void removeHint(Function *F, visc::Target T) { // Get Module Module *M = F->getParent(); DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T @@ -467,14 +467,14 @@ void removeHint(Function *F, hpvm::Target T) { // Based on the hint, get the hint metadata NamedMDNode *HintNode; switch (T) { - case hpvm::GPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu"); + case visc::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); break; - case hpvm::CPU_OR_GPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu"); + case visc::CPU_OR_GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); break; - case hpvm::CPU_TARGET: - HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu"); + case visc::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); break; default: llvm_unreachable("Unsupported Target Hint!"); @@ -501,7 +501,7 @@ void removeHint(Function *F, hpvm::Target T) { } } -hpvm::Target getPreferredTarget(Function *F) { +visc::Target getPreferredTarget(Function *F) { DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n"); Module *M = F->getParent(); @@ -517,16 +517,16 @@ hpvm::Target getPreferredTarget(Function *F) { return false; }; - if (FoundPrefTarget("hpvm_hint_cpu")) - return hpvm::CPU_TARGET; - if (FoundPrefTarget("hpvm_hint_gpu")) - return hpvm::GPU_TARGET; - if (FoundPrefTarget("hpvm_hint_cpu_gpu")) - return hpvm::CPU_OR_GPU_TARGET; + if (FoundPrefTarget("visc_hint_cpu")) + return visc::CPU_TARGET; + if (FoundPrefTarget("visc_hint_gpu")) + return visc::GPU_TARGET; + if (FoundPrefTarget("visc_hint_cpu_gpu")) + return visc::CPU_OR_GPU_TARGET; - return hpvm::None; + return visc::None; } -} // namespace hpvmUtils +} // namespace viscUtils -#endif // HPVM_UTILS_HEADER +#endif // VISC_UTILS_HEADER diff --git a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp index be3e6cae3d..058419f1dc 100644 --- 
a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp +++ b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp @@ -10,8 +10,8 @@ #define DEBUG_TYPE "buildDFG" #include "BuildDFG/BuildDFG.h" -#include "SupportHPVM/HPVMHint.h" -#include "SupportHPVM/HPVMUtils.h" +#include "SupportVISC/VISCHint.h" +#include "SupportVISC/VISCUtils.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/ValueSymbolTable.h" @@ -35,7 +35,7 @@ bool BuildDFG::runOnModule(Module &M) { for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { Instruction *I = &*i; // Grab pointer to Instruction - if (isHPVMLaunchIntrinsic(I)) { + if (isViscLaunchIntrinsic(I)) { DEBUG(errs() << "------------ Found launch site --------------\n"); II = cast<IntrinsicInst>(I); @@ -43,7 +43,7 @@ bool BuildDFG::runOnModule(Module &M) { // Intrinsic Instruction has been initialized from this point on. Function *F = cast<Function>(II->getOperand(0)->stripPointerCasts()); - Root = DFInternalNode::Create(II, F, hpvmUtils::getPreferredTarget(F)); + Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F)); Roots.push_back(Root); BuildGraph(Root, F); @@ -118,37 +118,37 @@ void BuildDFG::removeElementFromHandleToDFEdgeMap(Value *V) { HandleToDFEdgeMap.erase(V); } -// Returns true if instruction I is a hpvm launch intrinsic, false otherwise -bool BuildDFG::isHPVMLaunchIntrinsic(Instruction *I) { +// Returns true if instruction I is a visc launch intrinsic, false otherwise +bool BuildDFG::isViscLaunchIntrinsic(Instruction *I) { if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).equals("llvm.hpvm.launch"); + return (II->getCalledFunction()->getName()).equals("llvm.visc.launch"); } -// Returns true if instruction I is a hpvm graph intrinsic, false otherwise -bool BuildDFG::isHPVMGraphIntrinsic(Instruction *I) { +// Returns true if instruction I is a visc graph intrinsic, false otherwise +bool 
BuildDFG::isViscGraphIntrinsic(Instruction *I) { if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).startswith("llvm.hpvm.create") || - (II->getCalledFunction()->getName()).startswith("llvm.hpvm.bind"); + return (II->getCalledFunction()->getName()).startswith("llvm.visc.create") || + (II->getCalledFunction()->getName()).startswith("llvm.visc.bind"); } -// Returns true if instruction I is a hpvm query intrinsic, false otherwise -bool BuildDFG::isHPVMQueryIntrinsic(Instruction *I) { +// Returns true if instruction I is a visc query intrinsic, false otherwise +bool BuildDFG::isViscQueryIntrinsic(Instruction *I) { if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).startswith("llvm.hpvm.get"); + return (II->getCalledFunction()->getName()).startswith("llvm.visc.get"); } -// Returns true if instruction I is a hpvm intrinsic, false otherwise -bool BuildDFG::isHPVMIntrinsic(Instruction *I) { +// Returns true if instruction I is a visc intrinsic, false otherwise +bool BuildDFG::isViscIntrinsic(Instruction *I) { if (!isa<IntrinsicInst>(I)) return false; IntrinsicInst *II = cast<IntrinsicInst>(I); - return (II->getCalledFunction()->getName()).startswith("llvm.hpvm"); + return (II->getCalledFunction()->getName()).startswith("llvm.visc"); } // Two types are "congruent" if they are identical, or if they are both @@ -163,7 +163,7 @@ bool BuildDFG::isTypeCongruent(Type *L, Type *R) { return PL->getAddressSpace() == PR->getAddressSpace(); } -// Handles all the createNodeXX hpvm intrinsics. +// Handles all the createNodeXX visc intrinsics. 
void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { bool isInternalNode = false; @@ -173,7 +173,7 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { // internal node for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { Instruction *I = &*i; // Grab pointer to Instruction - if (isHPVMGraphIntrinsic(I)) + if (isViscGraphIntrinsic(I)) isInternalNode = true; } @@ -196,14 +196,14 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { // Create Internal DFNode, add it to the map and recursively build its // dataflow graph DFInternalNode *childDFNode = DFInternalNode::Create( - II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; BuildGraph(childDFNode, F); } else { // Create Leaf DFnode and add it to the map. DFLeafNode *childDFNode = DFLeafNode::Create( - II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; } @@ -336,11 +336,11 @@ void BuildDFG::handleBindOutput(DFInternalNode *N, IntrinsicInst *II) { void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) { DEBUG(errs() << "FUNCTION: " << F->getName() << "\n"); - // TODO: Place checks for valid hpvm functions. For example one of the - // check can be that any function that contains hpvm dataflow graph + // TODO: Place checks for valid visc functions. For example one of the + // check can be that any function that contains visc dataflow graph // construction intrinsics should not have other llvm IR statements. - // Iterate over all the instructions of a function and look for hpvm + // Iterate over all the instructions of a function and look for visc // intrinsics. 
for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { Instruction *I = &*i; // Grab pointer to Instruction @@ -349,25 +349,25 @@ void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) { DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": " << II->getCalledFunction()->getName() << "\n"); switch (II->getIntrinsicID()) { - case Intrinsic::hpvm_createNode: - case Intrinsic::hpvm_createNode1D: - case Intrinsic::hpvm_createNode2D: - case Intrinsic::hpvm_createNode3D: + case Intrinsic::visc_createNode: + case Intrinsic::visc_createNode1D: + case Intrinsic::visc_createNode2D: + case Intrinsic::visc_createNode3D: handleCreateNode(N, II); break; - case Intrinsic::hpvm_createEdge: + case Intrinsic::visc_createEdge: handleCreateEdge(N, II); break; - case Intrinsic::hpvm_bind_input: + case Intrinsic::visc_bind_input: handleBindInput(N, II); break; - case Intrinsic::hpvm_bind_output: + case Intrinsic::visc_bind_output: handleBindOutput(N, II); break; // TODO: Reconsider launch within a dataflow graph (recursion?) 
- case Intrinsic::hpvm_wait: - case Intrinsic::hpvm_launch: + case Intrinsic::visc_wait: + case Intrinsic::visc_launch: DEBUG(errs() << "Error: Launch/wait intrinsic used within a dataflow graph\n\t" << *II << "\n"); @@ -375,7 +375,7 @@ void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) { default: DEBUG( - errs() << "Error: Invalid HPVM Intrinsic inside Internal node!\n\t" + errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t" << *II << "\n"); break; } diff --git a/hpvm/lib/Transforms/CMakeLists.txt b/hpvm/lib/Transforms/CMakeLists.txt index 5c9b8b9fe0..68724684e5 100644 --- a/hpvm/lib/Transforms/CMakeLists.txt +++ b/hpvm/lib/Transforms/CMakeLists.txt @@ -2,5 +2,5 @@ add_subdirectory(BuildDFG) add_subdirectory(ClearDFG) add_subdirectory(DFG2LLVM_NVPTX) add_subdirectory(DFG2LLVM_X86) -add_subdirectory(GenHPVM) +add_subdirectory(GenVISC) add_subdirectory(LocalMem) diff --git a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp index c23043e782..6dae9e6977 100644 --- a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp +++ b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp @@ -18,7 +18,7 @@ using namespace llvm; using namespace builddfg; -// STATISTIC(IntrinsicCounter, "Counts number of hpvm intrinsics greeted"); +// STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted"); namespace { @@ -101,8 +101,8 @@ bool ClearDFG::runOnModule(Module &M) { // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); - Function *VI = M.getFunction("llvm.hpvm.init"); - assert(VI->hasOneUse() && "More than one use of llvm.hpvm.init\n"); + Function *VI = M.getFunction("llvm.visc.init"); + assert(VI->hasOneUse() && "More than one use of llvm.visc.init\n"); for (Value::user_iterator ui = VI->user_begin(), ue = VI->user_end(); ui != ue; ui++) { Instruction *I = dyn_cast<Instruction>(*ui); @@ -111,8 +111,8 @@ bool 
ClearDFG::runOnModule(Module &M) { VI->replaceAllUsesWith(UndefValue::get(VI->getType())); VI->eraseFromParent(); - Function *VC = M.getFunction("llvm.hpvm.cleanup"); - assert(VC->hasOneUse() && "More than one use of llvm.hpvm.cleanup\n"); + Function *VC = M.getFunction("llvm.visc.cleanup"); + assert(VC->hasOneUse() && "More than one use of llvm.visc.cleanup\n"); for (Value::user_iterator ui = VC->user_begin(), ue = VC->user_end(); ui != ue; ui++) { Instruction *I = dyn_cast<Instruction>(*ui); diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 584da07e6e..8a36e3b8af 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -15,39 +15,40 @@ #define SHARED_ADDRSPACE 3 #define DEBUG_TYPE "DFG2LLVM_NVPTX" -#include "SupportHPVM/DFG2LLVM.h" -#include "SupportHPVM/HPVMTimer.h" -#include "SupportHPVM/HPVMUtils.h" -#include "llvm-c/Core.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Linker/Linker.h" #include "llvm/Pass.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" +#include "llvm-c/Core.h" +#include "SupportVISC/VISCTimer.h" +#include "SupportVISC/DFG2LLVM.h" +#include "SupportVISC/VISCUtils.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/UseListOrder.h" #include 
"llvm/Support/ToolOutputFile.h" +#include "llvm/IR/UseListOrder.h" + #include <sstream> using namespace llvm; using namespace builddfg; using namespace dfg2llvm; -using namespace hpvmUtils; +using namespace viscUtils; -// HPVM Command line option to use timer or not -static cl::opt<bool> HPVMTimer_NVPTX("hpvm-timers-ptx", - cl::desc("Enable hpvm timers")); +// VISC Command line option to use timer or not +static cl::opt<bool> +VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers")); namespace { // Helper class declarations @@ -56,88 +57,94 @@ namespace { // in bytes. Would have preferred to use tuple but support not yet available class OutputPtr { public: - OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes) - : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} + OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes) + : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} - Value *h_ptr; - Value *d_ptr; - Value *bytes; + Value* h_ptr; + Value* d_ptr; + Value* bytes; }; // Class to maintain important kernel info required for generating runtime // calls class Kernel { public: - Kernel( - Function *_KF, DFLeafNode *_KLeafNode, - std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(), - std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap = - std::map<unsigned, std::pair<Value *, unsigned>>(), - std::vector<unsigned> _outArgMap = std::vector<unsigned>(), - unsigned _gridDim = 0, - std::vector<Value *> _globalWGSize = std::vector<Value *>(), - unsigned _blockDim = 0, - std::vector<Value *> _localWGSize = std::vector<Value *>()) - : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), - sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), - gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim), - localWGSize(_localWGSize) { - - assert(gridDim == globalWGSize.size() && - "gridDim should be same as the size of vector globalWGSize"); - assert(blockDim == localWGSize.size() && - "blockDim should be same as 
the size of vector localWGSize"); + Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap = + std::map<unsigned, unsigned>(), + std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap = + std::map<unsigned, std::pair<Value*, unsigned> >(), + std::vector<unsigned> _outArgMap = std::vector<unsigned>(), + unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(), + unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>()) + : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), + sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim), + globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) { + + assert(gridDim == globalWGSize.size() + && "gridDim should be same as the size of vector globalWGSize"); + assert(blockDim == localWGSize.size() + && "blockDim should be same as the size of vector localWGSize"); } - Function *KernelFunction; - DFLeafNode *KernelLeafNode; + Function* KernelFunction; + DFLeafNode* KernelLeafNode; std::map<unsigned, unsigned> inArgMap; // Map for shared memory arguments - std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap; + std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap; // Fields for (potential) allocation node - DFLeafNode *AllocationNode; - Function *AllocationFunction; + DFLeafNode* AllocationNode; + Function* AllocationFunction; std::map<unsigned, unsigned> allocInArgMap; std::vector<unsigned> outArgMap; unsigned gridDim; - std::vector<Value *> globalWGSize; + std::vector<Value*> globalWGSize; unsigned blockDim; - std::vector<Value *> localWGSize; + std::vector<Value*> localWGSize; std::vector<int> localDimMap; - std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; } - void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; } + std::map<unsigned, unsigned> &getInArgMap() { + return inArgMap; + } + void setInArgMap(std::map<unsigned, unsigned> 
map) { + inArgMap = map; + } - std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() { + std::map<unsigned, std::pair<Value*, unsigned> > &getSharedInArgMap() { return sharedInArgMap; } - void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) { + void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) { sharedInArgMap = map; } - std::vector<unsigned> &getOutArgMap() { return outArgMap; } - void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; } + std::vector<unsigned> &getOutArgMap() { + return outArgMap; + } + void setOutArgMap(std::vector<unsigned> map) { + outArgMap = map; + } - void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; } + void setLocalWGSize(std::vector<Value*> V) { + localWGSize = V; + } - bool hasLocalWG() const { return blockDim != 0; } + bool hasLocalWG() const { + return blockDim != 0; + } }; // Helper function declarations -static bool canBePromoted(Argument *arg, Function *F); -static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&, - Kernel *, ValueToValueMapTy &, Instruction *); -static Value *genWorkGroupPtr(Module &M, std::vector<Value *>, - ValueToValueMapTy &, Instruction *, - const Twine &WGName = "WGSize"); -static std::string getPTXFilename(const Module &); -static std::string getFilenameFromModule(const Module &M); +static bool canBePromoted(Argument* arg, Function* F); +static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*, + ValueToValueMapTy&, Instruction*); +static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&, + Instruction*, const Twine& WGName = "WGSize"); +static std::string getPTXFilename(const Module&); +static std::string getFilenameFromModule(const Module& M); static void changeDataLayout(Module &); static void changeTargetTriple(Module &); static void findReturnInst(Function *, std::vector<ReturnInst *> &); -static void findIntrinsicInst(Function *, Intrinsic::ID, - 
std::vector<IntrinsicInst *> &); +static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &); static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID); static std::string getAtomicOpName(Intrinsic::ID); @@ -147,6 +154,7 @@ struct DFG2LLVM_NVPTX : public DFG2LLVM { DFG2LLVM_NVPTX() : DFG2LLVM(ID) {} private: + public: bool runOnModule(Module &M); }; @@ -155,60 +163,57 @@ public: class CGT_NVPTX : public CodeGenTraversal { private: - // Member variables + //Member variables std::unique_ptr<Module> KernelM; - DFNode *KernelLaunchNode = NULL; - Kernel *kernel; - - // HPVM Runtime API - FunctionCallee llvm_hpvm_ocl_launch; - FunctionCallee llvm_hpvm_ocl_wait; - FunctionCallee llvm_hpvm_ocl_initContext; - FunctionCallee llvm_hpvm_ocl_clearContext; - FunctionCallee llvm_hpvm_ocl_argument_shared; - FunctionCallee llvm_hpvm_ocl_argument_scalar; - FunctionCallee llvm_hpvm_ocl_argument_ptr; - FunctionCallee llvm_hpvm_ocl_output_ptr; - FunctionCallee llvm_hpvm_ocl_free; - FunctionCallee llvm_hpvm_ocl_getOutput; - FunctionCallee llvm_hpvm_ocl_executeNode; - - // Functions + DFNode* KernelLaunchNode = NULL; + Kernel* kernel; + + // VISC Runtime API + FunctionCallee llvm_visc_ocl_launch; + FunctionCallee llvm_visc_ocl_wait; + FunctionCallee llvm_visc_ocl_initContext; + FunctionCallee llvm_visc_ocl_clearContext; + FunctionCallee llvm_visc_ocl_argument_shared; + FunctionCallee llvm_visc_ocl_argument_scalar; + FunctionCallee llvm_visc_ocl_argument_ptr; + FunctionCallee llvm_visc_ocl_output_ptr; + FunctionCallee llvm_visc_ocl_free; + FunctionCallee llvm_visc_ocl_getOutput; + FunctionCallee llvm_visc_ocl_executeNode; + + //Functions std::string getKernelsModuleName(Module &M); - void fixValueAddrspace(Value *V, unsigned addrspace); - std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *, - Function *); - Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags, - unsigned i); - void addCLMetadata(Function *F); - Function 
*transformFunctionToVoid(Function *F); - void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName); + void fixValueAddrspace(Value* V, unsigned addrspace); + std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*); + Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i); + void addCLMetadata(Function* F); + Function* transformFunctionToVoid(Function* F); + void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName); // Virtual Functions void init() { - HPVMTimer = HPVMTimer_NVPTX; + VISCTimer = VISCTimer_NVPTX; TargetName = "NVPTX"; } void initRuntimeAPI(); - void codeGen(DFInternalNode *N); - void codeGen(DFLeafNode *N); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); public: + // Constructor - CGT_NVPTX(Module &_M, BuildDFG &_DFG) - : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) { + CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) { init(); initRuntimeAPI(); - DEBUG(errs() << "Old module pointer: " << &_M << "\n"); - DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n"); + errs() << "Old module pointer: " << &_M << "\n"; + errs() << "New module pointer: " << KernelM.get() << "\n"; - // Copying instead of creating new, in order to preserve required info - // (metadata) Remove functions, global variables and aliases - std::vector<GlobalVariable *> GVVect; + // Copying instead of creating new, in order to preserve required info (metadata) + // Remove functions, global variables and aliases + std::vector<GlobalVariable*> GVVect; for (Module::global_iterator mi = KernelM->global_begin(), - me = KernelM->global_end(); - (mi != me); ++mi) { - GlobalVariable *GV = &*mi; + me = KernelM->global_end(); (mi != me); ++mi) { + GlobalVariable* GV = &*mi; GVVect.push_back(GV); } for (auto *GV : GVVect) { @@ -216,10 +221,10 @@ public: GV->eraseFromParent(); } - std::vector<Function *> FuncVect; - for 
(Module::iterator mi = KernelM->begin(), me = KernelM->end(); - (mi != me); ++mi) { - Function *F = &*mi; + std::vector<Function*> FuncVect; + for (Module::iterator mi = KernelM->begin(), + me = KernelM->end(); (mi != me); ++mi) { + Function* F = &*mi; FuncVect.push_back(F); } for (auto *F : FuncVect) { @@ -227,11 +232,10 @@ public: F->eraseFromParent(); } - std::vector<GlobalAlias *> GAVect; + std::vector<GlobalAlias*> GAVect; for (Module::alias_iterator mi = KernelM->alias_begin(), - me = KernelM->alias_end(); - (mi != me); ++mi) { - GlobalAlias *GA = &*mi; + me = KernelM->alias_end(); (mi != me); ++mi) { + GlobalAlias* GA = &*mi; GAVect.push_back(GA); } for (auto *GA : GAVect) { @@ -242,69 +246,73 @@ public: changeDataLayout(*KernelM); changeTargetTriple(*KernelM); + DEBUG(errs() << *KernelM); + } void writeKernelsModule(); }; -// Initialize the HPVM runtime API. This makes it easier to insert these calls +// Initialize the VISC runtime API. This makes it easier to insert these calls void CGT_NVPTX::initRuntimeAPI() { // Load Runtime API Module SMDiagnostic Err; - char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); Twine llvmSrcRoot = LLVM_SRC_ROOT; - Twine runtimeAPI = - llvmSrcRoot + "/../build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc"; + Twine runtimeAPI = llvmSrcRoot + "/../build/tools/hpvm/projects/visc-rt/visc-rt.bc"; runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); - if (runtimeModule == nullptr) { + if(runtimeModule == nullptr) { DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n"); assert(false && "couldn't parse runtime"); - } else - DEBUG(errs() << "Successfully loaded hpvm-rt API module\n"); + } + else + DEBUG(errs() << "Successfully loaded visc-rt API module\n"); // Get or insert the global declarations for launch/wait functions - DECLARE(llvm_hpvm_ocl_launch); - DECLARE(llvm_hpvm_ocl_wait); - 
DECLARE(llvm_hpvm_ocl_initContext); - DECLARE(llvm_hpvm_ocl_clearContext); - DECLARE(llvm_hpvm_ocl_argument_shared); - DECLARE(llvm_hpvm_ocl_argument_scalar); - DECLARE(llvm_hpvm_ocl_argument_ptr); - DECLARE(llvm_hpvm_ocl_output_ptr); - DECLARE(llvm_hpvm_ocl_free); - DECLARE(llvm_hpvm_ocl_getOutput); - DECLARE(llvm_hpvm_ocl_executeNode); + DECLARE(llvm_visc_ocl_launch); + DECLARE(llvm_visc_ocl_wait); + DECLARE(llvm_visc_ocl_initContext); + DECLARE(llvm_visc_ocl_clearContext); + DECLARE(llvm_visc_ocl_argument_shared); + DECLARE(llvm_visc_ocl_argument_scalar); + DECLARE(llvm_visc_ocl_argument_ptr); + DECLARE(llvm_visc_ocl_output_ptr); + DECLARE(llvm_visc_ocl_free); + DECLARE(llvm_visc_ocl_getOutput); + DECLARE(llvm_visc_ocl_executeNode); // Get or insert timerAPI functions as well if you plan to use timers initTimerAPI(); // Insert init context in main DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n"); - Function *VI = M.getFunction("llvm.hpvm.init"); - assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once"); + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); InitCall = cast<Instruction>(*VI->user_begin()); initializeTimerSet(InitCall); - switchToTimer(hpvm_TimerID_INIT_CTX, InitCall); - CallInst::Create(llvm_hpvm_ocl_initContext, - ArrayRef<Value *>(getTargetID(M, hpvm::GPU_TARGET)), "", - InitCall); - switchToTimer(hpvm_TimerID_NONE, InitCall); + switchToTimer(visc_TimerID_INIT_CTX, InitCall); + CallInst::Create(llvm_visc_ocl_initContext, + ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)), + "", InitCall); + switchToTimer(visc_TimerID_NONE, InitCall); - // Insert print instruction at hpvm exit + // Insert print instruction at visc exit DEBUG(errs() << "Gen Code to print NVPTX Timer\n"); - Function *VC = M.getFunction("llvm.hpvm.cleanup"); + Function* VC = M.getFunction("llvm.visc.cleanup"); DEBUG(errs() << *VC << "\n"); - assert(VC->getNumUses() == 1 && 
"__hpvm__clear should only be used once"); + assert(VC->getNumUses() == 1 && "__visc__clear should only be used once"); CleanupCall = cast<Instruction>(*VC->user_begin()); printTimerSet(CleanupCall); + + } // Generate Code to call the kernel @@ -312,37 +320,36 @@ void CGT_NVPTX::initRuntimeAPI() { // used to generate a function to associate with this leaf node. The function // is responsible for all the memory allocation/transfer and invoking the // kernel call on the device -void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, - const Twine &FileName) { +void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) { // Check if clone already exists. If it does, it means we have visited this // function before. - // assert(N->getGenFunc() == NULL && "Code already generated for this node"); +// assert(N->getGenFunc() == NULL && "Code already generated for this node"); - assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) == NULL && + assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL && "Code already generated for this node"); // Useful values - Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); - Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); + Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); + Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); // If kernel struct has not been initialized with kernel function, then fail assert(K != NULL && "No kernel found!!"); DEBUG(errs() << "Generating kernel call code\n"); - Function *F = N->getFuncPointer(); + Function* F = N->getFuncPointer(); + // Create of clone of F with no instructions. Only the type is the same as F // without the extra arguments. - Function *F_X86; + Function* F_X86; // Clone the function, if we are seeing this function for the first time. We // only need a clone in terms of type. 
ValueToValueMapTy VMap; // Create new function with the same type - F_X86 = - Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); // Loop over the arguments, copying the names of arguments over. Function::arg_iterator dest_iterator = F_X86->arg_begin(); @@ -355,25 +362,26 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, // Add a basic block to this empty function BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86); - ReturnInst *RI = ReturnInst::Create( - M.getContext(), UndefValue::get(F_X86->getReturnType()), BB); + ReturnInst* RI = ReturnInst::Create(M.getContext(), + UndefValue::get(F_X86->getReturnType()), BB); // FIXME: Adding Index and Dim arguments are probably not required except // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do // have those arguments) // Add Index and Dim arguments except for the root node - if (!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) F_X86 = addIdxDimArgs(F_X86); BB = &*F_X86->begin(); RI = cast<ReturnInst>(BB->getTerminator()); - // Add the generated function info to DFNode - // N->setGenFunc(F_X86, hpvm::CPU_TARGET); - N->addGenFunc(F_X86, hpvm::GPU_TARGET, true); - DEBUG(errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node " - << N->getFuncPointer()->getName() << "\n"); + //Add the generated function info to DFNode +// N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::GPU_TARGET, true); + errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node " + << N->getFuncPointer()->getName() << "\n"; + // Loop over the arguments, to create the VMap dest_iterator = F_X86->arg_begin(); @@ -406,53 +414,51 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, break; } - assert(C->isDummyNode() == false && "Internal Node only contains dummy - 
nodes!"); + assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!"); Function* CF = C->getFuncPointer(); */ - Function *KF = K->KernelLeafNode->getFuncPointer(); + Function* KF = K->KernelLeafNode->getFuncPointer(); // Initialize context - // DEBUG(errs() << "Initializing context" << "\n"); - // CallInst::Create(llvm_hpvm_ocl_initContext, None, "", RI); + //DEBUG(errs() << "Initializing context" << "\n"); + //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI); - DEBUG(errs() << "Initializing commandQ" - << "\n"); + DEBUG(errs() << "Initializing commandQ" << "\n"); // Initialize command queue - switchToTimer(hpvm_TimerID_SETUP, InitCall); - Value *fileStr = getStringPointer(FileName, InitCall, "Filename"); + switchToTimer(visc_TimerID_SETUP, InitCall); + Value* fileStr = getStringPointer(FileName, InitCall, "Filename"); DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n"); - DEBUG(errs() << "Generating code for kernel - " - << K->KernelFunction->getName() << "\n"); - Value *kernelStr = - getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName"); - - Value *LaunchInstArgs[] = {fileStr, kernelStr}; - - DEBUG(errs() << "Inserting launch call" - << "\n"); - CallInst *NVPTX_Ctx = CallInst::Create(llvm_hpvm_ocl_launch, - ArrayRef<Value *>(LaunchInstArgs, 2), - "graph" + KF->getName(), InitCall); + DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n"); + Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName"); + + Value* LaunchInstArgs[] = {fileStr, kernelStr}; + + DEBUG(errs() << "Inserting launch call" << "\n"); + CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch, + ArrayRef<Value*>(LaunchInstArgs, 2), + "graph"+KF->getName(), + InitCall); DEBUG(errs() << *NVPTX_Ctx << "\n"); - GraphIDAddr = new GlobalVariable(M, NVPTX_Ctx->getType(), false, + GraphIDAddr = new GlobalVariable(M, + NVPTX_Ctx->getType(), + false, GlobalValue::CommonLinkage, 
Constant::getNullValue(NVPTX_Ctx->getType()), - "graph" + KF->getName() + ".addr"); + "graph"+KF->getName()+".addr"); DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n"); - StoreInst *SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall); + StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall); DEBUG(errs() << *SI << "\n"); - switchToTimer(hpvm_TimerID_NONE, InitCall); - switchToTimer(hpvm_TimerID_SETUP, RI); - Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI); + switchToTimer(visc_TimerID_NONE, InitCall); + switchToTimer(visc_TimerID_SETUP, RI); + Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI); - // Iterate over the required input edges of the node and use the hpvm-rt API + // Iterate over the required input edges of the node and use the visc-rt API // to set inputs - DEBUG(errs() << "Iterate over input edges of node and insert hpvm api\n"); + DEBUG(errs() << "Iterate over input edges of node and insert visc api\n"); std::vector<OutputPtr> OutputPointers; - // Vector to hold the device memory object that need to be cleared before we - // release context - std::vector<Value *> DevicePointers; + // Vector to hold the device memory object that need to be cleared before we release + // context + std::vector<Value*> DevicePointers; std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap(); /* @@ -464,134 +470,133 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, */ - for (auto &InArgMapPair : kernelInArgMap) { + for(auto &InArgMapPair : kernelInArgMap) { unsigned i = InArgMapPair.first; - Value *inputVal = getArgumentAt(F_X86, InArgMapPair.second); - DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n"); + Value* inputVal = getArgumentAt(F_X86, InArgMapPair.second); + DEBUG(errs() << "\tArgument "<< i<< " = " << *inputVal << "\n"); // input value has been obtained. // Check if input is a scalar value or a pointer operand // For scalar values such as int, float, etc. 
the size is simply the size of // type on target machine, but for pointers, the size of data would be the // next integer argument - if (inputVal->getType()->isPointerTy()) { + if(inputVal->getType()->isPointerTy()) { - switchToTimer(hpvm_TimerID_COPY_PTR, RI); + switchToTimer(visc_TimerID_COPY_PTR, RI); // Pointer Input // CheckAttribute - Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False; - Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) && - !(hasAttribute(KF, i, Attribute::In))) - ? False - : True; - - Argument *A = getArgumentAt(KF, i); - if (isOutput == True) { + Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False; + Value* isInput = ((hasAttribute(KF, i, Attribute::Out)) + && !(hasAttribute(KF, i, Attribute::In)))? False : True; + + Argument* A = getArgumentAt(KF, i); + if(isOutput == True) { DEBUG(errs() << *A << " is an OUTPUT argument\n"); } - if (isInput == True) { + if(isInput == True) { DEBUG(errs() << *A << " is an INPUT argument\n"); } - Value *inputValI8Ptr = CastInst::CreatePointerCast( - inputVal, Type::getInt8PtrTy(M.getContext()), - inputVal->getName() + ".i8ptr", RI); + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); // Assert that the pointer argument size (next argument) is in the map - assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end()); - - Value *inputSize = getArgumentAt(F_X86, kernelInArgMap[i + 1]); - assert( - inputSize->getType() == Type::getInt64Ty(M.getContext()) && - "Pointer type input must always be followed by size (integer type)"); - Value *setInputArgs[] = { - GraphID, - inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()), i), - inputSize, - isInput, - isOutput}; - Value *d_ptr = - CallInst::Create(llvm_hpvm_ocl_argument_ptr, - ArrayRef<Value *>(setInputArgs, 6), "", RI); + assert(kernelInArgMap.find(i+1) != kernelInArgMap.end()); + + Value* inputSize = 
getArgumentAt(F_X86, kernelInArgMap[i+1]); + assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) + && "Pointer type input must always be followed by size (integer type)"); + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + inputSize, + isInput, + isOutput + }; + Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr, + ArrayRef<Value*>(setInputArgs, 6), "", RI); DevicePointers.push_back(d_ptr); // If this has out attribute, store the returned device pointer in // memory to read device memory later - if (isOutput == True) - OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); - } else { - switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); + if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); + } + else { + switchToTimer(visc_TimerID_COPY_SCALAR, RI); // Scalar Input // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst *inputValPtr = new AllocaInst( - inputVal->getType(), 0, inputVal->getName() + ".ptr", RI); - StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI); - - Value *inputValI8Ptr = CastInst::CreatePointerCast( - inputValPtr, Type::getInt8PtrTy(M.getContext()), - inputVal->getName() + ".i8ptr", RI); - - Value *setInputArgs[] = { - GraphID, inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()), i), - ConstantExpr::getSizeOf(inputVal->getType())}; - CallInst::Create(llvm_hpvm_ocl_argument_scalar, - ArrayRef<Value *>(setInputArgs, 4), "", RI); + AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), 0, inputVal->getName()+".ptr", RI); + StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI); + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + 
ConstantExpr::getSizeOf(inputVal->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); } } - DEBUG( - errs() << "Setup shared memory arguments of node and insert hpvm api\n"); + DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n"); // Check to see if all the allocation sizes are constant (determined // statically) bool constSizes = true; - for (auto &e : K->getSharedInArgMap()) { + for (auto& e: K->getSharedInArgMap()) { constSizes &= isa<Constant>(e.second.first); } // If the sizes are all constant if (constSizes) { - for (auto &e : K->getSharedInArgMap()) { + for (auto& e: K->getSharedInArgMap()) { unsigned argNum = e.first; - Value *allocSize = e.second.first; + Value* allocSize = e.second.first; - DEBUG(errs() << "\tLocal Memory at " << argNum - << ", size = " << *allocSize << "\n"); + DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { // Shared memory ptr argument - scalar at size position - switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); + switchToTimer(visc_TimerID_COPY_SCALAR, RI); - assert(isa<Constant>(allocSize) && - "Constant shared memory size is expected"); + assert(isa<Constant>(allocSize) && "Constant shared memory size is expected"); - Value *setInputArgs[] = { - GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), - allocSize}; - CallInst::Create(llvm_hpvm_ocl_argument_shared, - ArrayRef<Value *>(setInputArgs, 3), "", RI); - } else { + Value* setInputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + allocSize + }; + CallInst::Create(llvm_visc_ocl_argument_shared, + ArrayRef<Value*>(setInputArgs, 3), "", RI); + } + else { // Sharem memory size argument - scalar at address position - switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); + switchToTimer(visc_TimerID_COPY_SCALAR, RI); // Store the scalar value on stack and then pass 
the pointer to its // location - AllocaInst *allocSizePtr = - new AllocaInst(allocSize->getType(), 0, - allocSize->getName() + ".sharedMem.ptr", RI); - StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI); - - Value *allocSizeI8Ptr = CastInst::CreatePointerCast( - allocSizePtr, Type::getInt8PtrTy(M.getContext()), - allocSize->getName() + ".sharedMem.i8ptr", RI); - - Value *setInputArgs[] = { - GraphID, allocSizeI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), - ConstantExpr::getSizeOf(allocSize->getType())}; - CallInst::Create(llvm_hpvm_ocl_argument_scalar, - ArrayRef<Value *>(setInputArgs, 4), "", RI); + AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, + allocSize->getName()+".sharedMem.ptr", RI); + StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, + Type::getInt8PtrTy(M.getContext()), + allocSize->getName()+".sharedMem.i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + ConstantExpr::getSizeOf(allocSize->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); } } } else { @@ -612,64 +617,68 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, ExtractValueInstVec.push_back(EI); } - for (auto &e : K->getSharedInArgMap()) { + for (auto& e: K->getSharedInArgMap()) { unsigned argNum = e.first; - Value *allocSize = ExtractValueInstVec[e.second.second / 2]; + Value* allocSize = ExtractValueInstVec[e.second.second/2]; - DEBUG(errs() << "\tLocal Memory at " << argNum - << ", size = " << *allocSize << "\n"); + DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { // Shared memory ptr argument - scalar at size position - switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); - - Value *setInputArgs[] = { - GraphID, 
ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), - allocSize}; - CallInst::Create(llvm_hpvm_ocl_argument_shared, - ArrayRef<Value *>(setInputArgs, 3), "", RI); - } else { + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + + Value* setInputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + allocSize + }; + CallInst::Create(llvm_visc_ocl_argument_shared, + ArrayRef<Value*>(setInputArgs, 3), "", RI); + } + else { // Sharem memory size argument - scalar at address position - switchToTimer(hpvm_TimerID_COPY_SCALAR, RI); + switchToTimer(visc_TimerID_COPY_SCALAR, RI); // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst *allocSizePtr = - new AllocaInst(allocSize->getType(), 0, - allocSize->getName() + ".sharedMem.ptr", RI); - StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI); - - Value *allocSizeI8Ptr = CastInst::CreatePointerCast( - allocSizePtr, Type::getInt8PtrTy(M.getContext()), - allocSize->getName() + ".sharedMem.i8ptr", RI); - - Value *setInputArgs[] = { - GraphID, allocSizeI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), - ConstantExpr::getSizeOf(allocSize->getType())}; - CallInst::Create(llvm_hpvm_ocl_argument_scalar, - ArrayRef<Value *>(setInputArgs, 4), "", RI); + AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, + allocSize->getName()+".sharedMem.ptr", RI); + StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, + Type::getInt8PtrTy(M.getContext()), + allocSize->getName()+".sharedMem.i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + ConstantExpr::getSizeOf(allocSize->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); } } } - DEBUG(errs() << "Setup output edges of node and insert hpvm api\n"); + + DEBUG(errs() << "Setup 
output edges of node and insert visc api\n"); // Set output if struct is not an empty struct - StructType *OutputTy = K->KernelLeafNode->getOutputType(); - std::vector<Value *> d_Outputs; - if (!OutputTy->isEmptyTy()) { - switchToTimer(hpvm_TimerID_COPY_PTR, RI); + StructType* OutputTy = K->KernelLeafNode->getOutputType(); + std::vector<Value*> d_Outputs; + if(!OutputTy->isEmptyTy()) { + switchToTimer(visc_TimerID_COPY_PTR, RI); // Not an empty struct // Iterate over all elements of the struct and put them in - for (unsigned i = 0; i < OutputTy->getNumElements(); i++) { - unsigned outputIndex = KF->getFunctionType()->getNumParams() + i; - Value *setOutputArgs[] = { - GraphID, - ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex), - ConstantExpr::getSizeOf(OutputTy->getElementType(i))}; - - CallInst *d_Output = CallInst::Create(llvm_hpvm_ocl_output_ptr, - ArrayRef<Value *>(setOutputArgs, 3), - "d_output." + KF->getName(), RI); + for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + unsigned outputIndex = KF->getFunctionType()->getNumParams()+i; + Value* setOutputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex), + ConstantExpr::getSizeOf(OutputTy->getElementType(i)) + }; + + CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr, + ArrayRef<Value*>(setOutputArgs, 3), + "d_output."+KF->getName(), + RI); d_Outputs.push_back(d_Output); } } @@ -679,41 +688,50 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, // Allocate size_t[numDims] space on stack. 
Store the work group sizes and // pass it as an argument to ExecNode - switchToTimer(hpvm_TimerID_MISC, RI); + switchToTimer(visc_TimerID_MISC, RI); Value *workDim, *LocalWGPtr, *GlobalWGPtr; getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI); - switchToTimer(hpvm_TimerID_KERNEL, RI); - Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr}; - CallInst *Event = CallInst::Create(llvm_hpvm_ocl_executeNode, - ArrayRef<Value *>(ExecNodeArgs, 4), - "event." + KF->getName(), RI); + switchToTimer(visc_TimerID_KERNEL, RI); + Value* ExecNodeArgs[] = {GraphID, + workDim, + LocalWGPtr, + GlobalWGPtr + }; + CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode, + ArrayRef<Value*>(ExecNodeArgs, 4), + "event."+KF->getName(), + RI); DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); // Wait for Kernel to Finish - CallInst::Create(llvm_hpvm_ocl_wait, ArrayRef<Value *>(GraphID), "", RI); + CallInst::Create(llvm_visc_ocl_wait, + ArrayRef<Value*>(GraphID), + "", + RI); - switchToTimer(hpvm_TimerID_READ_OUTPUT, RI); + switchToTimer(visc_TimerID_READ_OUTPUT, RI); // Read Output Struct if not empty - if (!OutputTy->isEmptyTy()) { - std::vector<Value *> h_Outputs; - Value *KernelOutput = UndefValue::get(OutputTy); - for (unsigned i = 0; i < OutputTy->getNumElements(); i++) { - Value *GetOutputArgs[] = { - GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), - d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))}; - CallInst *h_Output = CallInst::Create( - llvm_hpvm_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4), - "h_output." 
+ KF->getName() + ".addr", RI); + if(!OutputTy->isEmptyTy()) { + std::vector<Value*>h_Outputs; + Value* KernelOutput = UndefValue::get(OutputTy); + for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + Value* GetOutputArgs[] = {GraphID, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + d_Outputs[i], + ConstantExpr::getSizeOf(OutputTy->getElementType(i)) + }; + CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput, + ArrayRef<Value*>(GetOutputArgs, 4), + "h_output."+KF->getName()+".addr", + RI); // Read each device pointer listed in output struct // Load the output struct - CastInst *BI = BitCastInst::CreatePointerCast( - h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr", - RI); - - Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI); - KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, - ArrayRef<unsigned>(i), - KF->getName() + "output", RI); + CastInst* BI = BitCastInst::CreatePointerCast(h_Output, + OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI); + + Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI); + KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i), + KF->getName()+"output", RI); } OutputMap[K->KernelLeafNode] = KernelOutput; } @@ -728,76 +746,75 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n"); DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n"); - Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, - output.bytes}; CallInst* CI = CallInst::Create(llvm_hpvm_ocl_getOutput, + Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; + CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, ArrayRef<Value*>(GetOutputArgs, 4), "", RI); }*/ - switchToTimer(hpvm_TimerID_MEM_FREE, RI); + switchToTimer(visc_TimerID_MEM_FREE, RI); // Clear Context and free device memory - DEBUG(errs() << "Clearing context" - << 
"\n"); + DEBUG(errs() << "Clearing context" << "\n"); // Free Device Memory - for (auto d_ptr : DevicePointers) { - CallInst::Create(llvm_hpvm_ocl_free, ArrayRef<Value *>(d_ptr), "", RI); + for(auto d_ptr: DevicePointers) { + CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI); } - switchToTimer(hpvm_TimerID_CLEAR_CTX, CleanupCall); + switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall); // Clear Context - LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall); - CallInst::Create(llvm_hpvm_ocl_clearContext, ArrayRef<Value *>(LI), "", - CleanupCall); - switchToTimer(hpvm_TimerID_NONE, CleanupCall); + LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall); + CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); + switchToTimer(visc_TimerID_NONE, CleanupCall); - switchToTimer(hpvm_TimerID_MISC, RI); + switchToTimer(visc_TimerID_MISC, RI); DEBUG(errs() << "*** Generating epilogue code for the function****\n"); // Generate code for output bindings // Get Exit node - DFNode *C = N->getChildGraph()->getExit(); + DFNode* C = N->getChildGraph()->getExit(); // Get OutputType of this node - StructType *OutTy = N->getOutputType(); + StructType* OutTy = N->getOutputType(); Value *retVal = UndefValue::get(F_X86->getReturnType()); // Find the kernel's output arg map, to use instead of the bindings std::vector<unsigned> outArgMap = kernel->getOutArgMap(); // Find all the input edges to exit node - for (unsigned i = 0; i < OutTy->getNumElements(); i++) { + for (unsigned i=0; i < OutTy->getNumElements(); i++) { DEBUG(errs() << "Output Edge " << i << "\n"); // Find the incoming edge at the requested input port - DFEdge *E = C->getInDFEdgeAt(i); + DFEdge* E = C->getInDFEdgeAt(i); assert(E && "No Binding for output element!"); // Find the Source DFNode associated with the incoming edge - DFNode *SrcDF = E->getSourceDF(); + DFNode* SrcDF = E->getSourceDF(); - DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() - << 
"\n"); + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); // If Source DFNode is a dummyNode, edge is from parent. Get the // argument from argument list of this internal node - Value *inputVal; - if (SrcDF->isEntryNode()) { + Value* inputVal; + if(SrcDF->isEntryNode()) { inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); - } else { + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { // edge is from a internal node // Check - code should already be generated for this source dfnode // FIXME: Since the 2-level kernel code gen has aspecific structure, we // can assume the SrcDF is same as Kernel Leaf node. // Use outArgMap to get correct mapping SrcDF = K->KernelLeafNode; - assert(OutputMap.count(SrcDF) && - "Source node call not found. Dependency violation!"); + assert(OutputMap.count(SrcDF) + && "Source node call not found. Dependency violation!"); // Find Output Value associated with the Source DFNode using OutputMap - Value *CI = OutputMap[SrcDF]; + Value* CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction std::vector<unsigned> IndexList; // i is the destination of DFEdge E // Use the mapping instead of the bindings - // IndexList.push_back(E->getSourcePosition()); +// IndexList.push_back(E->getSourcePosition()); IndexList.push_back(outArgMap[i]); - DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n"); - ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "",RI); inputVal = EI; } std::vector<unsigned> IdxList; @@ -806,33 +823,31 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, } DEBUG(errs() << "Extracted all\n"); - switchToTimer(hpvm_TimerID_NONE, RI); + switchToTimer(visc_TimerID_NONE, RI); retVal->setName("output"); - 
ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); ReplaceInstWithInst(RI, newRI); } + // Right now, only targeting the one level case. In general, device functions // can return values so we don't need to change them -void CGT_NVPTX::codeGen(DFInternalNode *N) { - DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName() - << "\n"); - if (KernelLaunchNode == NULL) - DEBUG(errs() << "No kernel launch node\n"); +void CGT_NVPTX::codeGen(DFInternalNode* N) { + errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n"; + if(KernelLaunchNode == NULL) + errs () << "No kernel launch node\n"; else { - DEBUG(errs() << "KernelLaunchNode: " - << KernelLaunchNode->getFuncPointer()->getName() << "\n"); + errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n"; } if (!KernelLaunchNode) { - DEBUG(errs() - << "No code generated (host code for kernel launch complete).\n"); + DEBUG(errs() << "No code generated (host code for kernel launch complete).\n"); return; } if (N == KernelLaunchNode) { DEBUG(errs() << "Found kernel launch node. Generating host code.\n"); - // TODO + //TODO // Now the remaining nodes to be visited should be ignored KernelLaunchNode = NULL; @@ -847,8 +862,7 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) { // TODO: Structure assumed: one thread node, one allocation node (at most), // TB node std::map<unsigned, unsigned> inmapFinal; - for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), - ie = inmap2.end(); + for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end(); ib != ie; ++ib) { inmapFinal[ib->first] = inmap1[ib->second]; } @@ -865,9 +879,8 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) { // 0 ... 
outmap2.size()-1 // The limit is the size of outmap2, because this is the number of kernel // output arguments for which the mapping matters - // For now, it reasonable to assume that all the kernel arguments are - // returned, maybe plys some others from other nodes, thus outmap2.size() <= - // outmap1.size() + // For now, it reasonable to assume that all the kernel arguments are returned, + // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size() for (unsigned i = 0; i < outmap2.size(); i++) { outmap1[i] = outmap2[outmap1[i]]; } @@ -875,14 +888,15 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) { // Track the source of local dimlimits for the kernel // Dimension limit can either be a constant or an argument of parent - // function. Since Internal node would no longer exist, we need to insert - // the localWGSize with values from the parent of N. - std::vector<Value *> localWGSizeMapped; + // function. Since Internal node would no longer exist, we need to insert the + // localWGSize with values from the parent of N. + std::vector<Value*> localWGSizeMapped; for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { if (isa<Constant>(kernel->localWGSize[i])) { // if constant, use as it is localWGSizeMapped.push_back(kernel->localWGSize[i]); - } else if (Argument *Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { + } + else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { // if argument, find the argument location in N. Use InArgMap of N to // find the source location in Parent of N. Retrieve the argument from // parent to insert in the vector. 
@@ -892,49 +906,46 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) { assert(N->getInArgMap().find(argNum) != N->getInArgMap().end()); unsigned parentArgNum = N->getInArgMap()[argNum]; - Argument *A = - getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); + Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); localWGSizeMapped.push_back(A); - } else { - assert( - false && - "LocalWGsize using value which is neither argument nor constant!"); + } + else { + assert(false && "LocalWGsize using value which is neither argument nor constant!"); } } // Update localWGSize vector of kernel kernel->setLocalWGSize(localWGSizeMapped); } + } -void CGT_NVPTX::codeGen(DFLeafNode *N) { - DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName() - << "\n"); +void CGT_NVPTX::codeGen(DFLeafNode* N) { + errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n"; // Skip code generation if it is a dummy node - if (N->isDummyNode()) { + if(N->isDummyNode()) { DEBUG(errs() << "Skipping dummy node\n"); return; } // Skip code generation if it is an allocation node - if (N->isAllocationNode()) { + if(N->isAllocationNode()) { DEBUG(errs() << "Skipping allocation node\n"); return; } // Generate code only if it has the right hint - // if(!checkPreferredTarget(N, hpvm::GPU_TARGET)) { - // errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; - // return; - // } - if (!preferredTargetIncludes(N, hpvm::GPU_TARGET)) { - DEBUG(errs() << "Skipping node: " << N->getFuncPointer()->getName() - << "\n"); +// if(!checkPreferredTarget(N, visc::GPU_TARGET)) { +// errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; +// return; +// } + if(!preferredTargetIncludes(N, visc::GPU_TARGET)) { + errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; return; } // Checking which node is the kernel launch - DFNode *PNode = N->getParent(); + DFNode* PNode = N->getParent(); int pLevel = PNode->getLevel(); int 
pReplFactor = PNode->getNumOfDim(); @@ -942,40 +953,42 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // (1) Parent is the top level node i.e., Root of DFG // OR // (2) Parent does not have multiple instances - DEBUG(errs() << "pLevel = " << pLevel << "\n"); - DEBUG(errs() << "pReplFactor = " << pReplFactor << "\n"); + errs() << "pLevel = " << pLevel << "\n"; + errs() << "pReplFactor = " << pReplFactor << "\n"; assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node."); // Only these options are supported - enum XLevelHierarchy { ONE_LEVEL, TWO_LEVEL } SelectedHierarchy; - if (pLevel == 1 || !pReplFactor) { - DEBUG(errs() - << "*************** Kernel Gen: 1-Level Hierarchy **************\n"); + enum XLevelHierarchy{ONE_LEVEL, TWO_LEVEL} SelectedHierarchy; + if(pLevel == 1 || !pReplFactor) { + errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n"; SelectedHierarchy = ONE_LEVEL; KernelLaunchNode = PNode; - kernel = new Kernel(NULL, N, N->getInArgMap(), N->getSharedInArgMap(), - N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits()); - } else { + kernel = new Kernel(NULL, + N, + N->getInArgMap(), + N->getSharedInArgMap(), + N->getOutArgMap(), + N->getNumOfDim(), + N->getDimLimits()); + } + else { // Converting a 2-level DFG to opencl kernel - DEBUG(errs() - << "*************** Kernel Gen: 2-Level Hierarchy **************\n"); - assert((pLevel >= 2) && - "Selected node not nested deep enough to be Kernel Node."); + errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n"; + assert((pLevel >= 2) && "Selected node not nested deep enough to be Kernel Node."); SelectedHierarchy = TWO_LEVEL; KernelLaunchNode = PNode->getParent(); - assert((PNode->getNumOfDim() == N->getNumOfDim()) && - "Dimension number must match"); + assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match"); // Contains the instructions generating the kernel configuration parameters - kernel = new Kernel(NULL, // kernel function - 
N, // kernel leaf node - N->getInArgMap(), // kenel argument mapping + kernel = new Kernel(NULL, // kernel function + N, // kernel leaf node + N->getInArgMap(), // kenel argument mapping N->getSharedInArgMap(), - N->getOutArgMap(), // kernel output mapping from the - // leaf to the interemediate node - PNode->getNumOfDim(), // gridDim - PNode->getDimLimits(), // grid size - N->getNumOfDim(), // blockDim - N->getDimLimits()); // block size + N->getOutArgMap(), // kernel output mapping from the leaf to the interemediate node + PNode->getNumOfDim(), // gridDim + PNode->getDimLimits(),// grid size + N->getNumOfDim(), // blockDim + N->getDimLimits()); // block size + } std::vector<Instruction *> IItoRemove; @@ -987,62 +1000,58 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // Look up if we have visited this function before. If we have, then just // get the cloned function pointer from DFNode. Otherwise, create the cloned // function and add it to the DFNode GenFunc. - // Function *F_nvptx = N->getGenFunc(); - Function *F_nvptx = N->getGenFuncForTarget(hpvm::GPU_TARGET); +// Function *F_nvptx = N->getGenFunc(); + Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET); - assert(F_nvptx == NULL && - "Error: Visiting a node for which code already generated"); + assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated"); // Clone the function ValueToValueMapTy VMap; - // F_nvptx->setName(FName+"_nvptx"); + //F_nvptx->setName(FName+"_nvptx"); Twine FName = F->getName(); StringRef fStr = FName.getSingleStringRef(); - Twine newFName = Twine(fStr, "_nvptx"); + Twine newFName = Twine(fStr, "_nvptx"); F_nvptx = CloneFunction(F, VMap); F_nvptx->setName(newFName); + // errs() << "Old Function Name: " << F->getName() << "\n"; // errs() << "New Function Name: " << F_nvptx->getName() << "\n"; F_nvptx->removeFromParent(); + // Insert the cloned function into the kernels module KernelM->getFunctionList().push_back(F_nvptx); - // TODO: Iterate over all the 
instructions of F_nvptx and identify the - // callees and clone them into this module. + + //TODO: Iterate over all the instructions of F_nvptx and identify the + //callees and clone them into this module. DEBUG(errs() << *F_nvptx->getType()); DEBUG(errs() << *F_nvptx); // Transform the function to void and remove all target dependent attributes // from the function F_nvptx = transformFunctionToVoid(F_nvptx); + + //Add generated function info to DFNode +// N->setGenFunc(F_nvptx, visc::GPU_TARGET); + N->addGenFunc(F_nvptx, visc::GPU_TARGET, false); - // Add generated function info to DFNode - // N->setGenFunc(F_nvptx, hpvm::GPU_TARGET); - N->addGenFunc(F_nvptx, hpvm::GPU_TARGET, false); - - DEBUG( - errs() - << "Removing all attributes from Kernel Function and adding nounwind\n"); - F_nvptx->removeAttributes(AttributeList::FunctionIndex, - F_nvptx->getAttributes().getFnAttributes()); + DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n"); + F_nvptx->removeAttributes(AttributeList::FunctionIndex, F_nvptx->getAttributes().getFnAttributes()); F_nvptx->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind); - // FIXME: For now, assume only one allocation node + //FIXME: For now, assume only one allocation node kernel->AllocationNode = NULL; - for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), - iee = N->indfedge_end(); + for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end(); ieb != iee; ++ieb) { DFNode *SrcDFNode = (*ieb)->getSourceDF(); - DEBUG(errs() << "Found edge from node: " - << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n"); - DEBUG(errs() << "isAllocationNode = " << SrcDFNode->isAllocationNode() - << "\n"); + DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n"); if 
(!SrcDFNode->isDummyNode()) { assert(SrcDFNode->isAllocationNode()); kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode); @@ -1056,22 +1065,19 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // If no allocation node was found, SharedMemArgs is empty if (kernel->AllocationNode) { - ValueToValueMapTy VMap; - Function *F_alloc = - CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); - // F_alloc->removeFromParent(); + Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); + //F_alloc->removeFromParent(); // Insert the cloned function into the kernels module - // M.getFunctionList().push_back(F_alloc); + //M.getFunctionList().push_back(F_alloc); - std::vector<IntrinsicInst *> HPVMMallocInstVec; - findIntrinsicInst(F_alloc, Intrinsic::hpvm_malloc, HPVMMallocInstVec); + std::vector<IntrinsicInst *> ViscMallocInstVec; + findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec); - for (unsigned i = 0; i < HPVMMallocInstVec.size(); i++) { - IntrinsicInst *II = HPVMMallocInstVec[i]; - assert(II->hasOneUse() && "hpvm_malloc result is used more than once"); - II->replaceAllUsesWith( - ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); + for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) { + IntrinsicInst *II = ViscMallocInstVec[i]; + assert(II->hasOneUse() && "visc_malloc result is used more than once"); + II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); II->eraseFromParent(); } kernel->AllocationFunction = F_alloc; @@ -1086,19 +1092,15 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { assert(RetStructTy && "Allocation node does not return a struct type"); unsigned numFields = RetStructTy->getNumElements(); */ - std::map<unsigned, std::pair<Value *, unsigned>> sharedInMap = - kernel->getSharedInArgMap(); - AllocationNodeProperty *APN = - (AllocationNodeProperty *)kernel->AllocationNode->getProperty( - DFNode::Allocation); - for (auto &AllocPair : APN->getAllocationList()) { + 
std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap(); + AllocationNodeProperty* APN = + (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation); + for (auto& AllocPair: APN->getAllocationList()) { unsigned destPos = AllocPair.first->getDestPosition(); unsigned srcPos = AllocPair.first->getSourcePosition(); SharedMemArgs.push_back(destPos); - sharedInMap[destPos] = - std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1); - sharedInMap[destPos + 1] = - std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1); + sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); + sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); } kernel->setSharedInArgMap(sharedInMap); } @@ -1108,14 +1110,12 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // global address space unsigned argIndex = 0; std::vector<unsigned> GlobalMemArgs; - for (Function::arg_iterator ai = F_nvptx->arg_begin(), - ae = F_nvptx->arg_end(); - ai != ae; ++ai) { - if (ai->getType()->isPointerTy()) { - // If the arguement is already chosen for shared memory arguemnt list, - // skip. Else put it in Global memory arguement list - if (std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == - 0) { + for(Function::arg_iterator ai = F_nvptx->arg_begin(), ae = F_nvptx->arg_end(); + ai != ae; ++ai) { + if (ai->getType()->isPointerTy()) { + // If the arguement is already chosen for shared memory arguemnt list, skip. 
+ // Else put it in Global memory arguement list + if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) { GlobalMemArgs.push_back(argIndex); } } @@ -1129,21 +1129,20 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // Optimization: Gloabl memory arguments, which are not modified and whose // loads are not dependent on node id of current node, should be moved to // constant memory, subject to size of course - std::vector<unsigned> ConstantMemArgs = - globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); + std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE); - // Function to replace call instructions to functions in the kernel +// Function to replace call instructions to functions in the kernel std::map<Function *, Function *> OrgToClonedFuncMap; std::vector<Function *> FuncToBeRemoved; - auto CloneAndReplaceCall = [&](CallInst *CI, Function *OrgFunc) { - Function *NewFunc; + auto CloneAndReplaceCall = [&] (CallInst *CI, Function *OrgFunc) { + Function* NewFunc; // Check if the called function has already been cloned before. auto It = OrgToClonedFuncMap.find(OrgFunc); - if (It == OrgToClonedFuncMap.end()) { + if(It == OrgToClonedFuncMap.end()) { ValueToValueMapTy VMap; NewFunc = CloneFunction(OrgFunc, VMap); OrgToClonedFuncMap[OrgFunc] = NewFunc; @@ -1152,48 +1151,43 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { NewFunc = (*It).second; } // Replace the calls to this function - std::vector<Value *> args; - for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { + std::vector<Value*> args; + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { args.push_back(CI->getArgOperand(i)); } - CallInst *Inst = CallInst::Create( - NewFunc, args, - OrgFunc->getReturnType()->isVoidTy() ? 
"" : CI->getName(), CI); + CallInst* Inst = CallInst::Create(NewFunc, args, + OrgFunc->getReturnType()->isVoidTy()? "" : CI->getName(), CI); CI->replaceAllUsesWith(Inst); IItoRemove.push_back(CI); return NewFunc; }; + // Go through all the instructions - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; - ++i) { + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { Instruction *I = &(*i); - // Leaf nodes should not contain HPVM graph intrinsics or launch - assert(!BuildDFG::isHPVMLaunchIntrinsic(I) && - "Launch intrinsic within a dataflow graph!"); - assert(!BuildDFG::isHPVMGraphIntrinsic(I) && - "HPVM graph intrinsic within a leaf dataflow node!"); + // Leaf nodes should not contain VISC graph intrinsics or launch + assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); - if (BuildDFG::isHPVMIntrinsic(I)) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); - IntrinsicInst *ArgII; - DFNode *ArgDFNode; + if (BuildDFG::isViscIntrinsic(I)) { + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + IntrinsicInst* ArgII; + DFNode* ArgDFNode; - /************************ Handle HPVM Query intrinsics - * ************************/ + /************************ Handle VISC Query intrinsics ************************/ switch (II->getIntrinsicID()) { - /**************************** llvm.hpvm.getNode() - * *****************************/ - case Intrinsic::hpvm_getNode: { + /**************************** llvm.visc.getNode() *****************************/ + case Intrinsic::visc_getNode: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n"); // add mapping <intrinsic, this node> to the node-specific map Leaf_HandleToDFNodeMap[II] = N; IItoRemove.push_back(II); - } break; - /************************* llvm.hpvm.getParentNode() - * **************************/ - case 
Intrinsic::hpvm_getParentNode: { + } + break; + /************************* llvm.visc.getParentNode() **************************/ + case Intrinsic::visc_getParentNode: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n"); // get the parent node of the arg node // get argument node @@ -1206,10 +1200,10 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); IItoRemove.push_back(II); - } break; - /*************************** llvm.hpvm.getNumDims() - * ***************************/ - case Intrinsic::hpvm_getNumDims: { + } + break; + /*************************** llvm.visc.getNumDims() ***************************/ + case Intrinsic::visc_getNumDims: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n"); // get node from map // get the appropriate field @@ -1217,48 +1211,47 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; int numOfDim = ArgDFNode->getNumOfDim(); DEBUG(errs() << "\t Got node dimension : " << numOfDim << "\n"); - IntegerType *IntTy = Type::getInt32Ty(KernelM->getContext()); - ConstantInt *numOfDimConstant = - ConstantInt::getSigned(IntTy, (int64_t)numOfDim); + IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext()); + ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); // Replace the result of the intrinsic with the computed value II->replaceAllUsesWith(numOfDimConstant); IItoRemove.push_back(II); - } break; - /*********************** llvm.hpvm.getNodeInstanceID() - * ************************/ - case Intrinsic::hpvm_getNodeInstanceID_x: - case Intrinsic::hpvm_getNodeInstanceID_y: - case Intrinsic::hpvm_getNodeInstanceID_z: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" - << "\t: " << *II << "\n"); + } + break; + /*********************** llvm.visc.getNodeInstanceID() ************************/ + case Intrinsic::visc_getNodeInstanceID_x: + case Intrinsic::visc_getNodeInstanceID_y: + 
case Intrinsic::visc_getNodeInstanceID_z: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; assert(ArgDFNode && "Arg node is NULL"); // A leaf node always has a parent - DFNode *ParentDFNode = ArgDFNode->getParent(); + DFNode* ParentDFNode = ArgDFNode->getParent(); assert(ParentDFNode && "Parent node of a leaf is NULL"); // Get the number associated with the required dimension // FIXME: The order is important! // These three intrinsics need to be consecutive x,y,z - uint64_t dim = - II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x; + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNodeInstanceID_x; assert((dim < 3) && "Invalid dimension argument"); DEBUG(errs() << "\t dimension = " << dim << "\n"); // Argument of the function to be called - ConstantInt *DimConstant = - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); - // ArrayRef<Value *> Args(DimConstant); + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + //ArrayRef<Value *> Args(DimConstant); // The following is to find which function to call - Function *OpenCLFunction; + Function * OpenCLFunction; - FunctionType *FT = - FunctionType::get(Type::getInt64Ty(KernelM->getContext()), - Type::getInt32Ty(KernelM->getContext()), false); + FunctionType* FT = + FunctionType::get(Type::getInt64Ty(KernelM->getContext()), + Type::getInt32Ty(KernelM->getContext()), + false); if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) { // We only have one level in the hierarchy or the parent node is not // replicated. 
This indicates that the parent node is the kernel @@ -1267,867 +1260,838 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // itself DEBUG(errs() << "Substitute with get_global_id()\n"); DEBUG(errs() << *II << "\n"); - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)).getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { - // DEBUG(errs() << "Here inside cond 2\n"); + //DEBUG(errs() << "Here inside cond 2\n"); // We are asking for this node's id with respect to its parent // this is a local id call - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)) - .getCallee()); - // DEBUG(errs() << "exiting condition 2\n"); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)).getCallee()); + //DEBUG(errs() << "exiting condition 2\n"); } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { // We are asking for this node's parent's id with respect to its // parent: this is a group id call - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)).getCallee()); } else { - DEBUG(errs() << N->getFuncPointer()->getName() << "\n"); - DEBUG(errs() << N->getParent()->getFuncPointer()->getName() << "\n"); - DEBUG(errs() << *II << "\n"); + errs() << N->getFuncPointer()->getName() << "\n"; + errs() << N->getParent()->getFuncPointer()->getName() << "\n"; + errs() << *II << "\n"; assert(false && "Unable to translate getNodeInstanceID intrinsic"); } - // DEBUG(errs() << "Create call instruction, insert it before the - // instrinsic\n"); DEBUG(errs() << "Function: " << *OpenCLFunction << - // "\n"); DEBUG(errs() << "Arguments size: " << Args.size() << "\n"); - // DEBUG(errs() 
<< "Argument: " << Args[0] << "\n"); - // DEBUG(errs() << "Arguments: " << *DimConstant << "\n"); + //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n"); + //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n"); + //DEBUG(errs() << "Arguments size: " << Args.size() << "\n"); + //DEBUG(errs() << "Argument: " << Args[0] << "\n"); + //DEBUG(errs() << "Arguments: " << *DimConstant << "\n"); // Create call instruction, insert it before the intrinsic and // replace the uses of the previous instruction with the new one - CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); - // DEBUG(errs() << "Replace uses\n"); + CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + //DEBUG(errs() << "Replace uses\n"); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } break; - /********************** llvm.hpvm.getNumNodeInstances() - * ***********************/ - case Intrinsic::hpvm_getNumNodeInstances_x: - case Intrinsic::hpvm_getNumNodeInstances_y: - case Intrinsic::hpvm_getNumNodeInstances_z: { + } + break; + /********************** llvm.visc.getNumNodeInstances() ***********************/ + case Intrinsic::visc_getNumNodeInstances_x: + case Intrinsic::visc_getNumNodeInstances_y: + case Intrinsic::visc_getNumNodeInstances_z: { // TODO: think about whether this is the best way to go there are hw // specific registers. therefore it is good to have the intrinsic but // then, why do we need to keep that info in the graph? 
(only for the // kernel configuration during the call) - DEBUG(errs() << F_nvptx->getName() - << "\t: Handling getNumNodeInstances\n"); + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; // A leaf node always has a parent - DFNode *ParentDFNode = ArgDFNode->getParent(); + DFNode* ParentDFNode = ArgDFNode->getParent(); assert(ParentDFNode && "Parent node of a leaf is NULL"); // Get the number associated with the required dimension // FIXME: The order is important! // These three intrinsics need to be consecutive x,y,z - uint64_t dim = - II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x; + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNumNodeInstances_x; assert((dim < 3) && "Invalid dimension argument"); DEBUG(errs() << "\t dimension = " << dim << "\n"); // Argument of the function to be called - ConstantInt *DimConstant = - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); - // ArrayRef<Value *> Args(DimConstant); + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + //ArrayRef<Value *> Args(DimConstant); // The following is to find which function to call - Function *OpenCLFunction; - FunctionType *FT = + Function * OpenCLFunction; + FunctionType* FT = FunctionType::get(Type::getInt64Ty(KernelM->getContext()), - Type::getInt32Ty(KernelM->getContext()), false); + Type::getInt32Ty(KernelM->getContext()), + false); if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) { // We only have one level in the hierarchy or the parent node is not // replicated. 
This indicates that the parent node is the kernel // launch, so the instances are global_size (gridDim x blockDim) - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)).getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { // We are asking for this node's instances // this is a local size (block dim) call - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)).getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { // We are asking for this node's parent's instances // this is a (global_size/local_size) (grid dim) call - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)).getCallee()); } else { assert(false && "Unable to translate getNumNodeInstances intrinsic"); } // Create call instruction, insert it before the intrinsic and // replace the uses of the previous instruction with the new one - CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } break; - case Intrinsic::hpvm_barrier: { + } + break; + case Intrinsic::visc_barrier: + { DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n"); DEBUG(errs() << "Substitute with barrier()\n"); DEBUG(errs() << *II << "\n"); - FunctionType *FT = FunctionType::get( - Type::getVoidTy(KernelM->getContext()), - std::vector<Type *>(1, Type::getInt32Ty(KernelM->getContext())), - false); - Function *OpenCLFunction = cast<Function>( - 
(KernelM->getOrInsertFunction(StringRef("barrier"), FT)) - .getCallee()); - CallInst *CI = - CallInst::Create(OpenCLFunction, - ArrayRef<Value *>(ConstantInt::get( - Type::getInt32Ty(KernelM->getContext()), 1)), - "", II); + FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()), + std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())), + false); + Function* OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("barrier"), FT)).getCallee()); + CallInst* CI = CallInst::Create(OpenCLFunction, + ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)), + "", II); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } break; - case Intrinsic::hpvm_atomic_cmpxchg: - break; - case Intrinsic::hpvm_atomic_add: - case Intrinsic::hpvm_atomic_sub: - case Intrinsic::hpvm_atomic_xchg: - case Intrinsic::hpvm_atomic_min: - case Intrinsic::hpvm_atomic_max: - case Intrinsic::hpvm_atomic_and: - case Intrinsic::hpvm_atomic_or: - case Intrinsic::hpvm_atomic_xor: - // case Intrinsic::hpvm_atomic_inc: - // case Intrinsic::hpvm_atomic_dec: - { - DEBUG(errs() << *II << "\n"); - // Only have support for i32 atomic intrinsics - assert(II->getType() == Type::getInt32Ty(II->getContext()) && - "Only support i32 atomic intrinsics for now"); - // Substitute with atomicrmw instruction - assert(II->getNumArgOperands() == 2 && - "Expecting 2 operands for these atomics"); - Value *Ptr = II->getArgOperand(0); - Value *Val = II->getArgOperand(1); - assert(Ptr->getType()->isPointerTy() && - "First argument of supported atomics is expected to be a " - "pointer"); - PointerType *PtrTy = cast<PointerType>(Ptr->getType()); - PointerType *TargetTy = - Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()); - if (PtrTy != TargetTy) { - Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II); - PtrTy = TargetTy; - } - - std::string name; - if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_add) - name = "atomic_add"; - else 
if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_sub) - name = "atomic_sub"; - else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xchg) - name = "atomic_xchg"; - else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_min) - name = "atomic_min"; - else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_max) - name = "atomic_max"; - else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_and) - name = "atomic_and"; - else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_or) - name = "atomic_or"; - else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xor) - name = "atomic_xor"; - Type *paramTypes[] = {PtrTy, Val->getType()}; - FunctionType *AtomFuncT = FunctionType::get( - II->getType(), ArrayRef<Type *>(paramTypes, 2), false); - FunctionCallee AtomFunc = - KernelM->getOrInsertFunction(name, AtomFuncT); - - Value *Params[] = {Ptr, Val}; - CallInst *AtomCI = CallInst::Create( - AtomFunc, ArrayRef<Value *>(Params, 2), II->getName(), II); - DEBUG(errs() << "Substitute with: " << *AtomCI << "\n"); - II->replaceAllUsesWith(AtomCI); - IItoRemove.push_back(II); - } - break; - default: - llvm_unreachable("Unknown HPVM Intrinsic!"); - break; - } - - } else if (MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) { - IRBuilder<> Builder(I); - Value *Source = MemCpyI->getSource(); - Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts(); - Value *Length = MemCpyI->getOperand(2); - DEBUG(errs() << "Found memcpy instruction: " << *I << "\n"); - DEBUG(errs() << "Source: " << *Source << "\n"); - DEBUG(errs() << "Destination: " << *Destination << "\n"); - DEBUG(errs() << "Length: " << *Length << "\n"); - - size_t memcpy_length; - unsigned int memcpy_count; - if (ConstantInt *CI = dyn_cast<ConstantInt>(Length)) { - if (CI->getBitWidth() <= 64) { - memcpy_length = CI->getSExtValue(); - DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n"); - Type *Source_Type = Source->getType()->getPointerElementType(); - DEBUG(errs() << "Source Type : " << *Source_Type << 
"\n"); - memcpy_count = - memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8); - DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n"); - if (GetElementPtrInst *sourceGEPI = - dyn_cast<GetElementPtrInst>(Source)) { - if (GetElementPtrInst *destGEPI = - dyn_cast<GetElementPtrInst>(Destination)) { - Value *SourcePtrOperand = sourceGEPI->getPointerOperand(); - Value *DestPtrOperand = destGEPI->getPointerOperand(); - for (int i = 0; i < memcpy_count; ++i) { - Constant *increment; - LoadInst *newLoadI; - StoreInst *newStoreI; - // First, need to increment the correct index for both source - // and dest This invluves checking to see how many indeces the - // GEP has Assume for now only 1 or 2 are the viable options. - - std::vector<Value *> GEPlIndex; - if (sourceGEPI->getNumIndices() == 1) { - Value *Index = sourceGEPI->getOperand(1); - increment = ConstantInt::get(Index->getType(), i, false); - Value *incAdd = Builder.CreateAdd(Index, increment); - DEBUG(errs() << "Add: " << *incAdd << "\n"); - GEPlIndex.push_back(incAdd); - Value *newGEPIl = Builder.CreateGEP( - SourcePtrOperand, ArrayRef<Value *>(GEPlIndex)); - DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n"); - newLoadI = Builder.CreateLoad(newGEPIl); - DEBUG(errs() << "Load: " << *newLoadI << "\n"); - } else { - llvm_unreachable("Unhandled case where source GEPI has more " - "than 1 indices!\n"); - } - - std::vector<Value *> GEPsIndex; - if (destGEPI->getNumIndices() == 1) { - - } else if (destGEPI->getNumIndices() == 2) { - Value *Index0 = destGEPI->getOperand(1); - GEPsIndex.push_back(Index0); - Value *Index1 = destGEPI->getOperand(2); - increment = ConstantInt::get(Index1->getType(), i, false); - Value *incAdd = Builder.CreateAdd(Index1, increment); - DEBUG(errs() << "Add: " << *incAdd << "\n"); - GEPsIndex.push_back(incAdd); - Value *newGEPIs = Builder.CreateGEP( - DestPtrOperand, ArrayRef<Value *>(GEPsIndex)); - DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n"); - newStoreI = 
Builder.CreateStore(newLoadI, newGEPIs, - MemCpyI->isVolatile()); - DEBUG(errs() << "Store: " << *newStoreI << "\n"); - } else { - llvm_unreachable("Unhandled case where dest GEPI has more " - "than 2 indices!\n"); - } - } - IItoRemove.push_back(sourceGEPI); - IItoRemove.push_back(destGEPI); - Instruction *destBitcastI = - dyn_cast<Instruction>(MemCpyI->getArgOperand(0)); - Instruction *sourceBitcastI = - dyn_cast<Instruction>(MemCpyI->getArgOperand(1)); - IItoRemove.push_back(destBitcastI); - IItoRemove.push_back(sourceBitcastI); - IItoRemove.push_back(MemCpyI); - } - } - } - } else { - llvm_unreachable("MEMCPY length is not a constant, not handled!\n"); } - // llvm_unreachable("HERE!"); - } - - else if (CallInst *CI = dyn_cast<CallInst>(I)) { - DEBUG(errs() << "Found a call: " << *CI << "\n"); - Function *calleeF = - cast<Function>(CI->getCalledValue()->stripPointerCasts()); - if (calleeF->isDeclaration()) { - // Add the declaration to kernel module - if (calleeF->getName() == "sqrtf") { - calleeF->setName(Twine("sqrt")); - DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); - DEBUG(errs() << "CI: " << *CI << "\n"); - } else if (calleeF->getName() == "rsqrtf") { - calleeF->setName(Twine("rsqrt")); - DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); - DEBUG(errs() << "CI: " << *CI << "\n"); - } - DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF - << "\n"); - KernelM->getOrInsertFunction(calleeF->getName(), - calleeF->getFunctionType()); - } else { - // Check if the called function has already been cloned before. - Function *NewFunc = CloneAndReplaceCall(CI, calleeF); - // Iterate over the new function to see if it calls any other functions - // in the module. 
- for (inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); - i != e; ++i) { - if (auto *Call = dyn_cast<CallInst>(&*i)) { - Function *CalledFunc = - cast<Function>(Call->getCalledValue()->stripPointerCasts()); - CloneAndReplaceCall(Call, CalledFunc); - } - } - } - // TODO: how to handle address space qualifiers in load/store - } - } - // search for pattern where float is being casted to int and loaded/stored and - // change it. - DEBUG(errs() << "finding pattern for replacement!\n"); - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; - ++i) { - bool cont = false; - bool keepGEPI = false; - bool keepGEPI2 = false; - Instruction *I = &(*i); - GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I); - - if (!GEPI) { - // did nod find pattern start, continue - continue; - } - // may have found pattern, check - DEBUG(errs() << "GEPI " << *GEPI << "\n"); - // print whatever we want for debug - Value *PtrOp = GEPI->getPointerOperand(); - Type *SrcTy = GEPI->getSourceElementType(); - unsigned GEPIaddrspace = GEPI->getAddressSpace(); - - if (SrcTy->isArrayTy()) - DEBUG(errs() << *SrcTy << " is an array type! 
" - << *(SrcTy->getArrayElementType()) << "\n"); - else - DEBUG(errs() << *SrcTy << " is not an array type!\n"); - // check that source element type is float - if (SrcTy->isArrayTy()) { - if (!(SrcTy->getArrayElementType()->isFloatTy())) { - DEBUG(errs() << "GEPI type is array but not float!\n"); - continue; - } - } else if (!(SrcTy->isFPOrFPVectorTy() /*isFloatTy()*/)) { - DEBUG(errs() << "GEPI type is " << *SrcTy << "\n"); - // does not fit this pattern - no float GEP instruction - continue; - } - // check that addressspace is 1 - // if (GEPIaddrspace != 1) { - // // does not fit this pattern - addrspace of pointer - // argument is not global continue; - // } - if (!(GEPI->hasOneUse())) { - // does not fit this pattern - more than one uses - // continue; - // Keep GEPI around if it has other uses - keepGEPI = true; - } - DEBUG(errs() << "Found GEPI " << *GEPI << "\n"); - - // 1st GEPI it has one use - // assert(GEPI->hasOneUse() && "GEPI has a single use"); - - // See if it is a bitcast - BitCastInst *BitCastI; - for (User *U : GEPI->users()) { - if (Instruction *ui = dyn_cast<Instruction>(U)) { - DEBUG(errs() << "--" << *ui << "\n"); - if (isa<BitCastInst>(ui)) { - BitCastI = dyn_cast<BitCastInst>(ui); - DEBUG(errs() << "---Found bitcast as only use of GEP\n"); - break; - } - } - DEBUG(errs() << "GEPI does not have a bitcast user, continue\n"); - cont = true; - } - // for (Value::user_iterator ui = GEPI->user_begin(), - // ue = GEPI->user_end(); ui!=ue; ++ui) { - // DEBUG(errs() << "--" << *ui << "\n"); - // if (isa<BitCastInst>(*ui)) { - // BitCastI = dyn_cast<BitCastInst>(*ui); - // DEBUG(errs() << "Found bitcast as only use of GEP\n"); - // } - // } - - if (cont /*!BitCastI*/) { - continue; // not in pattern - } - - // DEBUG(errs() << *BitCastI << "\n"); - // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand - // has to be the GEP, since this is a use of the GEP. 
- Value *Op2 = BitCastI->getOperand(0); - DEBUG(errs() << "----" << *Op2 << "\n"); - // assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n"); - // Type *OpTy = cast<Type>(Op2); - Type *OpTy = BitCastI->getDestTy(); - DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n"); - // DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << - // "\n"); - if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) { - // maybe right syntax is (Type::getInt32Ty)->getPointerTo() - continue; // not in pattern - } - - DEBUG(errs() << "----Here!\n"); - // We are in GEP, bitcast. - - // user_iterator, to find the load. - - if (!(BitCastI->hasOneUse())) { - // does not fit this pattern - more than one uses - continue; - } - DEBUG(errs() << "----Bitcast has one use!\n"); - // it has one use - assert(BitCastI->hasOneUse() && "BitCastI has a single use"); - LoadInst *LoadI; - for (User *U : BitCastI->users()) { - if (Instruction *ui = dyn_cast<Instruction>(U)) { - DEBUG(errs() << "-----" << *ui << "\n"); - if (isa<LoadInst>(ui)) { - LoadI = dyn_cast<LoadInst>(ui); - DEBUG(errs() << "-----Found load as only use of bitcast\n"); - break; + break; + case Intrinsic::visc_atomic_add: + case Intrinsic::visc_atomic_sub: + case Intrinsic::visc_atomic_xchg: + case Intrinsic::visc_atomic_min: + case Intrinsic::visc_atomic_max: + case Intrinsic::visc_atomic_and: + case Intrinsic::visc_atomic_or: + case Intrinsic::visc_atomic_xor: + { + DEBUG(errs() << *II << "\n"); + // Only have support for i32 atomic intrinsics + assert(II->getType() == Type::getInt32Ty(II->getContext()) + && "Only support i32 atomic intrinsics for now"); + // Substitute with atomicrmw instruction + assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics"); + Value* Ptr = II->getArgOperand(0); + Value* Val = II->getArgOperand(1); + assert(Ptr->getType()->isPointerTy() + && "First argument of supported atomics is expected to be a pointer"); + PointerType* PtrTy = 
cast<PointerType>(Ptr->getType()); + PointerType* TargetTy = Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()); + if (PtrTy != TargetTy) { + Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II); + PtrTy = TargetTy; } - } - DEBUG(errs() << "Bitcast does not have a load user, continue!\n"); - cont = true; - } - // for (Value::user_iterator ui = BitCastI->user_begin(), - // ue = BitCastI->user_end(); ui!=ue; ++ui) { - // if (isa<LoadInst>(*ui)) { - // LoadI = dyn_cast<LoadInst>(*ui); - // errs() << "Found load as only use of bitcast\n"; - // } - // } - - if (cont) { - continue; // not in pattern - } - DEBUG("HERE!\n"); - // check that we load from pointer we got from bitcast - assert - the unique - // argument must be the use we found it from - assert(LoadI->getPointerOperand() == BitCastI && - "Unexpected Load Instruction Operand\n"); - - // Copy user_iterator, to find the store. - - if (!(LoadI->hasOneUse())) { - // does not fit this pattern - more than one uses - continue; - // TODO: generalize: one load can have more than one store users - } - - // it has one use - assert(LoadI->hasOneUse() && "LoadI has a single use"); - Value::user_iterator ui = LoadI->user_begin(); - // skipped loop, because is has a single use - StoreInst *StoreI = dyn_cast<StoreInst>(*ui); - if (!StoreI) { - continue; // not in pattern - } - - // Also check that the store uses the loaded value as the value operand - if (StoreI->getValueOperand() != LoadI) { - continue; - } - - DEBUG(errs() << "-------Found store instruction\n"); - - // Look for its bitcast, which is its pointer operand - Value *StPtrOp = StoreI->getPointerOperand(); - DEBUG(errs() << "-------" << *StPtrOp << "\n"); - BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp); - DEBUG(errs() << "-------" << *BitCastI2 << "\n"); - if (!BitCastI2) { - continue; // not in pattern - } - - DEBUG(errs() << "-------- Found Bit Cast of store!\n"); - // found bitcast. Look for the second GEP, its from operand. 
- Value *BCFromOp = BitCastI2->getOperand(0); - GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp); - DEBUG(errs() << "---------- " << *GEPI2 << "\n"); - if (!GEPI2) { - continue; // not in pattern - } - - if (!(GEPI2->hasOneUse())) { - // does not fit this pattern - more than one uses - // continue; - // Keep GEPI around if it has other uses - keepGEPI2 = true; - } - DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); - - Value *PtrOp2 = GEPI2->getPointerOperand(); - - // Found GEPI2. TODO: kind of confused as o what checks I need to add here, - // let's add them together- all the code for int-float type checks is - // already above. - - // Assume we found pattern - if (!keepGEPI) { - IItoRemove.push_back(GEPI); - DEBUG(errs() << "Pushing " << *GEPI << " for removal\n"); - } else { - DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n"); - } - IItoRemove.push_back(BitCastI); - DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n"); - IItoRemove.push_back(LoadI); - DEBUG(errs() << "Pushing " << *LoadI << " for removal\n"); - IItoRemove.push_back(GEPI2); - DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n"); - IItoRemove.push_back(BitCastI2); - DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n"); - if (!keepGEPI2) { - IItoRemove.push_back(StoreI); - DEBUG(errs() << "Pushing " << *StoreI << " for removal\n"); - } else { - - DEBUG(errs() << "Keeping " << *StoreI - << " since it has multiple uses!\n"); - } - - std::vector<Value *> GEPlIndex; - if (GEPI->hasIndices()) { - for (auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) { - Value *Index = dyn_cast<Value>(&*ii); - DEBUG(errs() << "GEP-1 Index: " << *Index << "\n"); - GEPlIndex.push_back(Index); - } - } - // ArrayRef<Value*> GEPlArrayRef(GEPlIndex); - - std::vector<Value *> GEPsIndex; - if (GEPI2->hasIndices()) { - for (auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) { - Value *Index = dyn_cast<Value>(&*ii); - DEBUG(errs() << "GEP-2 
Index: " << *Index << "\n"); - GEPsIndex.push_back(Index); - } - } - // ArrayRef<Value*> GEPsArrayRef(GEPlIndex); - - // ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end()); - GetElementPtrInst *newlGEP = GetElementPtrInst::Create( - GEPI->getSourceElementType(), // Type::getFloatTy(M.getContext()), - PtrOp, // operand from 1st GEP - ArrayRef<Value *>(GEPlIndex), Twine(), StoreI); - DEBUG(errs() << "Adding: " << *newlGEP << "\n"); - // insert load before GEPI - LoadInst *newLoadI = - new LoadInst(Type::getFloatTy(M.getContext()), - newlGEP, // new GEP - Twine(), LoadI->isVolatile(), LoadI->getAlignment(), - LoadI->getOrdering(), LoadI->getSyncScopeID(), StoreI); - DEBUG(errs() << "Adding: " << *newLoadI << "\n"); - // same for GEP for store, for store operand - GetElementPtrInst *newsGEP = GetElementPtrInst::Create( - GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()), - PtrOp2, // operand from 2nd GEP - ArrayRef<Value *>(GEPsIndex), Twine(), StoreI); - DEBUG(errs() << "Adding: " << *newsGEP << "\n"); - // insert store before GEPI - StoreInst *newStoreI = - new StoreInst(newLoadI, - newsGEP, // new GEP - StoreI->isVolatile(), StoreI->getAlignment(), - StoreI->getOrdering(), StoreI->getSyncScopeID(), StoreI); - DEBUG(errs() << "Adding: " << *newStoreI << "\n"); - } + std::string name; + if(II->getIntrinsicID() == Intrinsic::visc_atomic_add) + name = "atomic_add"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub) + name = "atomic_sub"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg) + name = "atomic_xchg"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min) + name = "atomic_min"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max) + name = "atomic_max"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and) + name = "atomic_and"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or) + name = "atomic_or"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor) + name = "atomic_xor"; + 
Type* paramTypes[] = {PtrTy, Val->getType()}; + FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false); + FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT); + + Value* Params[] = {Ptr, Val}; + CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II); + DEBUG(errs() << "Substitute with: " << *AtomCI << "\n"); + II->replaceAllUsesWith(AtomCI); + IItoRemove.push_back(II); + } + break; + default: + llvm_unreachable("Unknown VISC Intrinsic!"); + break; + } + + } + else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) { + IRBuilder<> Builder(I); + Value *Source = MemCpyI->getSource(); + Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts(); + Value *Length = MemCpyI->getOperand(2); + DEBUG(errs() << "Found memcpy instruction: " << *I << "\n"); + DEBUG(errs() << "Source: " << *Source << "\n"); + DEBUG(errs() << "Destination: " << *Destination << "\n"); + DEBUG(errs() << "Length: " << *Length << "\n"); + + size_t memcpy_length; + unsigned int memcpy_count; + if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) { + if (CI->getBitWidth() <= 64) { + memcpy_length = CI->getSExtValue(); + DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n"); + Type *Source_Type = Source->getType()->getPointerElementType(); + DEBUG(errs() << "Source Type : " << *Source_Type << "\n"); + memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8); + DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n"); + if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) { + if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) { + Value *SourcePtrOperand = sourceGEPI->getPointerOperand(); + Value *DestPtrOperand = destGEPI->getPointerOperand(); + for(int i = 0; i < memcpy_count; ++i) { + Constant *increment; + LoadInst *newLoadI; + StoreInst *newStoreI; + // First, need to increment the correct index for both source and dest + 
// This invluves checking to see how many indeces the GEP has + // Assume for now only 1 or 2 are the viable options. + + std::vector<Value*> GEPlIndex; + if (sourceGEPI->getNumIndices() == 1) { + Value *Index = sourceGEPI->getOperand(1); + increment = ConstantInt::get(Index->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPlIndex.push_back(incAdd); + Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex)); + DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n"); + newLoadI = Builder.CreateLoad(newGEPIl); + DEBUG(errs() << "Load: " << *newLoadI << "\n"); + } else { + llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n"); + } + + + std::vector<Value*> GEPsIndex; + if (destGEPI->getNumIndices() == 1) { + + } else if (destGEPI->getNumIndices() == 2) { + Value *Index0 = destGEPI->getOperand(1); + GEPsIndex.push_back(Index0); + Value *Index1 = destGEPI->getOperand(2); + increment = ConstantInt::get(Index1->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index1, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPsIndex.push_back(incAdd); + Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex)); + DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n"); + newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile()); + DEBUG(errs() << "Store: " << *newStoreI << "\n"); + } else { + llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n"); + } + } + IItoRemove.push_back(sourceGEPI); + IItoRemove.push_back(destGEPI); + Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0)); + Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1)); + IItoRemove.push_back(destBitcastI); + IItoRemove.push_back(sourceBitcastI); + IItoRemove.push_back(MemCpyI); + } + } + + } + } else { + llvm_unreachable("MEMCPY length is not a constant, not 
handled!\n"); + } + // llvm_unreachable("HERE!"); + } + + else if(CallInst* CI = dyn_cast<CallInst>(I)) { + DEBUG(errs() << "Found a call: " << *CI << "\n"); + Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts()); + if(calleeF->isDeclaration()) { + // Add the declaration to kernel module + if (calleeF->getName() == "sqrtf") { + calleeF->setName(Twine("sqrt")); + DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); + DEBUG(errs() << "CI: " << *CI << "\n"); + } else if (calleeF->getName() == "rsqrtf") { + calleeF->setName(Twine("rsqrt")); + DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); + DEBUG(errs() << "CI: " << *CI << "\n"); + } + DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); + KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); + } + else { + // Check if the called function has already been cloned before. + Function *NewFunc = CloneAndReplaceCall(CI, calleeF); + // Iterate over the new function to see if it calls any other functions + // in the module. + for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) { + if(auto *Call = dyn_cast<CallInst>(&*i)) { + Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts()); + CloneAndReplaceCall(Call, CalledFunc); + } + } + } + //TODO: how to handle address space qualifiers in load/store + } + + } + // search for pattern where float is being casted to int and loaded/stored and change it. 
+ DEBUG(errs() << "finding pattern for replacement!\n"); + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { + bool cont = false; + bool keepGEPI = false; + bool keepGEPI2= false; + Instruction *I = &(*i); + GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I); + + if (!GEPI) { + // did nod find pattern start, continue + continue; + } + // may have found pattern, check + DEBUG(errs() << "GEPI " << *GEPI << "\n"); + // print whatever we want for debug + Value* PtrOp = GEPI->getPointerOperand(); + Type *SrcTy = GEPI->getSourceElementType(); + unsigned GEPIaddrspace = GEPI->getAddressSpace(); + + if (SrcTy->isArrayTy()) + DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n"); + else + DEBUG(errs() << *SrcTy << " is not an array type!\n"); + // check that source element type is float + if (SrcTy->isArrayTy()) { + if (!(SrcTy->getArrayElementType()->isFloatTy())) { + DEBUG(errs() << "GEPI type is array but not float!\n"); + continue; + } + } + else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) { + DEBUG(errs() << "GEPI type is " << *SrcTy << "\n"); + // does not fit this pattern - no float GEP instruction + continue; + } + // check that addressspace is 1 + // if (GEPIaddrspace != 1) { + // // does not fit this pattern - addrspace of pointer argument is not global + // continue; + // } + if (!(GEPI->hasOneUse())) { + // does not fit this pattern - more than one uses + //continue; + // Keep GEPI around if it has other uses + keepGEPI = true; + } + DEBUG(errs() << "Found GEPI " << *GEPI << "\n"); + + // 1st GEPI it has one use + // assert(GEPI->hasOneUse() && "GEPI has a single use"); + + // See if it is a bitcast + BitCastInst *BitCastI; + for (User * U : GEPI->users()) { + if(Instruction *ui = dyn_cast<Instruction> (U)) { + DEBUG(errs() << "--" << *ui << "\n"); + if (isa<BitCastInst>(ui)) { + BitCastI = dyn_cast<BitCastInst>(ui); + DEBUG(errs() << "---Found bitcast as only use of GEP\n"); + 
break; + } + } + DEBUG(errs() << "GEPI does not have a bitcast user, continue\n"); + cont = true; + } + // for (Value::user_iterator ui = GEPI->user_begin(), + // ue = GEPI->user_end(); ui!=ue; ++ui) { + // DEBUG(errs() << "--" << *ui << "\n"); + // if (isa<BitCastInst>(*ui)) { + // BitCastI = dyn_cast<BitCastInst>(*ui); + // DEBUG(errs() << "Found bitcast as only use of GEP\n"); + // } + // } + + if (cont/*!BitCastI*/) { + continue; // not in pattern + } + + // DEBUG(errs() << *BitCastI << "\n"); + // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP. + Value *Op2 = BitCastI->getOperand(0); + DEBUG(errs() << "----" << *Op2 << "\n"); + // assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n"); + // Type *OpTy = cast<Type>(Op2); + Type *OpTy = BitCastI->getDestTy(); + DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n"); + // DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n"); + if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) { + // maybe right syntax is (Type::getInt32Ty)->getPointerTo() + continue; // not in pattern + } + + DEBUG(errs() << "----Here!\n"); + // We are in GEP, bitcast. + + // user_iterator, to find the load. 
+ + if (!(BitCastI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + } + DEBUG(errs() << "----Bitcast has one use!\n"); + // it has one use + assert(BitCastI->hasOneUse() && "BitCastI has a single use"); + LoadInst *LoadI; + for (User * U : BitCastI->users()) { + if (Instruction *ui = dyn_cast<Instruction> (U)) { + DEBUG(errs() << "-----" << *ui << "\n"); + if (isa<LoadInst>(ui)) { + LoadI = dyn_cast<LoadInst>(ui); + DEBUG(errs() << "-----Found load as only use of bitcast\n"); + break; + } + } + DEBUG(errs() << "Bitcast does not have a load user, continue!\n"); + cont = true; + } + // for (Value::user_iterator ui = BitCastI->user_begin(), + // ue = BitCastI->user_end(); ui!=ue; ++ui) { + // if (isa<LoadInst>(*ui)) { + // LoadI = dyn_cast<LoadInst>(*ui); + // errs() << "Found load as only use of bitcast\n"; + // } + // } + + if (cont) { + continue; // not in pattern + } + + DEBUG("HERE!\n"); + // check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from + assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n"); + + // Copy user_iterator, to find the store. 
+ + if (!(LoadI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + // TODO: generalize: one load can have more than one store users + } + + // it has one use + assert(LoadI->hasOneUse() && "LoadI has a single use"); + Value::user_iterator ui = LoadI->user_begin(); + // skipped loop, because is has a single use + StoreInst *StoreI = dyn_cast<StoreInst>(*ui); + if (!StoreI) { + continue; // not in pattern + } + + // Also check that the store uses the loaded value as the value operand + if (StoreI->getValueOperand() != LoadI) { + continue; + } + + DEBUG(errs() << "-------Found store instruction\n"); + + // Look for its bitcast, which is its pointer operand + Value *StPtrOp = StoreI->getPointerOperand(); + DEBUG(errs() << "-------" << *StPtrOp << "\n"); + BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp); + DEBUG(errs() << "-------" << *BitCastI2 << "\n"); + if (!BitCastI2) { + continue; //not in pattern + } + + DEBUG(errs() << "-------- Found Bit Cast of store!\n" ); + // found bitcast. Look for the second GEP, its from operand. + Value *BCFromOp = BitCastI2->getOperand(0); + GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp); + DEBUG(errs() << "---------- " << *GEPI2 << "\n"); + if (!GEPI2) { + continue; //not in pattern + } + + if (!(GEPI2->hasOneUse())) { + // does not fit this pattern - more than one uses + //continue; + // Keep GEPI around if it has other uses + keepGEPI2 = true; + } + DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); + + Value *PtrOp2 = GEPI2->getPointerOperand(); + + // Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above. 
+ + // Assume we found pattern + if (!keepGEPI) { + IItoRemove.push_back(GEPI); + DEBUG(errs() << "Pushing " << *GEPI << " for removal\n"); + } else { + DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n"); + } + IItoRemove.push_back(BitCastI); + DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n"); + IItoRemove.push_back(LoadI); + DEBUG(errs() << "Pushing " << *LoadI << " for removal\n"); + IItoRemove.push_back(GEPI2); + DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n"); + IItoRemove.push_back(BitCastI2); + DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n"); + if (!keepGEPI2) { + IItoRemove.push_back(StoreI); + DEBUG(errs() << "Pushing " << *StoreI << " for removal\n"); + } else { + + DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n"); + } + + std::vector<Value*> GEPlIndex; + if (GEPI->hasIndices()) { + for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-1 Index: " << *Index << "\n"); + GEPlIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPlArrayRef(GEPlIndex); + + std::vector<Value*> GEPsIndex; + if (GEPI2->hasIndices()) { + for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-2 Index: " << *Index << "\n"); + GEPsIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPsArrayRef(GEPlIndex); + + + + // ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end()); + GetElementPtrInst* newlGEP = + GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()), + PtrOp, // operand from 1st GEP + ArrayRef<Value*>(GEPlIndex), + Twine(), + StoreI); + DEBUG(errs() << "Adding: " << *newlGEP << "\n"); + // insert load before GEPI + LoadInst *newLoadI = + new LoadInst(Type::getFloatTy(M.getContext()), + newlGEP, // new GEP + Twine(), + LoadI->isVolatile(), + LoadI->getAlignment(), + LoadI->getOrdering(), + 
LoadI->getSyncScopeID(), + StoreI); + DEBUG(errs() << "Adding: " << *newLoadI << "\n"); + // same for GEP for store, for store operand + GetElementPtrInst* newsGEP = + GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()), + PtrOp2, // operand from 2nd GEP + ArrayRef<Value*>(GEPsIndex), + Twine(), + StoreI); + DEBUG(errs() << "Adding: " << *newsGEP << "\n"); + // insert store before GEPI + StoreInst *newStoreI = + new StoreInst(newLoadI, + newsGEP, // new GEP + StoreI->isVolatile(), + StoreI->getAlignment(), + StoreI->getOrdering(), + StoreI->getSyncScopeID(), + StoreI); + DEBUG(errs() << "Adding: " << *newStoreI << "\n"); + + } + + // We need to do this explicitly: DCE pass will not remove them because we + // have assumed theworst memory behaviour for these function calls + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around + for (auto *I : reverse(IItoRemove)) { + DEBUG(errs() << "Erasing: " << *I << "\n"); + I->eraseFromParent(); + } + + // Removed the cloned functions from the parent module into the new module + for(auto *F : FuncToBeRemoved) { + F->removeFromParent(); //TODO: MARIA check + KernelM->getFunctionList().push_back(F); + } + + addCLMetadata(F_nvptx); + kernel->KernelFunction = F_nvptx; + errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; + DEBUG(errs() << *KernelM); + + return; +} - // We need to do this explicitly: DCE pass will not remove them because we - // have assumed theworst memory behaviour for these function calls - // Traverse the vector backwards, otherwise definitions are deleted while - // their subsequent uses are still around - for (auto *I : reverse(IItoRemove)) { - DEBUG(errs() << "Erasing: " << *I << "\n"); - I->eraseFromParent(); - } +bool DFG2LLVM_NVPTX::runOnModule(Module &M) { + errs() << "\nDFG2LLVM_NVPTX PASS\n"; - // Removed the cloned functions from the parent module into the new module - 
for (auto *F : FuncToBeRemoved) { - F->removeFromParent(); // TODO: MARIA check - KernelM->getFunctionList().push_back(F); - } + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); - addCLMetadata(F_nvptx); - kernel->KernelFunction = F_nvptx; - DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName() - << "\n"); - DEBUG(errs() << *KernelM); + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); + // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); - return; -} + // Visitor for Code Generation Graph Traversal + CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); -bool DFG2LLVM_NVPTX::runOnModule(Module &M) { - DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n"); - - // Get the BuildDFG Analysis Results: - // - Dataflow graph - // - Maps from i8* hansles to DFNode and DFEdge - BuildDFG &DFG = getAnalysis<BuildDFG>(); - - // DFInternalNode *Root = DFG.getRoot(); - std::vector<DFInternalNode *> Roots = DFG.getRoots(); - // BuildDFG::HandleToDFNode &HandleToDFNodeMap = - // DFG.getHandleToDFNodeMap(); BuildDFG::HandleToDFEdge &HandleToDFEdgeMap - // = DFG.getHandleToDFEdgeMap(); - - // Visitor for Code Generation Graph Traversal - CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); - - // Iterate over all the DFGs and produce code for each one of them - for (auto rootNode : Roots) { - // Initiate code generation for root DFNode - CGTVisitor->visit(rootNode); - } + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } - CGTVisitor->writeKernelsModule(); + CGTVisitor->writeKernelsModule(); - // TODO: Edit module epilogue to remove the HPVM intrinsic declarations - delete CGTVisitor; + //TODO: Edit module 
epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; - return true; + return true; } std::string CGT_NVPTX::getKernelsModuleName(Module &M) { - /*SmallString<128> currentDir; - llvm::sys::fs::current_path(currentDir); - std::string fileName = getFilenameFromModule(M); - Twine output = Twine(currentDir) + "/Output/" + fileName + ""; - return output.str().append(".kernels.ll");*/ - std::string mid = M.getModuleIdentifier(); - return mid.append(".kernels.ll"); + /*SmallString<128> currentDir; + llvm::sys::fs::current_path(currentDir); + std::string fileName = getFilenameFromModule(M); + Twine output = Twine(currentDir) + "/Output/" + fileName + ""; + return output.str().append(".kernels.ll");*/ + std::string mid = M.getModuleIdentifier(); + return mid.append(".kernels.ll"); } -void CGT_NVPTX::fixValueAddrspace(Value *V, unsigned addrspace) { - assert(isa<PointerType>(V->getType()) && "Value should be of Pointer Type!"); - PointerType *OldTy = cast<PointerType>(V->getType()); - PointerType *NewTy = PointerType::get(OldTy->getElementType(), addrspace); - V->mutateType(NewTy); - for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; - ui++) { - // Change all uses producing pointer type in same address space to new - // addressspace. - if (PointerType *PTy = dyn_cast<PointerType>((*ui)->getType())) { - if (PTy->getAddressSpace() == OldTy->getAddressSpace()) { - fixValueAddrspace(*ui, addrspace); - } - } - } +void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) { + assert(isa<PointerType>(V->getType()) + && "Value should be of Pointer Type!"); + PointerType* OldTy = cast<PointerType>(V->getType()); + PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); + V->mutateType(NewTy); + for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) { + // Change all uses producing pointer type in same address space to new + // addressspace. 
+ if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) { + if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { + fixValueAddrspace(*ui, addrspace); + } + } + } } -std::vector<unsigned> -CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned> *GlobalMemArgs, - Function *F) { - std::vector<unsigned> ConstantMemArgs; - for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; - ++ai) { - Argument *arg = &*ai; - std::vector<unsigned>::iterator pos = std::find( - GlobalMemArgs->begin(), GlobalMemArgs->end(), arg->getArgNo()); - // It has to be a global memory argument to be promotable - if (pos == GlobalMemArgs->end()) - continue; - - // Check if it can/should be promoted - if (canBePromoted(arg, F)) { - DEBUG(errs() << "Promoting << " << arg->getName() - << " to constant memory." - << "\n"); - ConstantMemArgs.push_back(arg->getArgNo()); - GlobalMemArgs->erase(pos); - } - } - return ConstantMemArgs; -} -Function *CGT_NVPTX::changeArgAddrspace(Function *F, - std::vector<unsigned> &Args, - unsigned addrspace) { - unsigned idx = 0; - std::vector<Type *> ArgTypes; - for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; - ++ai) { - Argument *arg = &*ai; - DEBUG(errs() << *arg << "\n"); - unsigned argno = arg->getArgNo(); - if ((idx < Args.size()) && (argno == Args[idx])) { - fixValueAddrspace(arg, addrspace); - idx++; - } - ArgTypes.push_back(arg->getType()); - } - FunctionType *newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); - - // F->mutateType(PTy); - Function *newF = cloneFunction(F, newFT, false); - replaceNodeFunctionInIR(*F->getParent(), F, newF); +std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) { + std::vector<unsigned> ConstantMemArgs; + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Argument* arg = &*ai; + std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(), + 
GlobalMemArgs->end(), arg->getArgNo()); + // It has to be a global memory argument to be promotable + if(pos == GlobalMemArgs->end()) + continue; + + // Check if it can/should be promoted + if(canBePromoted(arg, F)) { + errs() << "Promoting << " << arg->getName() << " to constant memory."<< "\n"; + ConstantMemArgs.push_back(arg->getArgNo()); + GlobalMemArgs->erase(pos); + } + } + return ConstantMemArgs; +} - DEBUG(errs() << *newF->getFunctionType() << "\n" << *newF << "\n"); - return newF; +Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) { + unsigned idx = 0; + std::vector<Type*> ArgTypes; + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Argument *arg = &*ai; + DEBUG(errs() << *arg << "\n"); + unsigned argno = arg->getArgNo(); + if ((idx < Args.size()) && (argno == Args[idx])) { + fixValueAddrspace(arg, addrspace); + idx++; + } + ArgTypes.push_back(arg->getType()); + } + FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); + + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + + DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n"); + return newF; } /* Add metadata to module KernelM, for OpenCL kernels */ void CGT_NVPTX::addCLMetadata(Function *F) { - IRBuilder<> Builder(&*F->begin()); - - SmallVector<Metadata *, 8> KernelMD; - KernelMD.push_back(ValueAsMetadata::get(F)); - - // TODO: There is additional metadata used by kernel files but we skip them as - // they are not mandatory. In future they might be useful to enable - // optimizations - - MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_kernels = - KernelM->getOrInsertNamedMetadata("opencl.kernels"); - MDN_kernels->addOperand(MDKernelNode); - - KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); - // TODO: Replace 1 with the number of the kernel. 
- // Add when support for multiple launces is added - KernelMD.push_back(ValueAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1))); - MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_annotations = - KernelM->getOrInsertNamedMetadata("nvvm.annotations"); - MDN_annotations->addOperand(MDNvvmAnnotationsNode); -} + IRBuilder<> Builder(&*F->begin()); -void CGT_NVPTX::writeKernelsModule() { + SmallVector<Metadata*,8> KernelMD; + KernelMD.push_back(ValueAsMetadata::get(F)); - // In addition to deleting all other functions, we also want to spiff it - // up a little bit. Do this now. - legacy::PassManager Passes; + // TODO: There is additional metadata used by kernel files but we skip them as + // they are not mandatory. In future they might be useful to enable + // optimizations - DEBUG(errs() << "Writing to File --- "); - DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n"); - std::error_code EC; - ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); - if (EC) { - DEBUG(errs() << EC.message() << '\n'); - } + MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels"); + MDN_kernels->addOperand(MDKernelNode); - Passes.add(createPrintModulePass(Out.os())); + KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); + // TODO: Replace 1 with the number of the kernel. + // Add when support for multiple launces is added + KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1))); + MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations"); + MDN_annotations->addOperand(MDNvvmAnnotationsNode); - Passes.run(*KernelM); - - // Declare success. 
- Out.keep(); } -Function *CGT_NVPTX::transformFunctionToVoid(Function *F) { - - DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); - // FIXME: Maybe do that using the Node? - StructType *FRetTy = dyn_cast<StructType>(F->getReturnType()); - assert(FRetTy && "Return Type must always be a struct"); +void CGT_NVPTX::writeKernelsModule() { - // Keeps return statements, because we will need to replace them - std::vector<ReturnInst *> RItoRemove; - findReturnInst(F, RItoRemove); + // In addition to deleting all other functions, we also want to spiff it + // up a little bit. Do this now. + legacy::PassManager Passes; - std::vector<Type *> RetArgTypes; - std::vector<Argument *> RetArgs; - std::vector<Argument *> Args; - // Check for { } return struct, which means that the function returns void - if (FRetTy->isEmptyTy()) { + errs() << "Writing to File --- "; + errs() << getKernelsModuleName(M).c_str() << "\n"; + std::error_code EC; + ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); + if (EC) { + errs() << EC.message() << '\n'; + } - DEBUG(errs() << "\tFunction output struct is void\n"); - DEBUG(errs() << "\tNo parameters added\n"); + Passes.add( + createPrintModulePass(Out.os())); - // Replacing return statements with others returning void - for (auto *RI : RItoRemove) { - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - } - DEBUG(errs() << "\tChanged return statements to return void\n"); - } else { - // The struct has return values, thus needs to be converted to parameter - - // Iterate over all element types of return struct and add arguments to the - // function - for (unsigned i = 0; i < FRetTy->getNumElements(); i++) { - Argument *RetArg = - new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); - RetArgs.push_back(RetArg); - RetArgTypes.push_back(RetArg->getType()); - DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); - } + Passes.run(*KernelM); - DEBUG(errs() << 
"\tReplacing Return statements\n"); - // Replace return statements with extractValue and store instructions - for (auto *RI : RItoRemove) { - Value *RetVal = RI->getReturnValue(); - for (unsigned i = 0; i < RetArgs.size(); i++) { - ExtractValueInst *EI = ExtractValueInst::Create( - RetVal, ArrayRef<unsigned>(i), RetArgs[i]->getName() + ".val", RI); - new StoreInst(EI, RetArgs[i], RI); - } - // assert(RetVal && "Return value should not be null at this point"); - // StructType* RetType = cast<StructType>(RetVal->getType()); - // assert(RetType && "Return type is not a struct"); - - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - } - } - DEBUG(errs() << "\tReplaced return statements\n"); - - // Create the argument type list with the added argument's type - std::vector<Type *> ArgTypes; - for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - ArgTypes.push_back(ai->getType()); - } - for (auto *RATy : RetArgTypes) { - ArgTypes.push_back(RATy); - } - - // Creating Args vector to use in cloning! - for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; - ++ai) { - Args.push_back(&*ai); - } - for (auto *ai : RetArgs) { - Args.push_back(ai); - } + // Declare success. + Out.keep(); +} - // Adding new arguments to the function argument list, would not change the - // function type. We need to change the type of this function to reflect the - // added arguments - Type *VoidRetType = Type::getVoidTy(F->getContext()); - FunctionType *newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); - - // Change the function type - // F->mutateType(PTy); - Function *newF = cloneFunction(F, newFT, false, NULL, &Args); - replaceNodeFunctionInIR(*F->getParent(), F, newF); - // F->eraseFromParent(); - return newF; +Function* CGT_NVPTX::transformFunctionToVoid(Function* F) { + + DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); + // FIXME: Maybe do that using the Node? 
+ StructType* FRetTy = dyn_cast<StructType>(F->getReturnType()); + assert(FRetTy && "Return Type must always be a struct"); + + // Keeps return statements, because we will need to replace them + std::vector<ReturnInst *> RItoRemove; + findReturnInst(F, RItoRemove); + + std::vector<Type *> RetArgTypes; + std::vector<Argument*> RetArgs; + std::vector<Argument*> Args; + // Check for { } return struct, which means that the function returns void + if (FRetTy->isEmptyTy()) { + + DEBUG(errs() << "\tFunction output struct is void\n"); + DEBUG(errs() << "\tNo parameters added\n"); + + // Replacing return statements with others returning void + for (auto *RI : RItoRemove) { + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + } + DEBUG(errs() << "\tChanged return statements to return void\n"); + } + else { + // The struct has return values, thus needs to be converted to parameter + + // Iterate over all element types of return struct and add arguments to the + // function + for (unsigned i=0; i<FRetTy->getNumElements(); i++) { + Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); + RetArgs.push_back(RetArg); + RetArgTypes.push_back(RetArg->getType()); + DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); + } + + DEBUG(errs() << "\tReplacing Return statements\n"); + // Replace return statements with extractValue and store instructions + for (auto *RI : RItoRemove) { + Value* RetVal = RI->getReturnValue(); + for(unsigned i = 0; i < RetArgs.size(); i++) { + ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i), + RetArgs[i]->getName()+".val", RI); + new StoreInst(EI, RetArgs[i], RI); + } + // assert(RetVal && "Return value should not be null at this point"); + // StructType* RetType = cast<StructType>(RetVal->getType()); + // assert(RetType && "Return type is not a struct"); + + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + + } + } + DEBUG(errs() << 
"\tReplaced return statements\n"); + + // Create the argument type list with the added argument's type + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + for(auto *RATy: RetArgTypes) { + ArgTypes.push_back(RATy); + } + + // Creating Args vector to use in cloning! + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Args.push_back(&*ai); + } + for(auto *ai : RetArgs) { + Args.push_back(ai); + } + + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments + Type* VoidRetType = Type::getVoidTy(F->getContext()); + FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); + + // Change the function type + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false, NULL, &Args); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + //F->eraseFromParent(); + return newF; } /****************************************************************************** @@ -2138,344 +2102,314 @@ Function *CGT_NVPTX::transformFunctionToVoid(Function *F) { // 1. No stores // 2. 
Loads not dependent on getNodeInstanceID itrinsic -static bool findLoadStoreUses(Value *V, std::vector<Value *> *UseList, - std::vector<Value *> *VisitedList) { - if (std::find(VisitedList->begin(), VisitedList->end(), V) != - VisitedList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - VisitedList->push_back(V); - for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; - ++ui) { - Instruction *I = dyn_cast<Instruction>(*ui); - if (!I) { - // if use is not an instruction, then skip it - continue; - } - DEBUG(errs() << "\t" << *I << "\n"); - if (isa<LoadInst>(I)) { - DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); - DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); - UseList->push_back(V); - } else if (isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { - // found a store in use chain - DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); - return true; - } else if (BuildDFG::isHPVMIntrinsic(I)) { - // If it is an atomic intrinsic, we found a store - IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); - assert(II && - II->getCalledValue()->getName().startswith("llvm.hpvm.atomic") && - "Only hpvm atomic intrinsics can have an argument as input"); - return true; - } else { - DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); - if (findLoadStoreUses(I, UseList, VisitedList)) - return true; - } - } - return false; +static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) { + if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + VisitedList->push_back(V); + for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); + ui != ue; ++ui) { + Instruction* I = dyn_cast<Instruction>(*ui); + if(!I) { + // if use is not an instruction, then skip it + continue; + } + DEBUG(errs() << "\t" << *I << "\n"); + 
if(isa<LoadInst>(I)) { + DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); + DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); + UseList->push_back(V); + } + else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { + // found a store in use chain + DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); + return true; + } + else if(BuildDFG::isViscIntrinsic(I)) { + // If it is an atomic intrinsic, we found a store + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic") + && "Only visc atomic intrinsics can have an argument as input"); + return true; + } + else { + DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); + if(findLoadStoreUses(I, UseList, VisitedList)) + return true; + } + } + return false; } -static bool isDependentOnNodeInstanceID(Value *V, - std::vector<Value *> *DependenceList) { - if (std::find(DependenceList->begin(), DependenceList->end(), V) != - DependenceList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - DependenceList->push_back(V); - // If not an instruction, then not dependent on node instance id - if (!isa<Instruction>(V) || isa<Constant>(V)) { - DEBUG(errs() << "\tStop\n"); - return false; - } - - Instruction *I = cast<Instruction>(V); - for (unsigned i = 0; i < I->getNumOperands(); i++) { - Value *operand = I->getOperand(i); - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(operand)) { - if ((II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_x || - II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_y || - II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_z)) { - Value *Node = II->getArgOperand(0); - IntrinsicInst *GN = dyn_cast<IntrinsicInst>(Node); - assert( - GN && - "NodeInstanceID operande should be node/parent node intrinsic\n"); - if (GN->getIntrinsicID() == Intrinsic::hpvm_getNode) { - DEBUG(errs() << "\tDependency found on Node instance ID: " << *II - << 
"\n"); - return true; - } - } - } - if (CmpInst *CI = dyn_cast<CmpInst>(operand)) { - DEBUG(errs() << "Found compare instruction: " << *CI - << "\nNot following its dependency list\n"); - continue; - } - DEBUG(errs() << "\tTraverse the operand chain of: " << *operand << "\n"); - if (isDependentOnNodeInstanceID(operand, DependenceList)) { - return true; - } - } - return false; +static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) { + if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + DependenceList->push_back(V); + // If not an instruction, then not dependent on node instance id + if(!isa<Instruction>(V) || isa<Constant>(V)) { + DEBUG(errs() << "\tStop\n"); + return false; + } + + Instruction* I = cast<Instruction>(V); + for(unsigned i = 0; i < I->getNumOperands(); i++) { + Value* operand = I->getOperand(i); + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) { + if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x + || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y + || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) { + Value* Node = II->getArgOperand(0); + IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node); + assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n"); + if(GN->getIntrinsicID() == Intrinsic::visc_getNode) { + DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n"); + return true; + } + } + } + if(CmpInst* CI = dyn_cast<CmpInst>(operand)) { + DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n"); + continue; + } + DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n"); + if(isDependentOnNodeInstanceID(operand, DependenceList)) { + return true; + } + } + return false; } // Function to check if argument arg can be changed to a constant memory pointer 
-static bool canBePromoted(Argument *arg, Function *F) { - DEBUG(errs() << "OPT: Check if Argument " << *arg - << " can be changed to constant memory\n"); - std::vector<Value *> UseList; - std::vector<Value *> VisitedList; - // recursively traverse use chain - // if find a store instruction return false, everything fails, cannot be - // promoted - // if find a load instruction as use, add the GEP instruction to list - bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); - if (foundStore == true) - return false; - // See that the GEP instructions are not dependent on getNodeInstanceID - // intrinsic - DEBUG(errs() << foundStore - << "\tNo Store Instruction found. Check dependence on node " - "instance ID\n"); - std::vector<Value *> DependenceList; - for (auto U : UseList) { - if (isDependentOnNodeInstanceID(U, &DependenceList)) - return false; - } - DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); - return true; +static bool canBePromoted(Argument* arg, Function* F) { + DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n"); + std::vector<Value*> UseList; + std::vector<Value*> VisitedList; + // recursively traverse use chain + // if find a store instruction return false, everything fails, cannot be + // promoted + // if find a load instruction as use, add the GEP instruction to list + bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); + if(foundStore == true) + return false; + // See that the GEP instructions are not dependent on getNodeInstanceID + // intrinsic + DEBUG(errs() << foundStore << "\tNo Store Instruction found. 
Check dependence on node instance ID\n"); + std::vector<Value*>DependenceList; + for(auto U: UseList) { + if(isDependentOnNodeInstanceID(U, &DependenceList)) + return false; + } + DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); + return true; } + // Calculate execute node parameters which include, number of diemnsions for // dynamic instances of the kernel, local and global work group sizes. -static void getExecuteNodeParams(Module &M, Value *&workDim, Value *&LocalWGPtr, - Value *&GlobalWGPtr, Kernel *kernel, - ValueToValueMapTy &VMap, Instruction *IB) { - - // Assign number of dimenstions a constant value - workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); - - // If local work group size if null - if (!kernel->hasLocalWG()) { - LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); - } else { - for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { - if (isa<Argument>(kernel->localWGSize[i])) - kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; - } - LocalWGPtr = - genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); - } - - for (unsigned i = 0; i < kernel->globalWGSize.size(); i++) { - if (isa<Argument>(kernel->globalWGSize[i])) - kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; - } - - // For OpenCL, global work group size is the total bumber of instances in each - // dimension. So, multiply local and global dim limits. 
- std::vector<Value *> globalWGSizeInsts; - if (kernel->hasLocalWG()) { - for (unsigned i = 0; i < kernel->gridDim; i++) { - BinaryOperator *MulInst = - BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], - kernel->localWGSize[i], "", IB); - globalWGSizeInsts.push_back(MulInst); - } - } else { - globalWGSizeInsts = kernel->globalWGSize; - } - GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); - DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); +static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value* + &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) { + + // Assign number of dimenstions a constant value + workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); + + // If local work group size if null + if(!kernel->hasLocalWG()) { + LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); + } + else { + for(unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if(isa<Argument>(kernel->localWGSize[i])) + kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; + } + LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); + } + + for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) { + if(isa<Argument>(kernel->globalWGSize[i])) + kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; + } + + // For OpenCL, global work group size is the total bumber of instances in each + // dimension. So, multiply local and global dim limits. 
+ std::vector<Value*> globalWGSizeInsts; + if(kernel->hasLocalWG()) { + for (unsigned i = 0; i < kernel->gridDim; i++) { + BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB); + globalWGSizeInsts.push_back(MulInst); + } + } + else { + globalWGSizeInsts = kernel->globalWGSize; + } + GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); + DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); } // CodeGen for allocating space for Work Group on stack and returning a pointer // to its address -static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize, - ValueToValueMapTy &VMap, Instruction *IB, - const Twine &WGName) { - Value *WGPtr; - // Get int64_t and or ease of use - Type *Int64Ty = Type::getInt64Ty(M.getContext()); - - // Work Group type is [#dim x i64] - Type *WGTy = ArrayType::get(Int64Ty, WGSize.size()); - // Allocate space of Global work group data on stack and get pointer to - // first element. - AllocaInst *WG = new AllocaInst(WGTy, 0, WGName, IB); - WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), - WG->getName() + ".0", IB); - Value *nextDim = WGPtr; - DEBUG(errs() << *WGPtr << "\n"); - - // Iterate over the number of dimensions and store the global work group - // size in that dimension - for (unsigned i = 0; i < WGSize.size(); i++) { - DEBUG(errs() << *WGSize[i] << "\n"); - assert(WGSize[i]->getType()->isIntegerTy() && - "Dimension not an integer type!"); - - if (WGSize[i]->getType() != Int64Ty) { - // If number of dimensions are mentioned in any other integer format, - // generate code to extend it to i64. We need to use the mapped value in - // the new generated function, hence the use of VMap - // FIXME: Why are we changing the kernel WGSize vector here? - DEBUG(errs() << "Not i64. 
Zero extend required.\n"); - DEBUG(errs() << *WGSize[i] << "\n"); - CastInst *CI = - BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); - DEBUG(errs() << "Bitcast done.\n"); - StoreInst *SI = new StoreInst(CI, nextDim, IB); - DEBUG(errs() << "Zero extend done.\n"); - DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); - } else { - // Store the value representing work group size in ith dimension on - // stack - StoreInst *SI = new StoreInst(WGSize[i], nextDim, IB); +static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) { + Value* WGPtr; + // Get int64_t and or ease of use + Type* Int64Ty = Type::getInt64Ty(M.getContext()); + + // Work Group type is [#dim x i64] + Type* WGTy = ArrayType::get(Int64Ty, WGSize.size()); + // Allocate space of Global work group data on stack and get pointer to + // first element. + AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB); + WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB); + Value* nextDim = WGPtr; + DEBUG(errs() << *WGPtr << "\n"); + + // Iterate over the number of dimensions and store the global work group + // size in that dimension + for(unsigned i=0; i < WGSize.size(); i++) { + DEBUG(errs() << *WGSize[i] << "\n"); + assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); + + if(WGSize[i]->getType() != Int64Ty) { + // If number of dimensions are mentioned in any other integer format, + // generate code to extend it to i64. We need to use the mapped value in + // the new generated function, hence the use of VMap + // FIXME: Why are we changing the kernel WGSize vector here? + DEBUG(errs() << "Not i64. 
Zero extend required.\n"); + DEBUG(errs() << *WGSize[i] << "\n"); + CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); + DEBUG(errs() << "Bitcast done.\n"); + StoreInst* SI = new StoreInst(CI, nextDim, IB); + DEBUG(errs() << "Zero extend done.\n"); + DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); + } else { + // Store the value representing work group size in ith dimension on + // stack + StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); + + DEBUG(errs() << "\t Work group size: " << *SI << "\n"); + } + if(i+1 < WGSize.size()) { + // Move to next dimension + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim, + ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)), + WG->getName()+"."+Twine(i+1), + IB); + DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); + nextDim = GEP; + } + } + return WGPtr; - DEBUG(errs() << "\t Work group size: " << *SI << "\n"); - } - if (i + 1 < WGSize.size()) { - // Move to next dimension - GetElementPtrInst *GEP = GetElementPtrInst::Create( - nullptr, nextDim, ArrayRef<Value *>(ConstantInt::get(Int64Ty, 1)), - WG->getName() + "." 
+ Twine(i + 1), IB); - DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); - nextDim = GEP; - } - } - return WGPtr; } // Get generated PTX binary name -static std::string getPTXFilename(const Module &M) { - std::string moduleID = M.getModuleIdentifier(); - moduleID.append(".kernels.cl"); - return moduleID; +static std::string getPTXFilename(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + moduleID.append(".kernels.cl"); + return moduleID; } // Get the name of the input file from module ID -static std::string getFilenameFromModule(const Module &M) { - std::string moduleID = M.getModuleIdentifier(); - return moduleID.substr(moduleID.find_last_of("/") + 1); +static std::string getFilenameFromModule(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + return moduleID.substr(moduleID.find_last_of("/")+1); } // Changes the data layout of the Module to be compiled with NVPTX backend // TODO: Figure out when to call it, probably after duplicating the modules static void changeDataLayout(Module &M) { - std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; - std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; - if (TARGET_PTX == 32) - M.setDataLayout(StringRef(nvptx32_layoutStr)); - else if (TARGET_PTX == 64) - M.setDataLayout(StringRef(nvptx64_layoutStr)); - else - assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setDataLayout(StringRef(nvptx32_layoutStr)); + else if (TARGET_PTX == 64) + M.setDataLayout(StringRef(nvptx64_layoutStr)); + else assert(false && "Invalid PTX target"); - return; + return; } static void changeTargetTriple(Module &M) { - std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; - std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; + std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; + 
std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; - if (TARGET_PTX == 32) - M.setTargetTriple(StringRef(nvptx32_TargetTriple)); - else if (TARGET_PTX == 64) - M.setTargetTriple(StringRef(nvptx64_TargetTriple)); - else - assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setTargetTriple(StringRef(nvptx32_TargetTriple)); + else if (TARGET_PTX == 64) + M.setTargetTriple(StringRef(nvptx64_TargetTriple)); + else assert(false && "Invalid PTX target"); - return; + return; } // Helper function, populate a vector with all return statements in a function -static void findReturnInst(Function *F, - std::vector<ReturnInst *> &ReturnInstVec) { - for (auto &BB : *F) { - if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) - ReturnInstVec.push_back(RI); - } +static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { + for (auto &BB : *F) { + if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) + ReturnInstVec.push_back(RI); + } } -// Helper function, populate a vector with all IntrinsicID intrinsics in a -// function -static void findIntrinsicInst(Function *F, Intrinsic::ID IntrinsicID, - std::vector<IntrinsicInst *> &IntrinsicInstVec) { - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { - Instruction *I = &(*i); - IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); - if (II && II->getIntrinsicID() == IntrinsicID) { - IntrinsicInstVec.push_back(II); - } - } +// Helper function, populate a vector with all IntrinsicID intrinsics in a function +static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + if (II && II->getIntrinsicID() == IntrinsicID) { + IntrinsicInstVec.push_back(II); + } + } } -// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic -// op +// Helper funtion, 
returns the atomicrmw op, corresponding to intrinsic atomic op static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) { - switch (ID) { - case Intrinsic::hpvm_atomic_add: - return AtomicRMWInst::Add; - case Intrinsic::hpvm_atomic_sub: - return AtomicRMWInst::Sub; - case Intrinsic::hpvm_atomic_min: - return AtomicRMWInst::Min; - case Intrinsic::hpvm_atomic_umin: - return AtomicRMWInst::UMin; - case Intrinsic::hpvm_atomic_max: - return AtomicRMWInst::Max; - case Intrinsic::hpvm_atomic_umax: - return AtomicRMWInst::UMax; - // case Intrinsic::hpvm_atomic_inc: return AtomicRMWInst::Inc; - // case Intrinsic::hpvm_atomic_dec: return AtomicRMWInst::Dec; - case Intrinsic::hpvm_atomic_xchg: - return AtomicRMWInst::Xchg; - case Intrinsic::hpvm_atomic_and: - return AtomicRMWInst::And; - case Intrinsic::hpvm_atomic_or: - return AtomicRMWInst::Or; - case Intrinsic::hpvm_atomic_xor: - return AtomicRMWInst::Xor; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; + switch(ID) { + case Intrinsic::visc_atomic_add: + return AtomicRMWInst::Add; + case Intrinsic::visc_atomic_sub: + return AtomicRMWInst::Sub; + case Intrinsic::visc_atomic_min: + return AtomicRMWInst::Min; + case Intrinsic::visc_atomic_max: + return AtomicRMWInst::Max; + case Intrinsic::visc_atomic_xchg: + return AtomicRMWInst::Xchg; + case Intrinsic::visc_atomic_and: + return AtomicRMWInst::And; + case Intrinsic::visc_atomic_or: + return AtomicRMWInst::Or; + case Intrinsic::visc_atomic_xor: + return AtomicRMWInst::Xor; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; } + // Helper funtion, returns the OpenCL function name, corresponding to atomic op static std::string getAtomicOpName(Intrinsic::ID ID) { - switch (ID) { - case Intrinsic::hpvm_atomic_cmpxchg: - return "atom_cmpxchg"; - case Intrinsic::hpvm_atomic_add: - return "atom_add"; - case Intrinsic::hpvm_atomic_sub: - return "atom_sub"; - case Intrinsic::hpvm_atomic_min: - return "atom_min"; - case Intrinsic::hpvm_atomic_max: 
- return "atom_max"; - case Intrinsic::hpvm_atomic_inc: - return "atom_inc"; - case Intrinsic::hpvm_atomic_dec: - return "atom_dec"; - case Intrinsic::hpvm_atomic_xchg: - return "atom_xchg"; - case Intrinsic::hpvm_atomic_and: - return "atom_and"; - case Intrinsic::hpvm_atomic_or: - return "atom_or"; - case Intrinsic::hpvm_atomic_xor: - return "atom_xor"; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; + switch(ID) { + case Intrinsic::visc_atomic_add: + return "atom_add"; + case Intrinsic::visc_atomic_sub: + return "atom_sub"; + case Intrinsic::visc_atomic_min: + return "atom_min"; + case Intrinsic::visc_atomic_max: + return "atom_max"; + case Intrinsic::visc_atomic_xchg: + return "atom_xchg"; + case Intrinsic::visc_atomic_and: + return "atom_and"; + case Intrinsic::visc_atomic_or: + return "atom_or"; + case Intrinsic::visc_atomic_xor: + return "atom_xor"; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; } } // End of namespace @@ -2486,3 +2420,4 @@ static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx", false /* does not modify the CFG */, true /* transformation, * * not just analysis */); + diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index 8152817d9a..a0fa9fcde4 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "DFG2LLVM_X86" -#include "SupportHPVM/DFG2LLVM.h" +#include "SupportVISC/DFG2LLVM.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" @@ -25,29 +25,29 @@ using namespace llvm; using namespace builddfg; using namespace dfg2llvm; -// HPVM Command line option to use timer or not -static cl::opt<bool> HPVMTimer_X86("hpvm-timers-x86", - cl::desc("Enable hpvm timers")); +// VISC Command line option to use timer or not +static 
cl::opt<bool> VISCTimer_X86("visc-timers-x86", + cl::desc("Enable visc timers")); // Command line option to enable device abstraction or not static cl::opt<bool> - DeviceAbstraction("hpvm-eda", cl::init(false), cl::Hidden, - cl::desc("Enable hpvm device abstraction")); + DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden, + cl::desc("Enable visc device abstraction")); namespace { // Helper Functions -static bool isHPVMCall_llvm_hpvm_policy_getVersion(Instruction *I) { +static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) { if (!isa<CallInst>(I)) return false; CallInst *CI = cast<CallInst>(I); return (CI->getCalledValue()->stripPointerCasts()->getName()) - .equals("llvm_hpvm_policy_getVersion"); + .equals("llvm_visc_policy_getVersion"); } -CallInst *get_llvm_hpvm_policy_getVersion_call(Function *F) { +CallInst *get_llvm_visc_policy_getVersion_call(Function *F) { for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) { Instruction *I = &*ib; - if (isHPVMCall_llvm_hpvm_policy_getVersion(I)) + if (isVISCCall_llvm_visc_policy_getVersion(I)) return cast<CallInst>(I); } return NULL; @@ -74,27 +74,27 @@ private: // Member variables FunctionCallee malloc; - // HPVM Runtime API - FunctionCallee llvm_hpvm_x86_launch; - FunctionCallee llvm_hpvm_x86_wait; - FunctionCallee llvm_hpvm_x86_argument_ptr; - - FunctionCallee llvm_hpvm_streamLaunch; - FunctionCallee llvm_hpvm_streamPush; - FunctionCallee llvm_hpvm_streamPop; - FunctionCallee llvm_hpvm_streamWait; - FunctionCallee llvm_hpvm_createBindInBuffer; - FunctionCallee llvm_hpvm_createBindOutBuffer; - FunctionCallee llvm_hpvm_createEdgeBuffer; - FunctionCallee llvm_hpvm_createLastInputBuffer; - FunctionCallee llvm_hpvm_createThread; - // Constant* llvm_hpvm_freeThreads; - FunctionCallee llvm_hpvm_bufferPush; - FunctionCallee llvm_hpvm_bufferPop; - FunctionCallee llvm_hpvm_x86_dstack_push; - FunctionCallee llvm_hpvm_x86_dstack_pop; - FunctionCallee llvm_hpvm_x86_getDimLimit; - FunctionCallee 
llvm_hpvm_x86_getDimInstance; + // VISC Runtime API + FunctionCallee llvm_visc_x86_launch; + FunctionCallee llvm_visc_x86_wait; + FunctionCallee llvm_visc_x86_argument_ptr; + + FunctionCallee llvm_visc_streamLaunch; + FunctionCallee llvm_visc_streamPush; + FunctionCallee llvm_visc_streamPop; + FunctionCallee llvm_visc_streamWait; + FunctionCallee llvm_visc_createBindInBuffer; + FunctionCallee llvm_visc_createBindOutBuffer; + FunctionCallee llvm_visc_createEdgeBuffer; + FunctionCallee llvm_visc_createLastInputBuffer; + FunctionCallee llvm_visc_createThread; + // Constant* llvm_visc_freeThreads; + FunctionCallee llvm_visc_bufferPush; + FunctionCallee llvm_visc_bufferPop; + FunctionCallee llvm_visc_x86_dstack_push; + FunctionCallee llvm_visc_x86_dstack_pop; + FunctionCallee llvm_visc_x86_getDimLimit; + FunctionCallee llvm_visc_x86_getDimInstance; // Functions std::vector<IntrinsicInst *> *getUseList(Value *LI); @@ -120,7 +120,7 @@ private: // Virtual Functions void init() { - HPVMTimer = HPVMTimer_X86; + VISCTimer = VISCTimer_X86; TargetName = "X86"; } void initRuntimeAPI(); @@ -177,7 +177,7 @@ bool DFG2LLVM_X86::runOnModule(Module &M) { return true; } -// Initialize the HPVM runtime API. This makes it easier to insert these calls +// Initialize the VISC runtime API. 
This makes it easier to insert these calls void CGT_X86::initRuntimeAPI() { // Load Runtime API Module @@ -187,51 +187,51 @@ void CGT_X86::initRuntimeAPI() { assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); Twine llvmSrcRoot = LLVM_SRC_ROOT; - Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/hpvm-rt/hpvm-rt.ll"; + Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll"; runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); if (runtimeModule == NULL) DEBUG(errs() << Err.getMessage()); else - DEBUG(errs() << "Successfully loaded hpvm-rt API module\n"); + DEBUG(errs() << "Successfully loaded visc-rt API module\n"); // Get or insert the global declarations for launch/wait functions - DECLARE(llvm_hpvm_x86_launch); + DECLARE(llvm_visc_x86_launch); DECLARE(malloc); - DECLARE(llvm_hpvm_x86_wait); - DECLARE(llvm_hpvm_x86_argument_ptr); - DECLARE(llvm_hpvm_streamLaunch); - DECLARE(llvm_hpvm_streamPush); - DECLARE(llvm_hpvm_streamPop); - DECLARE(llvm_hpvm_streamWait); - DECLARE(llvm_hpvm_createBindInBuffer); - DECLARE(llvm_hpvm_createBindOutBuffer); - DECLARE(llvm_hpvm_createEdgeBuffer); - DECLARE(llvm_hpvm_createLastInputBuffer); - DECLARE(llvm_hpvm_createThread); - // DECLARE(llvm_hpvm_freeThreads); - DECLARE(llvm_hpvm_bufferPush); - DECLARE(llvm_hpvm_bufferPop); - DECLARE(llvm_hpvm_x86_dstack_push); - DECLARE(llvm_hpvm_x86_dstack_pop); - DECLARE(llvm_hpvm_x86_getDimLimit); - DECLARE(llvm_hpvm_x86_getDimInstance); + DECLARE(llvm_visc_x86_wait); + DECLARE(llvm_visc_x86_argument_ptr); + DECLARE(llvm_visc_streamLaunch); + DECLARE(llvm_visc_streamPush); + DECLARE(llvm_visc_streamPop); + DECLARE(llvm_visc_streamWait); + DECLARE(llvm_visc_createBindInBuffer); + DECLARE(llvm_visc_createBindOutBuffer); + DECLARE(llvm_visc_createEdgeBuffer); + DECLARE(llvm_visc_createLastInputBuffer); + DECLARE(llvm_visc_createThread); + // DECLARE(llvm_visc_freeThreads); + DECLARE(llvm_visc_bufferPush); + 
DECLARE(llvm_visc_bufferPop); + DECLARE(llvm_visc_x86_dstack_push); + DECLARE(llvm_visc_x86_dstack_pop); + DECLARE(llvm_visc_x86_getDimLimit); + DECLARE(llvm_visc_x86_getDimInstance); // Get or insert timerAPI functions as well if you plan to use timers initTimerAPI(); // Insert init context in main - Function *VI = M.getFunction("llvm.hpvm.init"); - assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once"); + Function *VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); DEBUG(errs() << "Inserting x86 timer initialization\n"); Instruction *I = cast<Instruction>(*VI->user_begin()); initializeTimerSet(I); - switchToTimer(hpvm_TimerID_NONE, I); + switchToTimer(visc_TimerID_NONE, I); // Insert code for initializing the sceduling policy FunctionCallee IP = M.getOrInsertFunction( - "llvm_hpvm_policy_init", - runtimeModule->getFunction("llvm_hpvm_policy_init")->getFunctionType()); + "llvm_visc_policy_init", + runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()); CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IPCallInst << "\n"); @@ -239,22 +239,22 @@ void CGT_X86::initRuntimeAPI() { // device status simulation if (DeviceAbstraction) { FunctionCallee ID = M.getOrInsertFunction( - "llvm_hpvm_deviceAbstraction_start", - runtimeModule->getFunction("llvm_hpvm_deviceAbstraction_start") + "llvm_visc_deviceAbstraction_start", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_start") ->getFunctionType()); CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IDCallInst << "\n"); } - // Insert print instruction at hpvm exit - Function *VC = M.getFunction("llvm.hpvm.cleanup"); - assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once"); + // Insert print instruction at visc exit + Function *VC = M.getFunction("llvm.visc.cleanup"); + assert(VC->getNumUses() == 1 && "__visc__cleanup 
should only be used once"); // Insert code for clearing the sceduling policy I = cast<Instruction>(*VC->user_begin()); IP = M.getOrInsertFunction( - "llvm_hpvm_policy_clear", - runtimeModule->getFunction("llvm_hpvm_policy_clear")->getFunctionType()); + "llvm_visc_policy_clear", + runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()); IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IPCallInst << "\n"); @@ -265,8 +265,8 @@ void CGT_X86::initRuntimeAPI() { // device status simulation if (DeviceAbstraction) { FunctionCallee ID = M.getOrInsertFunction( - "llvm_hpvm_deviceAbstraction_end", - runtimeModule->getFunction("llvm_hpvm_deviceAbstraction_end") + "llvm_visc_deviceAbstraction_end", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_end") ->getFunctionType()); CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I); DEBUG(errs() << *IDCallInst << "\n"); @@ -542,7 +542,7 @@ void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args, // Call runtime to create the thread with these arguments DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n"); - // DEBUG(errs() << *llvm_hpvm_createThread << "\n"); + // DEBUG(errs() << *llvm_visc_createThread << "\n"); DEBUG(errs() << *graphID->getType() << "\n"); DEBUG(errs() << *C_Pipeline->getType() << "\n"); DEBUG(errs() << *Struct->getType() << "\n"); @@ -551,7 +551,7 @@ void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args, Struct->getName(), IB); Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI}; CallInst *CreateThread = CallInst::Create( - llvm_hpvm_createThread, ArrayRef<Value *>(CreateThreadArgs, 3), "", IB); + llvm_visc_createThread, ArrayRef<Value *>(CreateThreadArgs, 3), "", IB); } Function *CGT_X86::createLaunchFunction(DFInternalNode *N) { @@ -639,17 +639,17 @@ Function *CGT_X86::createLaunchFunction(DFInternalNode *N) { Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition()); 
Value *BindInCallArgs[] = {graphID, size, Int_ArgNo}; CI = CallInst::Create( - llvm_hpvm_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3), + llvm_visc_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3), "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI); } else if (Edge->getDestDF()->isExitNode()) { // Bind Output Edge CI = CallInst::Create( - llvm_hpvm_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2), + llvm_visc_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2), "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI); } else { // Streaming Edge CI = CallInst::Create( - llvm_hpvm_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2), + llvm_visc_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2), Edge->getSourceDF()->getFuncPointer()->getName() + "." + Edge->getDestDF()->getFuncPointer()->getName(), RI); @@ -668,7 +668,7 @@ Function *CGT_X86::createLaunchFunction(DFInternalNode *N) { Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext())); Value *CallArgs[] = {graphID, size}; CallInst *CI = CallInst::Create( - llvm_hpvm_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2), + llvm_visc_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2), "BindIn.isLastInput." 
+ child->getFuncPointer()->getName(), RI); NodeLastInputMap[child] = CI; } @@ -729,7 +729,7 @@ void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) { DEBUG(errs() << "Substitute launch intrinsic\n"); Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)}; CallInst *LaunchInst = CallInst::Create( - llvm_hpvm_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2), + llvm_visc_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2), "graph" + Root->getFuncPointer()->getName(), LI); // ReplaceInstWithInst(LI, LaunchInst); @@ -742,16 +742,16 @@ void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) { CallInst *CI; Value *PushArgs[] = {LaunchInst, II->getOperand(1)}; switch (II->getIntrinsicID()) { - case Intrinsic::hpvm_wait: - CI = CallInst::Create(llvm_hpvm_streamWait, ArrayRef<Value *>(LaunchInst), + case Intrinsic::visc_wait: + CI = CallInst::Create(llvm_visc_streamWait, ArrayRef<Value *>(LaunchInst), ""); break; - case Intrinsic::hpvm_push: - CI = CallInst::Create(llvm_hpvm_streamPush, + case Intrinsic::visc_push: + CI = CallInst::Create(llvm_visc_streamPush, ArrayRef<Value *>(PushArgs, 2), ""); break; - case Intrinsic::hpvm_pop: - CI = CallInst::Create(llvm_hpvm_streamPop, ArrayRef<Value *>(LaunchInst), + case Intrinsic::visc_pop: + CI = CallInst::Create(llvm_visc_streamPop, ArrayRef<Value *>(LaunchInst), ""); break; default: @@ -771,7 +771,7 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { DEBUG(errs() << "Generating Launch Function\n"); // Get Launch Instruction IntrinsicInst *LI = Root->getInstruction(); - switchToTimer(hpvm_TimerID_PTHREAD_CREATE, LI); + switchToTimer(visc_TimerID_PTHREAD_CREATE, LI); DEBUG(errs() << "Generating Launch Function\n"); /* Now we have all the necessary global declarations necessary to generate the @@ -802,14 +802,14 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { ReturnInst *RI = ReturnInst::Create(AppFunc->getContext(), Constant::getNullValue(AppFunc->getReturnType()), BB); - 
switchToTimer(hpvm_TimerID_ARG_UNPACK, RI); + switchToTimer(visc_TimerID_ARG_UNPACK, RI); DEBUG(errs() << "Created Empty Launch Function\n"); // Find the X86 function generated for Root and // Function* RootF_X86 = Root->getGenFunc(); - Function *RootF_X86 = Root->getGenFuncForTarget(hpvm::CPU_TARGET); + Function *RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET); assert(RootF_X86 && "Error: No generated CPU function for Root node\n"); - assert(Root->hasX86GenFuncForTarget(hpvm::CPU_TARGET) && + assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) && "Error: Generated Function for Root node with no x86 wrapper\n"); // Generate a call to RootF_X86 with null parameters for now @@ -837,8 +837,8 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { CI->setArgOperand(i, elements[i]); // Add timers around Call to RootF_X86 function - switchToTimer(hpvm_TimerID_COMPUTATION, CI); - switchToTimer(hpvm_TimerID_OUTPUT_PACK, RI); + switchToTimer(visc_TimerID_COMPUTATION, CI); + switchToTimer(visc_TimerID_OUTPUT_PACK, RI); StructType *RootRetTy = cast<StructType>(RootF_X86->getFunctionType()->getReturnType()); @@ -888,7 +888,7 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { new StoreInst(CI, OutputAddrCast, RI); } - switchToTimer(hpvm_TimerID_NONE, RI); + switchToTimer(visc_TimerID_NONE, RI); DEBUG(errs() << "Application specific function:\n"); DEBUG(errs() << *AppFunc << "\n"); @@ -896,7 +896,7 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { // Substitute launch intrinsic main Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)}; CallInst *LaunchInst = CallInst::Create( - llvm_hpvm_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2), + llvm_visc_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2), "graph" + Root->getFuncPointer()->getName(), LI); // ReplaceInstWithInst(LI, LaunchInst); @@ -907,16 +907,16 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) { IntrinsicInst *II = UseList->at(i); CallInst *CI; switch (II->getIntrinsicID()) { - case 
Intrinsic::hpvm_wait: - CI = CallInst::Create(llvm_hpvm_x86_wait, ArrayRef<Value *>(LaunchInst), + case Intrinsic::visc_wait: + CI = CallInst::Create(llvm_visc_x86_wait, ArrayRef<Value *>(LaunchInst), ""); break; - case Intrinsic::hpvm_push: - CI = CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(LaunchInst), + case Intrinsic::visc_push: + CI = CallInst::Create(llvm_visc_bufferPush, ArrayRef<Value *>(LaunchInst), ""); break; - case Intrinsic::hpvm_pop: - CI = CallInst::Create(llvm_hpvm_bufferPop, ArrayRef<Value *>(LaunchInst), + case Intrinsic::visc_pop: + CI = CallInst::Create(llvm_visc_bufferPop, ArrayRef<Value *>(LaunchInst), ""); break; default: @@ -970,10 +970,10 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, Function *CF = C->getFuncPointer(); // Function* CF_X86 = C->getGenFunc(); - Function *CF_X86 = C->getGenFuncForTarget(hpvm::CPU_TARGET); + Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET); assert(CF_X86 != NULL && "Found leaf node for which code generation has not happened yet!\n"); - assert(C->hasX86GenFuncForTarget(hpvm::CPU_TARGET) && + assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) && "The generated function to be called from x86 backend is not an x86 " "function\n"); DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n"); @@ -1040,7 +1040,7 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, CI->getArgOperand(numArgs - 6 + 2) // iZ }; - CallInst *Push = CallInst::Create(llvm_hpvm_x86_dstack_push, + CallInst *Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value *>(args, 7), "", CI); DEBUG(errs() << "Push on stack: " << *Push << "\n"); // Insert call to runtime to pop the dim limits and instanceID from the depth @@ -1053,7 +1053,7 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86, assert(NextI->getParent() == CI->getParent() && "Next Instruction should also belong to the same basic block!"); - CallInst *Pop = CallInst::Create(llvm_hpvm_x86_dstack_pop, None, "", NextI); + 
CallInst *Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI); DEBUG(errs() << "Pop from stack: " << *Pop << "\n"); DEBUG(errs() << *CI->getParent()->getParent()); } @@ -1156,7 +1156,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { "streaming input edges\n"); // First read the termination condition variable islastInput CallInst *isLastInputPop = CallInst::Create( - llvm_hpvm_bufferPop, ArrayRef<Value *>(isLastInput), "", RI); + llvm_visc_bufferPop, ArrayRef<Value *>(isLastInput), "", RI); CastInst *BI = BitCastInst::CreateIntegerCast( isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false, @@ -1173,7 +1173,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { ++i) { if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { CallInst *bufferIn = - CallInst::Create(llvm_hpvm_bufferPop, + CallInst::Create(llvm_visc_bufferPop, ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI); CastInst *BI; if (i->getType()->isPointerTy()) { @@ -1196,7 +1196,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { // DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); // CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, // C->getGenFunc()->getName()+".output", RI); - Function *CGenF = C->getGenFuncForTarget(hpvm::CPU_TARGET); + Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET); DEBUG(errs() << "Type: " << *CGenF->getType() << "\n"); CallInst *CI = CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI); @@ -1222,7 +1222,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { // Push to Output buffer Value *bufferOutArgs[] = {OutputArgs[i], BI}; CallInst *bufferOut = CallInst::Create( - llvm_hpvm_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI); + llvm_visc_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI); } // Add loop around the basic block, which exits the loop if isLastInput is @@ -1236,9 +1236,9 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { CondBB = 
CondStartI->getParent(); BodyBB = CI->getParent(); Instruction *CntI = NULL; - CallInst *GetPolicyCI = get_llvm_hpvm_policy_getVersion_call(CGenF); + CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF); - // If the node function calls the hpvm runtime call to get policy, we update + // If the node function calls the visc runtime call to get policy, we update // it with the counter information. This means we need to pass an additional // argument to the generated function, that is the iteration number, and then // use it as an argument to the policy_getVersion call @@ -1255,14 +1255,14 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { } NewArgTypes.push_back(Type::getInt64Ty(M.getContext())); FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false); - Function *NewCGenF = hpvmUtils::cloneFunction(CGenF, NewFT, false); + Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false); // At least one (the last) argument exists (we added it) Function::arg_iterator ae = NewCGenF->arg_end(); --ae; Argument *CntArg = &*ae; CntArg->setName("iteration"); // Replace the old cpu gen func with this one - C->addGenFunc(NewCGenF, hpvm::CPU_TARGET, true); + C->addGenFunc(NewCGenF, visc::CPU_TARGET, true); // Add counter to the actual parameter list, to create the new call InputArgs.push_back(CntI); @@ -1272,7 +1272,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) { // Set second operand of the policy_getVersion call to the last function // argument - GetPolicyCI = get_llvm_hpvm_policy_getVersion_call(NewCGenF); + GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF); GetPolicyCI->setArgOperand(1, CntArg); } @@ -1292,13 +1292,13 @@ void CGT_X86::codeGen(DFInternalNode *N) { // function before and nothing else needs to be done for this leaf node. 
// if(N->getGenFunc() != NULL) // return; - if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) { + if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << " : skipping it\n"); return; } - assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL && + assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && "Error: Visiting a node for which code already generated\n"); // Sort children in topological order before code generation @@ -1315,7 +1315,7 @@ void CGT_X86::codeGen(DFInternalNode *N) { if (C->isDummyNode()) continue; - if (!(C->hasX86GenFuncForTarget(hpvm::CPU_TARGET))) { + if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) { errs() << "No CPU x86 version for child node " << C->getFuncPointer()->getName() << "\n Skip code gen for parent node " @@ -1361,8 +1361,8 @@ void CGT_X86::codeGen(DFInternalNode *N) { RI = cast<ReturnInst>(BB->getTerminator()); // Add generated function info to DFNode - // N->setGenFunc(F_X86, hpvm::CPU_TARGET); - N->addGenFunc(F_X86, hpvm::CPU_TARGET, true); + // N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::CPU_TARGET, true); // Loop over the arguments, to create the VMap. 
dest_iterator = F_X86->arg_begin(); @@ -1445,13 +1445,13 @@ void CGT_X86::codeGen(DFInternalNode *N) { // If not, we see which version exists, check that it is in fact an x86 // function and save it as the CPU_TARGET function - // TODO: hpvm_id per node, so we can use this for id for policies + // TODO: visc_id per node, so we can use this for id for policies // For now, use node function name and change it later - Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET); - Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET); + Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); - bool CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET); - bool GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET); + bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " << N->getTag() << "\n"); @@ -1460,7 +1460,7 @@ void CGT_X86::codeGen(DFInternalNode *N) { DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n"); DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"); - if (N->getTag() == hpvm::None) { + if (N->getTag() == visc::None) { // No code is available for this node. This (usually) means that this // node is a node that // - from the accelerator backends has been mapped to an intermediate @@ -1469,24 +1469,24 @@ void CGT_X86::codeGen(DFInternalNode *N) { // take place DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node " << N->getFuncPointer()->getName() << "\n"); - } else if (hpvmUtils::isSingleTargetTag(N->getTag())) { + } else if (viscUtils::isSingleTargetTag(N->getTag())) { // There is a single version for this node according to code gen hints. // Therefore, we do not need to check the policy, we simply use the // available implementation, whichever target it is for. 
// Sanity check - to be removed TODO switch (N->getTag()) { - case hpvm::CPU_TARGET: - assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(hpvm::CPU_TARGET) && ""); - assert(!(N->getGenFuncForTarget(hpvm::GPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET)) && ""); + case visc::CPU_TARGET: + assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); break; - case hpvm::GPU_TARGET: - assert(!(N->getGenFuncForTarget(hpvm::CPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(hpvm::CPU_TARGET)) && ""); - assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET) && ""); + case visc::GPU_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); break; default: assert(false && "Unreachable: we checked that tag was single target!\n"); @@ -1499,8 +1499,8 @@ void CGT_X86::codeGen(DFInternalNode *N) { if (DeviceAbstraction) { Function *NodeGenFunc = NULL; switch (N->getTag()) { - case hpvm::GPU_TARGET: - NodeGenFunc = N->getGenFuncForTarget(hpvm::GPU_TARGET); + case visc::GPU_TARGET: + NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET); break; default: break; @@ -1512,9 +1512,9 @@ void CGT_X86::codeGen(DFInternalNode *N) { BasicBlock *BB = &*NodeGenFunc->begin(); std::vector<Value *> Args; // TODO: add the device type as argument? 
FunctionCallee RTF = M.getOrInsertFunction( - "llvm_hpvm_deviceAbstraction_waitOnDeviceStatus", + "llvm_visc_deviceAbstraction_waitOnDeviceStatus", runtimeModule - ->getFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus") + ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus") ->getFunctionType()); CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI()); @@ -1522,17 +1522,17 @@ void CGT_X86::codeGen(DFInternalNode *N) { } Function *Ftmp = N->getGenFuncForTarget(N->getTag()); - N->removeGenFuncForTarget(hpvm::GPU_TARGET); - N->setTag(hpvm::None); - N->addGenFunc(Ftmp, hpvm::CPU_TARGET, true); - N->setTag(hpvm::CPU_TARGET); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->setTag(visc::None); + N->addGenFunc(Ftmp, visc::CPU_TARGET, true); + N->setTag(visc::CPU_TARGET); // Sanity checks - to be removed TODO - CF = N->getGenFuncForTarget(hpvm::CPU_TARGET); - GF = N->getGenFuncForTarget(hpvm::GPU_TARGET); + CF = N->getGenFuncForTarget(visc::CPU_TARGET); + GF = N->getGenFuncForTarget(visc::GPU_TARGET); - CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET); - GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET); + CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); DEBUG(errs() << "After editing\n"); DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " @@ -1545,11 +1545,11 @@ void CGT_X86::codeGen(DFInternalNode *N) { DEBUG(errs() << "Node Name (for policy) : " << N->getFuncPointer()->getName() << "\n"); - Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET); - Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET); + Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); - bool CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET); - bool GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET); + bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = 
N->hasX86GenFuncForTarget(visc::GPU_TARGET); // These assertions express what we can support with the current runtime. // Code generation works the same way even for other target combinations. @@ -1610,8 +1610,8 @@ void CGT_X86::codeGen(DFInternalNode *N) { Args.push_back( ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true)); FunctionCallee RTF = M.getOrInsertFunction( - "llvm_hpvm_policy_getVersion", - runtimeModule->getFunction("llvm_hpvm_policy_getVersion") + "llvm_visc_policy_getVersion", + runtimeModule->getFunction("llvm_visc_policy_getVersion") ->getFunctionType()); CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent); @@ -1646,9 +1646,9 @@ void CGT_X86::codeGen(DFInternalNode *N) { // call std::vector<Value *> Args; // TODO: add the device type as argument? FunctionCallee RTF = M.getOrInsertFunction( - "llvm_hpvm_deviceAbstraction_waitOnDeviceStatus", + "llvm_visc_deviceAbstraction_waitOnDeviceStatus", runtimeModule - ->getFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus") + ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus") ->getFunctionType()); CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); } @@ -1673,8 +1673,8 @@ void CGT_X86::codeGen(DFInternalNode *N) { // Prepare arguments and function for call to wait for device runtime call // std::vector<Value *> Args; // TODO: add the device type as argument? 
// FunctionCallee RTF = - // M.getOrInsertFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus", - // runtimeModule->getFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()); + // M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", + // runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()); // CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); // } // } @@ -1684,9 +1684,9 @@ void CGT_X86::codeGen(DFInternalNode *N) { // Now, make the node cpu gen func to be this one // Remove all other versions and update the tag - N->addGenFunc(F_wrapper, hpvm::CPU_TARGET, true); - N->removeGenFuncForTarget(hpvm::GPU_TARGET); - N->setTag(hpvm::CPU_TARGET); + N->addGenFunc(F_wrapper, visc::CPU_TARGET, true); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->setTag(visc::CPU_TARGET); // assert(false && "got to the point where we have to combine\n"); } @@ -1715,7 +1715,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { // if(N->getGenFunc() != NULL) // return; - if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) { + if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << " : skipping it\n"); @@ -1723,10 +1723,10 @@ void CGT_X86::codeGen(DFLeafNode *N) { << N->getFuncPointer()->getName() << "\n"); switch (N->getTag()) { - case hpvm::GPU_TARGET: + case visc::GPU_TARGET: // A leaf node should not have an x86 function for GPU // by design of DFG2LLVM_NVPTX backend - assert(!(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); break; default: break; @@ -1735,7 +1735,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { return; } - assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL && + assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && "Error: Visiting a node for which code already generated\n"); std::vector<IntrinsicInst *> IItoRemove; @@ -1759,8 
+1759,8 @@ void CGT_X86::codeGen(DFLeafNode *N) { F_X86 = addIdxDimArgs(F_X86); // Add generated function info to DFNode - // N->setGenFunc(F_X86, hpvm::CPU_TARGET); - N->addGenFunc(F_X86, hpvm::CPU_TARGET, true); + // N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::CPU_TARGET, true); // Go through the arguments, and any pointer arguments with in attribute need // to have x86_argument_ptr call to get the x86 ptr of the argument @@ -1768,7 +1768,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { // Create new BB BasicBlock *EntryBB = &*F_X86->begin(); BasicBlock *BB = - BasicBlock::Create(M.getContext(), "getHPVMPtrArgs", F_X86, EntryBB); + BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB); BranchInst *Terminator = BranchInst::Create(EntryBB, BB); // Insert calls for (Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); @@ -1776,7 +1776,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { if (F_X86->getAttributes().hasAttribute(ai->getArgNo() + 1, Attribute::In)) { assert(ai->getType()->isPointerTy() && - "Only pointer arguments can have hpvm in/out attributes "); + "Only pointer arguments can have visc in/out attributes "); Function::arg_iterator aiNext = ai; ++aiNext; Argument *size = &*aiNext; @@ -1786,7 +1786,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { &*ai, Type::getInt8PtrTy(M.getContext()), ai->getName() + ".i8ptr", Terminator); Value *ArgPtrCallArgs[] = {BI, size}; - CallInst::Create(llvm_hpvm_x86_argument_ptr, + CallInst::Create(llvm_visc_x86_argument_ptr, ArrayRef<Value *>(ArgPtrCallArgs, 2), "", Terminator); } } @@ -1796,30 +1796,30 @@ void CGT_X86::codeGen(DFLeafNode *N) { for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) { Instruction *I = &(*i); DEBUG(errs() << *I << "\n"); - // Leaf nodes should not contain HPVM graph intrinsics or launch - assert(!BuildDFG::isHPVMLaunchIntrinsic(I) && + // Leaf nodes should not contain VISC graph intrinsics or launch + 
assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); - assert(!BuildDFG::isHPVMGraphIntrinsic(I) && - "HPVM graph intrinsic within a leaf dataflow node!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && + "VISC graph intrinsic within a leaf dataflow node!"); - if (BuildDFG::isHPVMQueryIntrinsic(I)) { + if (BuildDFG::isViscQueryIntrinsic(I)) { IntrinsicInst *II = cast<IntrinsicInst>(I); IntrinsicInst *ArgII; DFNode *ArgDFNode; /*********************************************************************** - * Handle HPVM Query intrinsics * + * Handle VISC Query intrinsics * ***********************************************************************/ switch (II->getIntrinsicID()) { - /**************************** llvm.hpvm.getNode() *******************/ - case Intrinsic::hpvm_getNode: { + /**************************** llvm.visc.getNode() *******************/ + case Intrinsic::visc_getNode: { // add mapping <intrinsic, this node> to the node-specific map Leaf_HandleToDFNodeMap[II] = N; IItoRemove.push_back(II); break; } - /************************* llvm.hpvm.getParentNode() ****************/ - case Intrinsic::hpvm_getParentNode: { + /************************* llvm.visc.getParentNode() ****************/ + case Intrinsic::visc_getParentNode: { // get the parent node of the arg node // get argument node ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); @@ -1832,8 +1832,8 @@ void CGT_X86::codeGen(DFLeafNode *N) { IItoRemove.push_back(II); break; } - /*************************** llvm.hpvm.getNumDims() *****************/ - case Intrinsic::hpvm_getNumDims: { + /*************************** llvm.visc.getNumDims() *****************/ + case Intrinsic::visc_getNumDims: { // get node from map // get the appropriate field ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); @@ -1846,10 +1846,10 @@ void CGT_X86::codeGen(DFLeafNode *N) { IItoRemove.push_back(II); break; } - /*********************** 
llvm.hpvm.getNodeInstanceID() **************/ - case Intrinsic::hpvm_getNodeInstanceID_x: - case Intrinsic::hpvm_getNodeInstanceID_y: - case Intrinsic::hpvm_getNodeInstanceID_z: { + /*********************** llvm.visc.getNodeInstanceID() **************/ + case Intrinsic::visc_getNodeInstanceID_x: + case Intrinsic::visc_getNodeInstanceID_y: + case Intrinsic::visc_getNodeInstanceID_z: { ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; @@ -1864,7 +1864,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { // (dim = 1) => y // (dim = 2) => z int dim = - (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x); + (int)(II->getIntrinsicID() - Intrinsic::visc_getNodeInstanceID_x); assert((dim >= 0) && (dim < 3) && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic " "ID!"); @@ -1894,7 +1894,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { Value *args[] = { ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)}; - CallInst *CI = CallInst::Create(llvm_hpvm_x86_getDimInstance, + CallInst *CI = CallInst::Create(llvm_visc_x86_getDimInstance, ArrayRef<Value *>(args, 2), "nodeInstanceID", II); DEBUG(errs() << *II << " replaced with " << *CI << "\n"); @@ -1903,10 +1903,10 @@ void CGT_X86::codeGen(DFLeafNode *N) { } break; } - /********************** llvm.hpvm.getNumNodeInstances() *************/ - case Intrinsic::hpvm_getNumNodeInstances_x: - case Intrinsic::hpvm_getNumNodeInstances_y: - case Intrinsic::hpvm_getNumNodeInstances_z: { + /********************** llvm.visc.getNumNodeInstances() *************/ + case Intrinsic::visc_getNumNodeInstances_x: + case Intrinsic::visc_getNumNodeInstances_y: + case Intrinsic::visc_getNumNodeInstances_z: { ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; @@ -1922,7 +1922,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { // (dim = 1) => y // (dim = 
2) => z int dim = - (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x); + (int)(II->getIntrinsicID() - Intrinsic::visc_getNumNodeInstances_x); assert((dim >= 0) && (dim < 3) && "Invalid dimension for getNumNodeInstances_[xyz]. Check " "Intrinsic ID!"); @@ -1952,7 +1952,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { Value *args[] = { ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)}; - CallInst *CI = CallInst::Create(llvm_hpvm_x86_getDimLimit, + CallInst *CI = CallInst::Create(llvm_visc_x86_getDimLimit, ArrayRef<Value *>(args, 2), "numNodeInstances", II); DEBUG(errs() << *II << " replaced with " << *CI << "\n"); @@ -1965,7 +1965,7 @@ void CGT_X86::codeGen(DFLeafNode *N) { default: DEBUG(errs() << "Found unknown intrinsic with ID = " << II->getIntrinsicID() << "\n"); - assert(false && "Unknown HPVM Intrinsic!"); + assert(false && "Unknown VISC Intrinsic!"); break; } diff --git a/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp b/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp deleted file mode 100644 index 738b39905b..0000000000 --- a/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp +++ /dev/null @@ -1,894 +0,0 @@ -//=== GenHPVM.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "genhpvm" -#include "GenHPVM/GenHPVM.h" - -#include "SupportHPVM/HPVMHint.h" -#include "SupportHPVM/HPVMUtils.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/CallSite.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" - -#define TIMER(X) \ - do { \ - if (HPVMTimer) { \ - X; \ - } \ - } while (0) - -using namespace llvm; -using namespace hpvmUtils; - -// HPVM Command line option to use timer or not -static cl::opt<bool> HPVMTimer("hpvm-timers-gen", - cl::desc("Enable GenHPVM timer")); - -namespace genhpvm { - -// Helper Functions - -static inline ConstantInt *getTimerID(Module &, enum hpvm_TimerID); -static Function *transformReturnTypeToStruct(Function *F); -static Type *getReturnTypeFromReturnInst(Function *F); - -// Check if the dummy function call is a __hpvm__node call -#define IS_HPVM_CALL(callName) \ - static bool isHPVMCall_##callName(Instruction *I) { \ - if (!isa<CallInst>(I)) \ - return false; \ - CallInst *CI = cast<CallInst>(I); \ - return (CI->getCalledValue()->stripPointerCasts()->getName()) \ - .equals("__hpvm__" #callName); \ - } - -static void ReplaceCallWithIntrinsic(Instruction *I, Intrinsic::ID IntrinsicID, - std::vector<Instruction *> *Erase) { - // Check if the instruction is Call Instruction - assert(isa<CallInst>(I) && "Expecting CallInst"); - CallInst *CI = cast<CallInst>(I); - DEBUG(errs() << "Found call: " << *CI << "\n"); - - // Find the correct intrinsic call - Module *M = CI->getParent()->getParent()->getParent(); - Function *F; - std::vector<Type *> ArgTypes; - std::vector<Value *> 
args; - if (Intrinsic::isOverloaded(IntrinsicID)) { - // This is an overloaded intrinsic. The types must exactly match. Get the - // argument types - for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { - ArgTypes.push_back(CI->getArgOperand(i)->getType()); - args.push_back(CI->getArgOperand(i)); - } - F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes); - DEBUG(errs() << *F << "\n"); - } else { // Non-overloaded intrinsic - F = Intrinsic::getDeclaration(M, IntrinsicID); - FunctionType *FTy = F->getFunctionType(); - DEBUG(errs() << *F << "\n"); - - // Create argument list - assert(CI->getNumArgOperands() == FTy->getNumParams() && - "Number of arguments of call do not match with Intrinsic"); - for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { - Value *V = CI->getArgOperand(i); - // Either the type should match or both should be of pointer type - assert((V->getType() == FTy->getParamType(i) || - (V->getType()->isPointerTy() && - FTy->getParamType(i)->isPointerTy())) && - "Dummy function call argument does not match with Intrinsic " - "argument!"); - // If the types do not match, then both must be pointer type and pointer - // cast needs to be performed - if (V->getType() != FTy->getParamType(i)) { - V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); - } - args.push_back(V); - } - } - // Insert call instruction - CallInst *Inst = CallInst::Create( - F, args, F->getReturnType()->isVoidTy() ? 
"" : CI->getName(), CI); - - DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); - - CI->replaceAllUsesWith(Inst); - // If the previous instruction needs to be erased, insert it in the vector - // Erased - if (Erase != NULL) - Erase->push_back(CI); -} - -IS_HPVM_CALL(launch) /* Exists but not required */ -IS_HPVM_CALL(edge) /* Exists but not required */ -IS_HPVM_CALL(createNodeND) -// IS_HPVM_CALL(createNode) -// IS_HPVM_CALL(createNode1D) -// IS_HPVM_CALL(createNode2D) -// IS_HPVM_CALL(createNode3D) -IS_HPVM_CALL(bindIn) -IS_HPVM_CALL(bindOut) -IS_HPVM_CALL(push) -IS_HPVM_CALL(pop) -IS_HPVM_CALL(getNode) -IS_HPVM_CALL(getParentNode) -IS_HPVM_CALL(barrier) -IS_HPVM_CALL(malloc) -IS_HPVM_CALL(return ) -IS_HPVM_CALL(getNodeInstanceID_x) -IS_HPVM_CALL(getNodeInstanceID_y) -IS_HPVM_CALL(getNodeInstanceID_z) -IS_HPVM_CALL(getNumNodeInstances_x) -IS_HPVM_CALL(getNumNodeInstances_y) -IS_HPVM_CALL(getNumNodeInstances_z) -// Atomics -IS_HPVM_CALL(atomic_cmpxchg) -IS_HPVM_CALL(atomic_add) -IS_HPVM_CALL(atomic_sub) -IS_HPVM_CALL(atomic_xchg) -IS_HPVM_CALL(atomic_inc) -IS_HPVM_CALL(atomic_dec) -IS_HPVM_CALL(atomic_min) -IS_HPVM_CALL(atomic_max) -IS_HPVM_CALL(atomic_umin) -IS_HPVM_CALL(atomic_umax) -IS_HPVM_CALL(atomic_and) -IS_HPVM_CALL(atomic_or) -IS_HPVM_CALL(atomic_xor) -// Misc Fn -IS_HPVM_CALL(floor) -IS_HPVM_CALL(rsqrt) -IS_HPVM_CALL(sqrt) -IS_HPVM_CALL(sin) -IS_HPVM_CALL(cos) - -IS_HPVM_CALL(init) -IS_HPVM_CALL(cleanup) -IS_HPVM_CALL(wait) -IS_HPVM_CALL(trackMemory) -IS_HPVM_CALL(untrackMemory) -IS_HPVM_CALL(requestMemory) -IS_HPVM_CALL(attributes) -IS_HPVM_CALL(hint) - -// Return the constant integer represented by value V -static unsigned getNumericValue(Value *V) { - assert( - isa<ConstantInt>(V) && - "Value indicating the number of arguments should be a constant integer"); - return cast<ConstantInt>(V)->getZExtValue(); -} - -// Take the __hpvm__return instruction and generate code for combining the -// values being returned into a struct and returning it. 
-// The first operand is the number of returned values -static Value *genCodeForReturn(CallInst *CI) { - LLVMContext &Ctx = CI->getContext(); - assert(isHPVMCall_return(CI) && "__hpvm__return instruction expected!"); - - // Parse the dummy function call here - assert(CI->getNumArgOperands() > 0 && - "Too few arguments for __hpvm_return call!\n"); - unsigned numRetVals = getNumericValue(CI->getArgOperand(0)); - - assert(CI->getNumArgOperands() - 1 == numRetVals && - "Too few arguments for __hpvm_return call!\n"); - DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n"); - - std::vector<Type *> ArgTypes; - for (unsigned i = 1; i < CI->getNumArgOperands(); i++) { - ArgTypes.push_back(CI->getArgOperand(i)->getType()); - } - Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName(); - StructType *RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true); - - InsertValueInst *IV = InsertValueInst::Create( - UndefValue::get(RetTy), CI->getArgOperand(1), 0, "returnStruct", CI); - DEBUG(errs() << "Code generation for return:\n"); - DEBUG(errs() << *IV << "\n"); - - for (unsigned i = 2; i < CI->getNumArgOperands(); i++) { - IV = InsertValueInst::Create(IV, CI->getArgOperand(i), i - 1, IV->getName(), - CI); - DEBUG(errs() << *IV << "\n"); - } - - return IV; -} - -// Analyse the attribute call for this function. Add the in and out -// attributes to pointer parameters. 
-static void handleHPVMAttributes(Function *F, CallInst *CI) { - DEBUG(errs() << "Kernel before adding In/Out HPVM attributes:\n" - << *F << "\n"); - // Parse the dummy function call here - unsigned offset = 0; - // Find number of In pointers - assert(CI->getNumArgOperands() > offset && - "Too few arguments for __hpvm__attributes call!"); - unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset)); - DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n"); - - for (unsigned i = offset + 1; i < offset + 1 + numInPtrs; i++) { - Value *V = CI->getArgOperand(i); - if (Argument *arg = dyn_cast<Argument>(V)) { - F->addAttribute(1 + arg->getArgNo(), Attribute::In); - } else { - DEBUG(errs() << "Invalid argument to __hpvm__attribute: " << *V << "\n"); - llvm_unreachable( - "Only pointer arguments can be passed to __hpvm__attributes call"); - } - } - // Find number of Out Pointers - offset += 1 + numInPtrs; - assert(CI->getNumArgOperands() > offset && - "Too few arguments for __hpvm__attributes call!"); - unsigned numOutPtrs = getNumericValue(CI->getOperand(offset)); - DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n"); - for (unsigned i = offset + 1; i < offset + 1 + numOutPtrs; i++) { - Value *V = CI->getArgOperand(i); - if (Argument *arg = dyn_cast<Argument>(V)) { - F->addAttribute(1 + arg->getArgNo(), Attribute::Out); - } else { - DEBUG(errs() << "Invalid argument to __hpvm__attribute: " << *V << "\n"); - llvm_unreachable( - "Only pointer arguments can be passed to __hpvm__attributes call"); - } - } - DEBUG(errs() << "Kernel after adding In/Out HPVM attributes:\n" - << *F << "\n"); -} - -// Public Functions of GenHPVM pass -bool GenHPVM::runOnModule(Module &M) { - DEBUG(errs() << "\nGENHPVM PASS\n"); - this->M = &M; - - // Load Runtime API Module - SMDiagnostic Err; - - char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); - assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); - - Twine llvmSrcRoot = LLVM_SRC_ROOT; - Twine 
runtimeAPI = - llvmSrcRoot + "/../build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc"; - DEBUG(errs() << llvmSrcRoot << "\n"); - - std::unique_ptr<Module> runtimeModule = - parseIRFile(runtimeAPI.str(), Err, M.getContext()); - - if (runtimeModule == NULL) { - DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n"); - assert(false && "couldn't parse runtime"); - } else - DEBUG(errs() << "Successfully loaded hpvm-rt API module\n"); - - llvm_hpvm_initializeTimerSet = M.getOrInsertFunction( - "llvm_hpvm_initializeTimerSet", - runtimeModule->getFunction("llvm_hpvm_initializeTimerSet") - ->getFunctionType()); - // DEBUG(errs() << *llvm_hpvm_initializeTimerSet); - - llvm_hpvm_switchToTimer = M.getOrInsertFunction( - "llvm_hpvm_switchToTimer", - runtimeModule->getFunction("llvm_hpvm_switchToTimer")->getFunctionType()); - // DEBUG(errs() << *llvm_hpvm_switchToTimer); - - llvm_hpvm_printTimerSet = M.getOrInsertFunction( - "llvm_hpvm_printTimerSet", - runtimeModule->getFunction("llvm_hpvm_printTimerSet")->getFunctionType()); - // DEBUG(errs() << *llvm_hpvm_printTimerSet); - - // Insert init context in main - DEBUG(errs() << "Locate __hpvm__init()\n"); - Function *VI = M.getFunction("__hpvm__init"); - assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once"); - Instruction *I = cast<Instruction>(*VI->user_begin()); - - DEBUG(errs() << "Initialize Timer Set\n"); - initializeTimerSet(I); - switchToTimer(hpvm_TimerID_NONE, I); - - // Insert print instruction at hpvm exit - DEBUG(errs() << "Locate __hpvm__cleanup()\n"); - Function *VC = M.getFunction("__hpvm__cleanup"); - assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once"); - I = cast<Instruction>(*VC->user_begin()); - printTimerSet(I); - - DEBUG(errs() << "-------- Searching for launch sites ----------\n"); - - std::vector<Instruction *> toBeErased; - std::vector<Function *> functions; - - for (auto &F : M) - functions.push_back(&F); - - // Iterate over all functions in the module - for 
(Function *f : functions) { - DEBUG(errs() << "Function: " << f->getName() << "\n"); - - // List with the required additions in the function's return type - std::vector<Type *> FRetTypes; - - enum mutateTypeCause { - mtc_None, - mtc_BIND, - mtc_RETURN, - mtc_NUM_CAUSES - } bind; - bind = mutateTypeCause::mtc_None; - - // Iterate over all the instructions in this function - for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e; ++i) { - Instruction *I = &*i; // Grab pointer to Instruction - // If not a call instruction, move to next instruction - if (!isa<CallInst>(I)) - continue; - - CallInst *CI = cast<CallInst>(I); - LLVMContext &Ctx = CI->getContext(); - - if (isHPVMCall_init(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_init, &toBeErased); - } - if (isHPVMCall_cleanup(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_cleanup, &toBeErased); - } - if (isHPVMCall_wait(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_wait, &toBeErased); - } - if (isHPVMCall_trackMemory(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_trackMemory, &toBeErased); - } - if (isHPVMCall_untrackMemory(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_untrackMemory, &toBeErased); - } - if (isHPVMCall_requestMemory(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_requestMemory, &toBeErased); - } - if (isHPVMCall_hint(I)) { - assert(isa<ConstantInt>(CI->getArgOperand(0)) && - "Argument to hint must be constant integer!"); - ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(0)); - - hpvm::Target t = (hpvm::Target)hint->getZExtValue(); - addHint(CI->getParent()->getParent(), t); - DEBUG(errs() << "Found hpvm hint call: " << *CI << "\n"); - toBeErased.push_back(CI); - } - if (isHPVMCall_launch(I)) { - Function *LaunchF = - Intrinsic::getDeclaration(&M, Intrinsic::hpvm_launch); - DEBUG(errs() << *LaunchF << "\n"); - // Get i8* cast to function pointer - Function *graphFunc = cast<Function>(CI->getArgOperand(1)); - graphFunc = 
transformReturnTypeToStruct(graphFunc); - Constant *F = - ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); - assert( - F && - "Function invoked by HPVM launch has to be define and constant."); - - ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(0)); - assert(Op && "HPVM launch's streaming argument is a constant value."); - Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - - auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType()); - assert(ArgTy && "HPVM launch argument should be pointer type."); - Value *Arg = CI->getArgOperand(2); - if (!ArgTy->getElementType()->isIntegerTy(8)) - Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), - Type::getInt8PtrTy(Ctx), "", CI); - Value *LaunchArgs[] = {F, Arg, isStreaming}; - CallInst *LaunchInst = CallInst::Create( - LaunchF, ArrayRef<Value *>(LaunchArgs, 3), "graphID", CI); - DEBUG(errs() << "Found hpvm launch call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n"); - CI->replaceAllUsesWith(LaunchInst); - toBeErased.push_back(CI); - } - if (isHPVMCall_push(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_push, &toBeErased); - } - if (isHPVMCall_pop(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_pop, &toBeErased); - } - if (isHPVMCall_createNodeND(I)) { - assert(CI->getNumArgOperands() > 0 && - "Too few arguments for __hpvm__createNodeND call"); - unsigned numDims = getNumericValue(CI->getArgOperand(0)); - // We need as meny dimension argments are there are dimensions - assert(CI->getNumArgOperands() - 2 == numDims && - "Too few arguments for __hpvm_createNodeND call!\n"); - - Function *CreateNodeF; - switch (numDims) { - case 0: - CreateNodeF = - Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode); - break; - case 1: - CreateNodeF = - Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode1D); - break; - case 2: - CreateNodeF = - Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode2D); 
- break; - case 3: - CreateNodeF = - Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode3D); - break; - default: - llvm_unreachable("Unsupported number of dimensions\n"); - break; - } - DEBUG(errs() << *CreateNodeF << "\n"); - DEBUG(errs() << *I << "\n"); - DEBUG(errs() << "in " << I->getParent()->getParent()->getName() - << "\n"); - - // Get i8* cast to function pointer - Function *graphFunc = cast<Function>(CI->getArgOperand(1)); - graphFunc = transformReturnTypeToStruct(graphFunc); - Constant *F = - ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); - - CallInst *CreateNodeInst; - switch (numDims) { - case 0: - CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(F), - graphFunc->getName() + ".node", CI); - break; - case 1: { - assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 2, expected to be i64\n"); - Value *CreateNodeArgs[] = {F, CI->getArgOperand(2)}; - CreateNodeInst = CallInst::Create( - CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 2), - graphFunc->getName() + ".node", CI); - } break; - case 2: { - assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 2, expected to be i64\n"); - assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 3, expected to be i64\n"); - Value *CreateNodeArgs[] = {F, CI->getArgOperand(2), - CI->getArgOperand(3)}; - CreateNodeInst = CallInst::Create( - CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3), - graphFunc->getName() + ".node", CI); - } break; - case 3: { - assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 2, expected to be i64\n"); - assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 3, expected to be i64\n"); - assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) && - "CreateNodeND dimension argument, 4, expected 
to be i64\n"); - Value *CreateNodeArgs[] = {F, CI->getArgOperand(2), - CI->getArgOperand(3), - CI->getArgOperand(4)}; - CreateNodeInst = CallInst::Create( - CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4), - graphFunc->getName() + ".node", CI); - } break; - default: - llvm_unreachable( - "Impossible path: number of dimensions is 0, 1, 2, 3\n"); - break; - } - - DEBUG(errs() << "Found hpvm createNode call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n"); - CI->replaceAllUsesWith(CreateNodeInst); - toBeErased.push_back(CI); - } - - if (isHPVMCall_edge(I)) { - Function *EdgeF = - Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createEdge); - DEBUG(errs() << *EdgeF << "\n"); - ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(5)); - ConstantInt *EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2)); - assert(Op && EdgeTypeOp && - "Arguments of CreateEdge are not constant integers."); - Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value *isAllToAll = EdgeTypeOp->isZero() ? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value *EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - isAllToAll, CI->getArgOperand(3), - CI->getArgOperand(4), isStreaming}; - CallInst *EdgeInst = CallInst::Create( - EdgeF, ArrayRef<Value *>(EdgeArgs, 6), "output", CI); - DEBUG(errs() << "Found hpvm edge call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n"); - CI->replaceAllUsesWith(EdgeInst); - toBeErased.push_back(CI); - } - if (isHPVMCall_bindIn(I)) { - Function *BindInF = - Intrinsic::getDeclaration(&M, Intrinsic::hpvm_bind_input); - DEBUG(errs() << *BindInF << "\n"); - // Check if this is a streaming bind or not - ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3)); - assert(Op && "Streaming argument for bind in intrinsic should be a " - "constant integer."); - Value *isStreaming = Op->isZero() ? 
ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value *BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), isStreaming}; - CallInst *BindInInst = - CallInst::Create(BindInF, ArrayRef<Value *>(BindInArgs, 4), "", CI); - DEBUG(errs() << "Found hpvm bindIn call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n"); - CI->replaceAllUsesWith(BindInInst); - toBeErased.push_back(CI); - } - if (isHPVMCall_bindOut(I)) { - Function *BindOutF = - Intrinsic::getDeclaration(&M, Intrinsic::hpvm_bind_output); - DEBUG(errs() << *BindOutF << "\n"); - // Check if this is a streaming bind or not - ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3)); - assert(Op && "Streaming argument for bind out intrinsic should be a " - "constant integer."); - Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value *BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), isStreaming}; - CallInst *BindOutInst = CallInst::Create( - BindOutF, ArrayRef<Value *>(BindOutArgs, 4), "", CI); - DEBUG(errs() << "Found hpvm bindOut call: " << *CI << "\n"); - DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n"); - - DEBUG(errs() << "Fixing the return type of the function\n"); - // FIXME: What if the child node function has not been visited already. - // i.e., it's return type has not been fixed. 
- Function *F = I->getParent()->getParent(); - DEBUG(errs() << F->getName() << "\n";); - IntrinsicInst *NodeIntrinsic = - cast<IntrinsicInst>(CI->getArgOperand(0)); - assert(NodeIntrinsic && - "Instruction value in bind out is not a create node intrinsic."); - DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n"); - assert( - (NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode || - NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode1D || - NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode2D || - NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode3D) && - "Instruction value in bind out is not a create node intrinsic."); - Function *ChildF = cast<Function>( - NodeIntrinsic->getArgOperand(0)->stripPointerCasts()); - DEBUG(errs() << ChildF->getName() << "\n";); - int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue(); - int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue(); - StructType *ChildReturnTy = cast<StructType>(ChildF->getReturnType()); - - Type *ReturnType = F->getReturnType(); - DEBUG(errs() << *ReturnType << "\n";); - assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) && - "Return type should either be a struct or void type!"); - - FRetTypes.insert(FRetTypes.begin() + destpos, - ChildReturnTy->getElementType(srcpos)); - assert(((bind == mutateTypeCause::mtc_BIND) || - (bind == mutateTypeCause::mtc_None)) && - "Both bind_out and hpvm_return detected"); - bind = mutateTypeCause::mtc_BIND; - - CI->replaceAllUsesWith(BindOutInst); - toBeErased.push_back(CI); - } - if (isHPVMCall_attributes(I)) { - Function *F = CI->getParent()->getParent(); - handleHPVMAttributes(F, CI); - toBeErased.push_back(CI); - } - if (isHPVMCall_getNode(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNode, &toBeErased); - } - if (isHPVMCall_getParentNode(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getParentNode, &toBeErased); - } - if (isHPVMCall_barrier(I)) { - 
ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_barrier, &toBeErased); - } - if (isHPVMCall_malloc(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_malloc, &toBeErased); - } - if (isHPVMCall_return(I)) { - DEBUG(errs() << "Function before hpvm return processing\n" - << *I->getParent()->getParent() << "\n"); - // The operands to this call are the values to be returned by the node - Value *ReturnVal = genCodeForReturn(CI); - DEBUG(errs() << *ReturnVal << "\n"); - Type *ReturnType = ReturnVal->getType(); - assert(isa<StructType>(ReturnType) && - "Return type should be a struct type!"); - - assert(((bind == mutateTypeCause::mtc_RETURN) || - (bind == mutateTypeCause::mtc_None)) && - "Both bind_out and hpvm_return detected"); - - if (bind == mutateTypeCause::mtc_None) { - // If this is None, this is the first __hpvm__return - // instruction we have come upon. Place the return type of the - // function in the return type vector - bind = mutateTypeCause::mtc_RETURN; - StructType *ReturnStructTy = cast<StructType>(ReturnType); - for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++) - FRetTypes.push_back(ReturnStructTy->getElementType(i)); - } else { // bind == mutateTypeCause::mtc_RETURN - // This is not the first __hpvm__return - // instruction we have come upon. 
- // Check that the return types are the same - assert((ReturnType == FRetTypes[0]) && - "Multiple returns with mismatching types"); - } - - ReturnInst *RetInst = ReturnInst::Create(Ctx, ReturnVal); - DEBUG(errs() << "Found hpvm return call: " << *CI << "\n"); - Instruction *oldReturn = CI->getParent()->getTerminator(); - assert(isa<ReturnInst>(oldReturn) && - "Expecting a return to be the terminator of this BB!"); - DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n"); - DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n"); - // CI->replaceAllUsesWith(RetInst); - toBeErased.push_back(CI); - ReplaceInstWithInst(oldReturn, RetInst); - DEBUG(errs() << "Function after hpvm return processing\n" - << *I->getParent()->getParent() << "\n"); - } - - if (isHPVMCall_getNodeInstanceID_x(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_x, - &toBeErased); - } - if (isHPVMCall_getNodeInstanceID_y(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_y, - &toBeErased); - } - if (isHPVMCall_getNodeInstanceID_z(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_z, - &toBeErased); - } - if (isHPVMCall_getNumNodeInstances_x(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_x, - &toBeErased); - } - if (isHPVMCall_getNumNodeInstances_y(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_y, - &toBeErased); - } - if (isHPVMCall_getNumNodeInstances_z(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_z, - &toBeErased); - } - if (isHPVMCall_atomic_add(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_add, &toBeErased); - } - if (isHPVMCall_atomic_sub(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_sub, &toBeErased); - } - if (isHPVMCall_atomic_xchg(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_xchg, &toBeErased); - } - if (isHPVMCall_atomic_min(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_min, 
&toBeErased); - } - if (isHPVMCall_atomic_max(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_max, &toBeErased); - } - if (isHPVMCall_atomic_and(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_and, &toBeErased); - } - if (isHPVMCall_atomic_or(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_or, &toBeErased); - } - if (isHPVMCall_atomic_xor(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_xor, &toBeErased); - } - if (isHPVMCall_sin(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased); - } - if (isHPVMCall_cos(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased); - } - } - - // Erase the __hpvm__node calls - DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n"); - for (auto I : toBeErased) { - DEBUG(errs() << *I << "\n"); - } - while (!toBeErased.empty()) { - Instruction *I = toBeErased.back(); - DEBUG(errs() << "\tErasing " << *I << "\n"); - I->eraseFromParent(); - toBeErased.pop_back(); - } - - if (bind == mutateTypeCause::mtc_BIND || - bind == mutateTypeCause::mtc_RETURN) { - DEBUG(errs() << "Function before fixing return type\n" << *f << "\n"); - // Argument type list. - std::vector<Type *> FArgTypes; - for (Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end(); - ai != ae; ++ai) { - FArgTypes.push_back(ai->getType()); - } - - // Find new return type of function - Type *NewReturnTy; - if (bind == mutateTypeCause::mtc_BIND) { - - std::vector<Type *> TyList; - for (unsigned i = 0; i < FRetTypes.size(); i++) - TyList.push_back(FRetTypes[i]); - - NewReturnTy = - StructType::create(f->getContext(), TyList, - Twine("struct.out." 
+ f->getName()).str(), true); - } else { - NewReturnTy = getReturnTypeFromReturnInst(f); - assert(NewReturnTy->isStructTy() && "Expecting a struct type!"); - } - - FunctionType *FTy = - FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg()); - - // Change the function type - Function *newF = cloneFunction(f, FTy, false); - DEBUG(errs() << *newF << "\n"); - - if (bind == mutateTypeCause::mtc_BIND) { - // This is certainly an internal node, and hence just one BB with one - // return terminator instruction. Change return statement - ReturnInst *RI = - cast<ReturnInst>(newF->getEntryBlock().getTerminator()); - ReturnInst *newRI = ReturnInst::Create(newF->getContext(), - UndefValue::get(NewReturnTy)); - ReplaceInstWithInst(RI, newRI); - } - if (bind == mutateTypeCause::mtc_RETURN) { - // Nothing - } - replaceNodeFunctionInIR(*f->getParent(), f, newF); - DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n"); - } - } - return false; // TODO: What does returning "false" mean? -} - -// Generate Code for declaring a constant string [L x i8] and return a pointer -// to the start of it. 
-Value *GenHPVM::getStringPointer(const Twine &S, Instruction *IB, - const Twine &Name) { - Constant *SConstant = - ConstantDataArray::getString(M->getContext(), S.str(), true); - Value *SGlobal = - new GlobalVariable(*M, SConstant->getType(), true, - GlobalValue::InternalLinkage, SConstant, Name); - Value *Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0); - Value *GEPArgs[] = {Zero, Zero}; - GetElementPtrInst *SPtr = GetElementPtrInst::Create( - nullptr, SGlobal, ArrayRef<Value *>(GEPArgs, 2), Name + "Ptr", IB); - return SPtr; -} - -void GenHPVM::initializeTimerSet(Instruction *InsertBefore) { - Value *TimerSetAddr; - StoreInst *SI; - TIMER(TimerSet = new GlobalVariable( - *M, Type::getInt8PtrTy(M->getContext()), false, - GlobalValue::CommonLinkage, - Constant::getNullValue(Type::getInt8PtrTy(M->getContext())), - "hpvmTimerSet_GenHPVM")); - DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet - << "\n"); - // DEBUG(errs() << "Inserting call to: " << *llvm_hpvm_initializeTimerSet << - // "\n"); - - TIMER(TimerSetAddr = CallInst::Create(llvm_hpvm_initializeTimerSet, None, "", - InsertBefore)); - DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n"); - TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore)); - DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n"); -} - -void GenHPVM::switchToTimer(enum hpvm_TimerID timer, - Instruction *InsertBefore) { - Value *switchArgs[] = {TimerSet, getTimerID(*M, timer)}; - TIMER(CallInst::Create(llvm_hpvm_switchToTimer, - ArrayRef<Value *>(switchArgs, 2), "", InsertBefore)); -} - -void GenHPVM::printTimerSet(Instruction *InsertBefore) { - Value *TimerName; - TIMER(TimerName = getStringPointer("GenHPVM_Timer", InsertBefore)); - Value *printArgs[] = {TimerSet, TimerName}; - TIMER(CallInst::Create(llvm_hpvm_printTimerSet, - ArrayRef<Value *>(printArgs, 2), "", InsertBefore)); -} - -static inline ConstantInt *getTimerID(Module &M, enum hpvm_TimerID timer) { - 
return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer); -} - -static Function *transformReturnTypeToStruct(Function *F) { - // Currently only works for void return types - DEBUG(errs() << "Transforming return type of function to Struct: " - << F->getName() << "\n"); - - if (isa<StructType>(F->getReturnType())) { - DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " - << *F->getReturnType() << "\n"); - return F; - } - - assert(F->getReturnType()->isVoidTy() && - "Unhandled case - Only void return type handled\n"); - - // Create the argument type list with added argument types - std::vector<Type *> ArgTypes; - for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - ArgTypes.push_back(ai->getType()); - } - - StructType *RetTy = - StructType::create(F->getContext(), None, "emptyStruct", true); - FunctionType *FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg()); - - SmallVector<ReturnInst *, 8> Returns; - Function *newF = cloneFunction(F, FTy, false, &Returns); - // Replace ret void instruction with ret %RetTy undef - for (auto &RI : Returns) { - DEBUG(errs() << "Found return inst: " << *RI << "\n"); - ReturnInst *newRI = - ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy)); - ReplaceInstWithInst(RI, newRI); - } - - replaceNodeFunctionInIR(*F->getParent(), F, newF); - return newF; -} - -static Type *getReturnTypeFromReturnInst(Function *F) { - for (BasicBlock &BB : *F) { - if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) { - DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() - << "\n"); - return RI->getReturnValue()->getType(); - } - } -} - -char genhpvm::GenHPVM::ID = 0; -static RegisterPass<genhpvm::GenHPVM> - X("genhpvm", - "Pass to generate HPVM IR from LLVM IR (with dummy function calls)", - false, false); - -} // End of namespace genhpvm diff --git a/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt b/hpvm/lib/Transforms/GenVISC/CMakeLists.txt 
similarity index 74% rename from hpvm/lib/Transforms/GenHPVM/CMakeLists.txt rename to hpvm/lib/Transforms/GenVISC/CMakeLists.txt index 967766e705..ed087f63b4 100644 --- a/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt +++ b/hpvm/lib/Transforms/GenVISC/CMakeLists.txt @@ -2,9 +2,9 @@ if(WIN32 OR CYGWIN) set(LLVM_LINK_COMPONENTS Core Support) endif() -add_llvm_library( LLVMGenHPVM +add_llvm_library( LLVMGenVISC MODULE - GenHPVM.cpp + GenVISC.cpp DEPENDS intrinsics_gen diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp new file mode 100644 index 0000000000..cc50541539 --- /dev/null +++ b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp @@ -0,0 +1,866 @@ +//=== GenVISC.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "genvisc" +#include "GenVISC/GenVISC.h" + +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/IR/DerivedTypes.h" +#include "SupportVISC/VISCHint.h" +#include "SupportVISC/VISCUtils.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "SupportVISC/VISCUtils.h" + + +#define TIMER(X) do { if (VISCTimer) { X; } } while (0) + +using namespace llvm; +using namespace viscUtils; + + +// VISC Command line option to use timer or not +static cl::opt<bool> +VISCTimer("visc-timers-gen", cl::desc("Enable GenVISC timer")); + +namespace genvisc { + +// Helper Functions + +static inline ConstantInt* getTimerID(Module&, 
enum visc_TimerID); +static Function* transformReturnTypeToStruct(Function* F); +static Type* getReturnTypeFromReturnInst(Function* F); + +// Check if the dummy function call is a __visc__node call +#define IS_VISC_CALL(callName) \ + static bool isVISCCall_##callName(Instruction* I) { \ + if(!isa<CallInst>(I)) \ + return false; \ + CallInst* CI = cast<CallInst>(I); \ + return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \ + } + +static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) { + // Check if the instruction is Call Instruction + assert(isa<CallInst>(I) && "Expecting CallInst"); + CallInst* CI = cast<CallInst>(I); + DEBUG(errs() << "Found call: " << *CI << "\n"); + + // Find the correct intrinsic call + Module* M = CI->getParent()->getParent()->getParent(); + Function* F; + std::vector<Type*> ArgTypes; + std::vector<Value*> args; + if(Intrinsic::isOverloaded(IntrinsicID)) { + // This is an overloaded intrinsic. The types must exactly match. 
Get the + // argument types + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + ArgTypes.push_back(CI->getArgOperand(i)->getType()); + args.push_back(CI->getArgOperand(i)); + } + F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes); + DEBUG(errs() << *F << "\n"); + } + else { // Non-overloaded intrinsic + F = Intrinsic::getDeclaration(M, IntrinsicID); + FunctionType* FTy = F->getFunctionType(); + DEBUG(errs() << *F << "\n"); + + // Create argument list + assert(CI->getNumArgOperands() == FTy->getNumParams() + && "Number of arguments of call do not match with Intrinsic"); + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + Value* V = CI->getArgOperand(i); + // Either the type should match or both should be of pointer type + assert((V->getType() == FTy->getParamType(i) || + (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy())) + && "Dummy function call argument does not match with Intrinsic argument!"); + // If the types do not match, then both must be pointer type and pointer + // cast needs to be performed + if(V->getType() != FTy->getParamType(i)) { + V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); + } + args.push_back(V); + } + } + // Insert call instruction + CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? 
"" : CI->getName(), CI); + + DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); + + CI->replaceAllUsesWith(Inst); + // If the previous instruction needs to be erased, insert it in the vector + // Erased + if(Erase != NULL) + Erase->push_back(CI); +} + +IS_VISC_CALL(launch) /* Exists but not required */ +IS_VISC_CALL(edge) /* Exists but not required */ +IS_VISC_CALL(createNodeND) +//IS_VISC_CALL(createNode) +//IS_VISC_CALL(createNode1D) +//IS_VISC_CALL(createNode2D) +//IS_VISC_CALL(createNode3D) +IS_VISC_CALL(bindIn) +IS_VISC_CALL(bindOut) +IS_VISC_CALL(push) +IS_VISC_CALL(pop) +IS_VISC_CALL(getNode) +IS_VISC_CALL(getParentNode) +IS_VISC_CALL(barrier) +IS_VISC_CALL(malloc) +IS_VISC_CALL(return) +IS_VISC_CALL(getNodeInstanceID_x) +IS_VISC_CALL(getNodeInstanceID_y) +IS_VISC_CALL(getNodeInstanceID_z) +IS_VISC_CALL(getNumNodeInstances_x) +IS_VISC_CALL(getNumNodeInstances_y) +IS_VISC_CALL(getNumNodeInstances_z) +// Atomics +IS_VISC_CALL(atomic_cmpxchg) +IS_VISC_CALL(atomic_add) +IS_VISC_CALL(atomic_sub) +IS_VISC_CALL(atomic_xchg) +IS_VISC_CALL(atomic_inc) +IS_VISC_CALL(atomic_dec) +IS_VISC_CALL(atomic_min) +IS_VISC_CALL(atomic_max) +IS_VISC_CALL(atomic_umin) +IS_VISC_CALL(atomic_umax) +IS_VISC_CALL(atomic_and) +IS_VISC_CALL(atomic_or) +IS_VISC_CALL(atomic_xor) +// Misc Fn +IS_VISC_CALL(floor) +IS_VISC_CALL(rsqrt) +IS_VISC_CALL(sqrt) +IS_VISC_CALL(sin) +IS_VISC_CALL(cos) + + +IS_VISC_CALL(init) +IS_VISC_CALL(cleanup) +IS_VISC_CALL(wait) +IS_VISC_CALL(trackMemory) +IS_VISC_CALL(untrackMemory) +IS_VISC_CALL(requestMemory) +IS_VISC_CALL(attributes) +IS_VISC_CALL(hint) + +// Return the constant integer represented by value V +static unsigned getNumericValue(Value* V) { + assert(isa<ConstantInt>(V) + && "Value indicating the number of arguments should be a constant integer"); + return cast<ConstantInt>(V)->getZExtValue(); +} + +// Take the __visc__return instruction and generate code for combining the +// values being returned into a struct and returning it. 
+// The first operand is the number of returned values +static Value* genCodeForReturn(CallInst* CI) { + LLVMContext& Ctx = CI->getContext(); + assert(isVISCCall_return(CI) + && "__visc__return instruction expected!"); + + // Parse the dummy function call here + assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n"); + unsigned numRetVals = getNumericValue(CI->getArgOperand(0)); + + assert(CI->getNumArgOperands()-1 == numRetVals && + "Too few arguments for __visc_return call!\n"); + DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n"); + + std::vector<Type*> ArgTypes; + for(unsigned i=1; i < CI->getNumArgOperands(); i++) { + ArgTypes.push_back(CI->getArgOperand(i)->getType()); + } + Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName(); + StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true); + + InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy), + CI->getArgOperand(1), + 0, + "returnStruct", + CI); + DEBUG(errs() << "Code generation for return:\n"); + DEBUG(errs() << *IV << "\n"); + + for(unsigned i=2; i < CI->getNumArgOperands(); i++) { + IV = InsertValueInst::Create(IV, + CI->getArgOperand(i), + i-1, + IV->getName(), + CI); + DEBUG(errs() << *IV << "\n"); + } + + return IV; +} + +// Analyse the attribute call for this function. Add the in and out +// attributes to pointer parameters. 
+// Operand layout of the __visc__attributes call CI, as parsed below:
+//   (numIn, in_0 .. in_{numIn-1}, numOut, out_0 .. out_{numOut-1})
+// Every in/out operand must be an Argument of F; the matching parameter of F
+// receives the In or Out attribute respectively.
+static void handleVISCAttributes(Function* F, CallInst* CI) {
+  DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n");
+  // Parse the dummy function call here
+  unsigned offset = 0;
+  // Find number of In pointers
+  assert(CI->getNumArgOperands() > offset
+      && "Too few arguments for __visc__attributes call!");
+  unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset));
+  DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n");
+  // Check the operand count before reading the in-pointer operands
+  assert(CI->getNumArgOperands() >= offset+1+numInPtrs
+      && "Too few arguments for __visc__attributes call!");
+
+  for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) {
+    Value* V = CI->getArgOperand(i);
+    if(Argument* arg = dyn_cast<Argument>(V)) {
+      F->addAttribute(1+arg->getArgNo(), Attribute::In);
+    }
+    else {
+      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
+      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
+    }
+  }
+  // Find number of Out Pointers
+  offset += 1 + numInPtrs;
+  assert(CI->getNumArgOperands() > offset
+      && "Too few arguments for __visc__attributes call!");
+  unsigned numOutPtrs = getNumericValue(CI->getArgOperand(offset));
+  DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n");
+  // Check the operand count before reading the out-pointer operands
+  assert(CI->getNumArgOperands() >= offset+1+numOutPtrs
+      && "Too few arguments for __visc__attributes call!");
+  for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) {
+    Value* V = CI->getArgOperand(i);
+    if(Argument* arg = dyn_cast<Argument>(V)) {
+      F->addAttribute(1+arg->getArgNo(), Attribute::Out);
+    }
+    else {
+      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
+      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
+    }
+  }
+  DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n");
+}
+
+// Public Functions of GenVISC pass
+//
+// Loads the visc-rt bitcode module (located relative to $LLVM_SRC_ROOT) to
+// obtain the timer API signatures, inserts timer calls at the single
+// __visc__init / __visc__cleanup call sites, then walks every function and
+// replaces each __visc__* dummy call with the corresponding intrinsic.
+// Functions that contain bindOut or __visc__return calls are cloned with a
+// struct return type. Returns true: the module is always modified.
+bool GenVISC::runOnModule(Module &M) {
+  errs() << "\nGENVISC PASS\n";
+  this->M = &M;
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  // A missing environment variable is a user error, not a broken invariant:
+  // report it in release builds too instead of dereferencing NULL.
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  if(LLVM_SRC_ROOT == NULL)
+    report_fatal_error("Define LLVM_SRC_ROOT environment variable!");
+
+  // Owning strings rather than Twine locals: Twine must not be stored.
+  std::string llvmSrcRoot = LLVM_SRC_ROOT;
+  std::string runtimeAPI = llvmSrcRoot + "/../build/tools/hpvm/projects/visc-rt/visc-rt.bc";
+  errs() << llvmSrcRoot << "\n";
+
+  std::unique_ptr<Module> runtimeModule = parseIRFile(runtimeAPI, Err, M.getContext());
+
+  if(runtimeModule == NULL) {
+    DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
+    report_fatal_error("couldn't parse runtime");
+  }
+  DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+  // Look up the type of a runtime API function, failing loudly if the
+  // runtime module does not provide it.
+  auto runtimeFnTy = [&](StringRef FnName) -> FunctionType* {
+    Function* RtF = runtimeModule->getFunction(FnName);
+    if(RtF == NULL)
+      report_fatal_error("visc-rt API module does not define " + FnName);
+    return RtF->getFunctionType();
+  };
+
+  llvm_visc_initializeTimerSet = M.getOrInsertFunction("llvm_visc_initializeTimerSet",
+      runtimeFnTy("llvm_visc_initializeTimerSet"));
+  //DEBUG(errs() << *llvm_visc_initializeTimerSet);
+
+  llvm_visc_switchToTimer = M.getOrInsertFunction("llvm_visc_switchToTimer",
+      runtimeFnTy("llvm_visc_switchToTimer"));
+  // DEBUG(errs() << *llvm_visc_switchToTimer);
+
+  llvm_visc_printTimerSet = M.getOrInsertFunction("llvm_visc_printTimerSet",
+      runtimeFnTy("llvm_visc_printTimerSet"));
+  //DEBUG(errs() << *llvm_visc_printTimerSet);
+
+  // Insert init context in main
+  DEBUG(errs() << "Locate __visc__init()\n");
+  Function* VI = M.getFunction("__visc__init");
+  if(VI == NULL || VI->getNumUses() != 1)
+    report_fatal_error("__visc__init should only be used once");
+  Instruction* I = cast<Instruction>(*VI->user_begin());
+
+  DEBUG(errs() << "Initialize Timer Set\n");
+  initializeTimerSet(I);
+  switchToTimer(visc_TimerID_NONE, I);
+
+  // Insert print instruction at visc exit
+  DEBUG(errs() << "Locate __visc__cleanup()\n");
+  Function* VC = M.getFunction("__visc__cleanup");
+  if(VC == NULL || VC->getNumUses() != 1)
+    report_fatal_error("__visc__cleanup should only be used once");
+  I = cast<Instruction>(*VC->user_begin());
+  printTimerSet(I);
+
+
+  DEBUG(errs() << "-------- Searching for launch sites ----------\n");
+
+  std::vector<Instruction*> toBeErased;
+  std::vector<Function*> functions;
+
+  // Snapshot the function list first: functions are cloned and replaced
+  // while the loop below runs.
+  // NOTE(review): transformReturnTypeToStruct / replaceNodeFunctionInIR may
+  // remove a function that is still in this snapshot - confirm against the
+  // definition of replaceNodeFunctionInIR.
+  for (auto &F : M)
+    functions.push_back(&F);
+
+  // Iterate over all functions in the module
+  for (Function *f : functions) {
+    DEBUG(errs() << "Function: " << f->getName() << "\n");
+
+    // List with the required additions in the function's return type
+    std::vector<Type*> FRetTypes;
+
+    enum mutateTypeCause {
+      mtc_None,
+      mtc_BIND,
+      mtc_RETURN,
+      mtc_NUM_CAUSES
+    } bind;
+    bind = mutateTypeCause::mtc_None;
+
+    // Iterate over all the instructions in this function
+    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) {
+      Instruction* I = &*i; // Grab pointer to Instruction
+      // If not a call instruction, move to next instruction
+      if(!isa<CallInst>(I))
+        continue;
+
+      CallInst* CI = cast<CallInst>(I);
+      LLVMContext& Ctx = CI->getContext();
+
+      if(isVISCCall_init(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased);
+      }
+      if(isVISCCall_cleanup(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased);
+      }
+      if(isVISCCall_wait(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased);
+      }
+      if(isVISCCall_trackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased);
+      }
+      if(isVISCCall_untrackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased);
+      }
+      if(isVISCCall_requestMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased);
+      }
+      if(isVISCCall_hint(I)) {
+        assert(isa<ConstantInt>(CI->getArgOperand(0))
+            && "Argument to hint must be constant integer!");
+        ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0));
+
+        visc::Target t = (visc::Target) hint->getZExtValue();
+        addHint(CI->getParent()->getParent(), t);
+        DEBUG(errs() << "Found visc hint call: " << *CI << "\n");
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_launch(I)) {
+        Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
+        DEBUG(errs() << *LaunchF << "\n");
+        // Get i8* cast to function pointer
+        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
+        graphFunc = transformReturnTypeToStruct(graphFunc);
+        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+        assert(F && "Function invoked by VISC launch has to be define and constant.");
+
+        // dyn_cast so that the assert below can actually fire
+        ConstantInt* Op = dyn_cast<ConstantInt>(CI->getArgOperand(0));
+        assert(Op && "VISC launch's streaming argument is a constant value.");
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+
+        auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType());
+        assert(ArgTy && "VISC launch argument should be pointer type.");
+        Value *Arg = CI->getArgOperand(2);
+        if(!ArgTy->getElementType()->isIntegerTy(8))
+          Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), Type::getInt8PtrTy(Ctx), "", CI);
+        Value* LaunchArgs[] = {F, Arg, isStreaming};
+        CallInst* LaunchInst = CallInst::Create(LaunchF,
+                                                ArrayRef<Value*>(LaunchArgs, 3),
+                                                "graphID", CI);
+        DEBUG(errs() << "Found visc launch call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n");
+        CI->replaceAllUsesWith(LaunchInst);
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_push(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased);
+      }
+      if(isVISCCall_pop(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased);
+      }
+      if(isVISCCall_createNodeND(I)) {
+        assert(CI->getNumArgOperands() > 0 &&
+            "Too few arguments for __visc__createNodeND call");
+        unsigned numDims = getNumericValue(CI->getArgOperand(0));
+        // We need as many dimension arguments as there are dimensions
+        assert(CI->getNumArgOperands()-2 == numDims &&
+            "Too few arguments for __visc_createNodeND call!\n");
+
+        Function* CreateNodeF;
+        switch (numDims) {
+        case 0:
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
+          break;
+        case 1:
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D);
+          break;
+        case 2:
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D);
+          break;
+        case 3:
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D);
+          break;
+        default:
+          llvm_unreachable("Unsupported number of dimensions\n");
+          break;
+        }
+        DEBUG(errs() << *CreateNodeF << "\n");
+        DEBUG(errs() << *I << "\n");
+        DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n");
+
+        // Get i8* cast to function pointer
+        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
+        graphFunc = transformReturnTypeToStruct(graphFunc);
+        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+
+        CallInst* CreateNodeInst;
+        switch (numDims) {
+        case 0:
+          CreateNodeInst = CallInst::Create(CreateNodeF,
+                                            ArrayRef<Value*>(F),
+                                            graphFunc->getName()+".node", CI);
+          break;
+        case 1:
+          {
+            assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                "CreateNodeND dimension argument, 2, expected to be i64\n");
+            Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)};
+            CreateNodeInst = CallInst::Create(CreateNodeF,
+                                              ArrayRef<Value*>(CreateNodeArgs, 2),
+                                              graphFunc->getName()+".node", CI);
+          }
+          break;
+        case 2:
+          {
+            assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                "CreateNodeND dimension argument, 2, expected to be i64\n");
+            assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
+                "CreateNodeND dimension argument, 3, expected to be i64\n");
+            Value* CreateNodeArgs[] = {F,
+                                       CI->getArgOperand(2),
+                                       CI->getArgOperand(3)};
+            CreateNodeInst = CallInst::Create(CreateNodeF,
+                                              ArrayRef<Value*>(CreateNodeArgs, 3),
+                                              graphFunc->getName()+".node", CI);
+          }
+          break;
+        case 3:
+          {
+            assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                "CreateNodeND dimension argument, 2, expected to be i64\n");
+            assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
+                "CreateNodeND dimension argument, 3, expected to be i64\n");
+            assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) &&
+                "CreateNodeND dimension argument, 4, expected to be i64\n");
+            Value* CreateNodeArgs[] = {F,
+                                       CI->getArgOperand(2),
+                                       CI->getArgOperand(3),
+                                       CI->getArgOperand(4)};
+            CreateNodeInst = CallInst::Create(CreateNodeF,
+                                              ArrayRef<Value*>(CreateNodeArgs, 4),
+                                              graphFunc->getName()+".node", CI);
+          }
+          break;
+        default:
+          llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n");
+          break;
+        }
+
+        DEBUG(errs() << "Found visc createNode call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n");
+        CI->replaceAllUsesWith(CreateNodeInst);
+        toBeErased.push_back(CI);
+      }
+
+      if(isVISCCall_edge(I)) {
+        Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
+        DEBUG(errs() << *EdgeF << "\n");
+        // dyn_cast so that the assert below can actually fire
+        ConstantInt* Op = dyn_cast<ConstantInt>(CI->getArgOperand(5));
+        ConstantInt* EdgeTypeOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+        assert(Op && EdgeTypeOp && "Arguments of CreateEdge are not constant integers.");
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx)
+                                                 : ConstantInt::getTrue(Ctx);
+        Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                             isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4),
+                             isStreaming
+                            };
+        CallInst* EdgeInst = CallInst::Create(EdgeF,
+                                              ArrayRef<Value*>(EdgeArgs, 6),
+                                              "output", CI);
+        DEBUG(errs() << "Found visc edge call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n");
+        CI->replaceAllUsesWith(EdgeInst);
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_bindIn(I)) {
+        Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
+        DEBUG(errs() << *BindInF << "\n");
+        // Check if this is a streaming bind or not
+        ConstantInt* Op = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+        assert(Op && "Streaming argument for bind in intrinsic should be a constant integer.");
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                               CI->getArgOperand(2), isStreaming
+                              };
+        CallInst* BindInInst = CallInst::Create(BindInF,
+                                                ArrayRef<Value*>(BindInArgs, 4),
+                                                "", CI);
+        DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n");
+        CI->replaceAllUsesWith(BindInInst);
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_bindOut(I)) {
+        Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
+        DEBUG(errs() << *BindOutF << "\n");
+        // Check if this is a streaming bind or not
+        ConstantInt* Op = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+        assert(Op && "Streaming argument for bind out intrinsic should be a constant integer.");
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                                CI->getArgOperand(2), isStreaming
+                               };
+        CallInst* BindOutInst = CallInst::Create(BindOutF,
+                                                 ArrayRef<Value*>(BindOutArgs, 4),
+                                                 "", CI);
+        DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n");
+
+        DEBUG(errs() << "Fixing the return type of the function\n");
+        // FIXME: What if the child node function has not been visited already.
+        // i.e., its return type has not been fixed.
+        Function* F = I->getParent()->getParent();
+        DEBUG(errs() << F->getName() << "\n";);
+        IntrinsicInst* NodeIntrinsic = dyn_cast<IntrinsicInst>(CI->getArgOperand(0));
+        assert(NodeIntrinsic && "Instruction value in bind out is not a create node intrinsic.");
+        DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n");
+        assert((NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode ||
+                NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D ||
+                NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D ||
+                NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) &&
+               "Instruction value in bind out is not a create node intrinsic.");
+        Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
+        DEBUG(errs() << ChildF->getName() << "\n";);
+        int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue();
+        int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue();
+        StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType());
+
+        Type* ReturnType = F->getReturnType();
+        DEBUG(errs() << *ReturnType << "\n";);
+        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType))
+            && "Return type should either be a struct or void type!");
+
+        // Inserting past end() is undefined; bindOut calls are expected to
+        // arrive in increasing destination order.
+        assert(destpos >= 0 && (size_t)destpos <= FRetTypes.size()
+            && "bindOut destination position out of range");
+        FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos));
+        assert(((bind == mutateTypeCause::mtc_BIND) ||
+                (bind == mutateTypeCause::mtc_None)) &&
+               "Both bind_out and visc_return detected");
+        bind = mutateTypeCause::mtc_BIND;
+
+        CI->replaceAllUsesWith(BindOutInst);
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_attributes(I)) {
+        Function* F = CI->getParent()->getParent();
+        handleVISCAttributes(F, CI);
+        toBeErased.push_back(CI);
+      }
+      if (isVISCCall_getNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNode, &toBeErased);
+      }
+      if (isVISCCall_getParentNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getParentNode, &toBeErased);
+      }
+      if (isVISCCall_barrier(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_barrier, &toBeErased);
+      }
+      if (isVISCCall_malloc(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased);
+      }
+      if (isVISCCall_return(I)) {
+        DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n");
+        // The operands to this call are the values to be returned by the node
+        Value* ReturnVal = genCodeForReturn(CI);
+        DEBUG(errs() << *ReturnVal << "\n");
+        Type* ReturnType = ReturnVal->getType();
+        assert(isa<StructType>(ReturnType)
+            && "Return type should be a struct type!");
+
+        assert(((bind == mutateTypeCause::mtc_RETURN) ||
+                (bind == mutateTypeCause::mtc_None)) &&
+               "Both bind_out and visc_return detected");
+
+        if (bind == mutateTypeCause::mtc_None) {
+          // If this is None, this is the first __visc__return
+          // instruction we have come upon. Place the return type of the
+          // function in the return type vector
+          bind = mutateTypeCause::mtc_RETURN;
+          StructType* ReturnStructTy = cast<StructType>(ReturnType);
+          for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++)
+            FRetTypes.push_back(ReturnStructTy->getElementType(i));
+        } else { // bind == mutateTypeCause::mtc_RETURN
+          // This is not the first __visc__return
+          // instruction we have come upon.
+          // Check that the return types are the same. FRetTypes holds the
+          // element types of the first return struct, so compare element by
+          // element; comparing the struct itself against FRetTypes[0] could
+          // never succeed.
+          // NOTE(review): genCodeForReturn creates a separate named struct
+          // type per call, so several returns in one function still yield
+          // distinct struct types - confirm how that is meant to be unified.
+          assert(cast<StructType>(ReturnType)->elements().equals(FRetTypes)
+              && "Multiple returns with mismatching types");
+        }
+
+        ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal);
+        DEBUG(errs() << "Found visc return call: " << *CI << "\n");
+        Instruction* oldReturn = CI->getParent()->getTerminator();
+        assert(isa<ReturnInst>(oldReturn)
+            && "Expecting a return to be the terminator of this BB!");
+        DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n");
+        DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n");
+        //CI->replaceAllUsesWith(RetInst);
+        toBeErased.push_back(CI);
+        ReplaceInstWithInst(oldReturn, RetInst);
+        DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n");
+      }
+
+      if (isVISCCall_getNodeInstanceID_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased);
+      }
+      if (isVISCCall_getNodeInstanceID_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased);
+      }
+      if (isVISCCall_getNodeInstanceID_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased);
+      }
+      if (isVISCCall_getNumNodeInstances_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased);
+      }
+      if (isVISCCall_getNumNodeInstances_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased);
+      }
+      if (isVISCCall_getNumNodeInstances_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased);
+      }
+      if (isVISCCall_atomic_add(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased);
+      }
+      if (isVISCCall_atomic_sub(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_sub, &toBeErased);
+      }
+      if (isVISCCall_atomic_xchg(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xchg, &toBeErased);
+      }
+      if (isVISCCall_atomic_min(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_min, &toBeErased);
+      }
+      if (isVISCCall_atomic_max(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_max, &toBeErased);
+      }
+      if (isVISCCall_atomic_and(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_and, &toBeErased);
+      }
+      if (isVISCCall_atomic_or(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_or, &toBeErased);
+      }
+      if (isVISCCall_atomic_xor(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xor, &toBeErased);
+      }
+      if (isVISCCall_sin(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased);
+      }
+      if (isVISCCall_cos(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased);
+      }
+    }
+
+    // Erase the __visc__node calls
+    DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n");
+    for(auto I: toBeErased) {
+      DEBUG(errs() << *I << "\n");
+    }
+    // Erase in reverse order of discovery
+    while(!toBeErased.empty()) {
+      Instruction* I = toBeErased.back();
+      DEBUG(errs() << "\tErasing " << *I << "\n");
+      I->eraseFromParent();
+      toBeErased.pop_back();
+    }
+
+    if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) {
+      DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
+      // Argument type list.
+      std::vector<Type*> FArgTypes;
+      for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
+          ai != ae; ++ai) {
+        FArgTypes.push_back(ai->getType());
+      }
+
+      // Find new return type of function
+      Type* NewReturnTy;
+      if(bind == mutateTypeCause::mtc_BIND) {
+
+        std::vector<Type*> TyList;
+        for (unsigned i = 0; i < FRetTypes.size(); i++)
+          TyList.push_back(FRetTypes[i]);
+
+        NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true);
+      }
+      else {
+        NewReturnTy = getReturnTypeFromReturnInst(f);
+        assert(NewReturnTy->isStructTy() && "Expecting a struct type!");
+      }
+
+      FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
+
+      // Change the function type
+      Function* newF = cloneFunction(f, FTy, false);
+      DEBUG(errs() << *newF << "\n");
+
+      if (bind == mutateTypeCause::mtc_BIND) {
+        // This is certainly an internal node, and hence just one BB with one
+        // return terminator instruction. Change return statement
+        ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator());
+        ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy));
+        ReplaceInstWithInst(RI, newRI);
+      }
+      if (bind == mutateTypeCause::mtc_RETURN) {
+        // Nothing
+      }
+      replaceNodeFunctionInIR(*f->getParent(), f, newF);
+      DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
+    }
+
+
+  }
+  // A pass returns true when it modified the IR. This pass always does: at
+  // minimum it inserts the timer API declarations into the module.
+  return true;
+}
+
+// Generate Code for declaring a constant string [L x i8] and return a pointer
+// to the start of it.
+// S is the string content, Name the name given to the new internal global;
+// the GEP producing the pointer is inserted before IB and named Name+"Ptr".
+Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
+  Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true);
+  Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true,
+                                      GlobalValue::InternalLinkage, SConstant, Name);
+  Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0);
+  Value* GEPArgs[] = {Zero, Zero};
+  GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal,
+                            ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
+  return SPtr;
+}
+
+// Create the global that holds the timer-set handle, call the runtime to
+// initialise a timer set before InsertBefore, and store the handle.
+// NOTE(review): TIMER is defined outside this chunk; if it can skip its
+// argument, the DEBUG lines below dereference values that were never
+// assigned - confirm. The locals are null-initialised so that case is at
+// least deterministic.
+void GenVISC::initializeTimerSet(Instruction* InsertBefore) {
+  Value* TimerSetAddr = nullptr;
+  StoreInst* SI = nullptr;
+  TIMER(TimerSet = new GlobalVariable(*M,
+                                      Type::getInt8PtrTy(M->getContext()),
+                                      false,
+                                      GlobalValue::CommonLinkage,
+                                      Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
+                                      "viscTimerSet_GenVISC"));
+  DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet << "\n");
+  //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n");
+
+  TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
+                                        None,
+                                        "",
+                                        InsertBefore));
+  DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n");
+  TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore));
+  DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n");
+}
+
+// Insert, before InsertBefore, a runtime call switching to the given timer.
+void GenVISC::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) {
+  Value* switchArgs[] = {TimerSet, getTimerID(*M, timer)};
+  TIMER(CallInst::Create(llvm_visc_switchToTimer,
+                         ArrayRef<Value*>(switchArgs, 2),
+                         "",
+                         InsertBefore));
+}
+
+// Insert, before InsertBefore, a runtime call printing the timer set under
+// the name "GenVISC_Timer".
+void GenVISC::printTimerSet(Instruction* InsertBefore) {
+  // Null-initialised: the assignment below happens inside TIMER(...)
+  Value* TimerName = nullptr;
+  TIMER(TimerName = getStringPointer("GenVISC_Timer", InsertBefore));
+  Value* printArgs[] = {TimerSet, TimerName};
+  TIMER(CallInst::Create(llvm_visc_printTimerSet,
+                         ArrayRef<Value*>(printArgs, 2),
+                         "",
+                         InsertBefore));
+}
+
+// Wrap a timer id as an i32 constant in M's context.
+static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) {
+  return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
+}
+
+// Give node function F a struct return type. A function that already returns
+// a struct is returned unchanged; a void function is cloned with an empty
+// struct return type, its returns are rewritten to return undef of that
+// type, and uses of F in the module are redirected to the clone.
+static Function* transformReturnTypeToStruct(Function* F) {
+  // Currently only works for void return types
+  DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n");
+
+  if (isa<StructType>(F->getReturnType())) {
+    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n");
+    return F;
+  }
+
+  assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n");
+
+  // Create the argument type list with added argument types
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+
+  StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true);
+  FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg());
+
+  SmallVector<ReturnInst*, 8> Returns;
+  Function* newF = cloneFunction(F, FTy, false, &Returns);
+  // Replace ret void instruction with ret %RetTy undef
+  for(auto &RI: Returns) {
+    DEBUG(errs() << "Found return inst: "<< *RI << "\n");
+    ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy));
+    ReplaceInstWithInst(RI, newRI);
+  }
+
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+  return newF;
+}
+
+// Return the type of the value returned by the first value-carrying return
+// instruction found in F. "ret void" terminators are skipped because they
+// have no return value to inspect. The caller only uses this for functions
+// in which a __visc__return was rewritten into such a return, so finding
+// none is a broken invariant rather than a recoverable condition.
+static Type* getReturnTypeFromReturnInst(Function* F) {
+  for(BasicBlock &BB: *F) {
+    if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+      if(Value* RetVal = RI->getReturnValue()) {
+        DEBUG(errs() << "Return type value: " << *RetVal->getType() << "\n");
+        return RetVal->getType();
+      }
+    }
+  }
+  // Do not fall off the end of a non-void function
+  llvm_unreachable("No value-returning return instruction found in function");
+}
+
+
+char genvisc::GenVISC::ID = 0;
+static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false);
+
+} // End of namespace genvisc
+
+
diff --git a/hpvm/lib/Transforms/GenHPVM/GenHPVM.exports b/hpvm/lib/Transforms/GenVISC/GenVISC.exports
similarity index 100%
rename
from hpvm/lib/Transforms/GenHPVM/GenHPVM.exports rename to hpvm/lib/Transforms/GenVISC/GenVISC.exports diff --git a/hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt b/hpvm/lib/Transforms/GenVISC/LLVMBuild.txt similarity index 88% rename from hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt rename to hpvm/lib/Transforms/GenVISC/LLVMBuild.txt index 94ef73ac07..9266b2c597 100644 --- a/hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt +++ b/hpvm/lib/Transforms/GenVISC/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Transforms/GenHPVM/LLVMBuild.txt -------------------*- Conf -*--===; +;===- ./lib/Transforms/GenVISC/LLVMBuild.txt -------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,5 +17,5 @@ [component_0] type = Library -name = GenHPVM +name = GenVISC parent = Transforms diff --git a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp index fc33ebee71..7bd66b62c6 100644 --- a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp +++ b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "LocalMem" -#include "SupportHPVM/DFG2LLVM.h" +#include "SupportVISC/DFG2LLVM.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" @@ -134,7 +134,7 @@ void AT_OCL::codeGen(DFLeafNode *N) { // Return pointer to property if this leaf node matches the conditions for being // an allocation node. Conditions // 1. No incoming memory pointer. No in/out attribute on a pointer argument -// 2. Uses hpvm malloc intrinsic to allocate memory +// 2. Uses visc malloc intrinsic to allocate memory // 3. Sends it out // 2. 
(TODO:) Whether the allocated pointer escapes the parent node AllocationNodeProperty *isAllocationNode(DFLeafNode *N) { @@ -148,18 +148,18 @@ AllocationNodeProperty *isAllocationNode(DFLeafNode *N) { Function *F = N->getFuncPointer(); - // Allocation node must use hpvm malloc intrinsic - bool usesHPVMMalloc = false; + // Allocation node must use visc malloc intrinsic + bool usesVISCMalloc = false; for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) { Instruction *I = &*i; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - if (II->getIntrinsicID() == Intrinsic::hpvm_malloc) { - usesHPVMMalloc = true; + if (II->getIntrinsicID() == Intrinsic::visc_malloc) { + usesVISCMalloc = true; break; } } } - if (!usesHPVMMalloc) + if (!usesVISCMalloc) return NULL; // TODO: Check if allocated pointer leaves parent node @@ -197,20 +197,20 @@ AllocationNodeProperty *isAllocationNode(DFLeafNode *N) { assert(OutValues[i]->getType()->isPointerTy() && "Expected outgoing edge to be of pointer type"); if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(OutValues[i])) { - if (II->getIntrinsicID() == Intrinsic::hpvm_malloc) { + if (II->getIntrinsicID() == Intrinsic::visc_malloc) { // Sanity check: Size passed to malloc intrinsic is same as the value // going into the next outgoing edge - DEBUG(errs() << "HPVM malloc size: " << *II->getArgOperand(0) << "\n"); + DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n"); DEBUG(errs() << "Out edge value: " << *OutValues[i + 1] << "\n"); assert(II->getArgOperand(0) == OutValues[i + 1] && - "Sanity Check Failed: HPVM Malloc size argument != next " + "Sanity Check Failed: VISC Malloc size argument != next " "outgoing edge"); ANP->insertAllocation(N->getOutDFEdgeAt(i), II->getArgOperand(0)); i = i + 2; continue; } } - llvm_unreachable("Expecting hpvm malloc intrinsic instruction!"); + llvm_unreachable("Expecting visc malloc intrinsic instruction!"); } return ANP; } diff --git a/hpvm/llvm_installer/llvm_installer.sh 
b/hpvm/llvm_installer/llvm_installer.sh index e072d042b7..d7fcda4ac4 100755 --- a/hpvm/llvm_installer/llvm_installer.sh +++ b/hpvm/llvm_installer/llvm_installer.sh @@ -179,10 +179,10 @@ echo make -j$NUM_THREADS make -j$NUM_THREADS #make install -# echo Building HPVM runtime -# HPVM_RT_DIR=$HPVM_DIR/projects/hpvm-rt -# cd $HPVM_RT_DIR -# make +#echo Building HPVM runtime +#HPVM_RT_DIR=$HPVM_DIR/projects/visc-rt +#cd $HPVM_RT_DIR +#make #cp -r $CURRENT_DIR/projects $HPVM_DIR/ #make -j$NUM_THREADS diff --git a/hpvm/llvm_patches/apply_patch.sh b/hpvm/llvm_patches/apply_patch.sh index 289e5c11e3..ea86575207 100644 --- a/hpvm/llvm_patches/apply_patch.sh +++ b/hpvm/llvm_patches/apply_patch.sh @@ -1,7 +1,7 @@ #!/bin/sh ### File Copies -cp include/IR/IntrinsicsHPVM.td ${LLVM_SRC_ROOT}/include/llvm/IR/IntrinsicsHPVM.td +cp include/IR/IntrinsicsVISC.td ${LLVM_SRC_ROOT}/include/llvm/IR/IntrinsicsVISC.td ## Header File Patches diff --git a/hpvm/llvm_patches/include/IR/Attributes.td b/hpvm/llvm_patches/include/IR/Attributes.td index c6ff8ef3c6..b644cdb30b 100644 --- a/hpvm/llvm_patches/include/IR/Attributes.td +++ b/hpvm/llvm_patches/include/IR/Attributes.td @@ -151,7 +151,7 @@ def ShadowCallStack : EnumAttr<"shadowcallstack">; /// Sign extended before/after call. 
def SExt : EnumAttr<"signext">; -/// HPVM Attributes +/// VISC Attributes /// Pointer to read only memory def In : EnumAttr<"in">; diff --git a/hpvm/llvm_patches/include/IR/Intrinsics.td b/hpvm/llvm_patches/include/IR/Intrinsics.td index 2e3f34eb1a..2f79964a2e 100644 --- a/hpvm/llvm_patches/include/IR/Intrinsics.td +++ b/hpvm/llvm_patches/include/IR/Intrinsics.td @@ -1249,4 +1249,4 @@ include "llvm/IR/IntrinsicsBPF.td" include "llvm/IR/IntrinsicsSystemZ.td" include "llvm/IR/IntrinsicsWebAssembly.td" include "llvm/IR/IntrinsicsRISCV.td" -include "llvm/IR/IntrinsicsHPVM.td" +include "llvm/IR/IntrinsicsVISC.td" diff --git a/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td b/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td deleted file mode 100644 index 410e9c8d33..0000000000 --- a/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td +++ /dev/null @@ -1,208 +0,0 @@ -//===- IntrinsicsHPVM.td - Defines HPVM intrinsics ---------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines all of the HPVM-specific intrinsics. -// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "hpvm" in { - /* All intrinsics start with "llvm.hpvm." - * As we do not want the compiler to mess with these intrinsics, we assume - * worst memory behavior for all these intrinsics. 
- */ - - /* Initialization intrinsic - - * i8* llvm.hpvm.setup(function*); - */ - def int_hpvm_init : Intrinsic<[], [], []>; - - /* Launch intrinsic - with streaming argument - * i8* llvm.hpvm.launch(i8*, ArgList*, i1); - */ - def int_hpvm_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, - llvm_ptr_ty, llvm_i1_ty], []>; - - /* Push intrinsic - push data on streaming pipeline - * void llvm.hpvm.push(i8*, ArgList*); - */ - def int_hpvm_push : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>; - - /* Pop intrinsic - pop data from streaming pipeline - * i8* llvm.hpvm.pop(i8*); - */ - def int_hpvm_pop : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>; - - /* Cleanup intrinsic - - * void llvm.hpvm.cleanup(i8*); - */ - def int_hpvm_cleanup : Intrinsic<[], [], []>; - - /* Wait intrinsic - - * void llvm.hpvm.wait(graphID*); - */ - def int_hpvm_wait : Intrinsic<[], [llvm_ptr_ty], []>; - - /* Track memory intrinsic - - * void llvm.hpvm.trackMemory(i8*, i64); - */ - def int_hpvm_trackMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; - - /* Track memory intrinsic - - * void llvm.hpvm.untrackMemory(i8*); - */ - def int_hpvm_untrackMemory : Intrinsic<[], [llvm_ptr_ty], []>; - - /* Request memory intrinsic - - * void llvm.hpvm.requestMemory(i8*, i64); - */ - def int_hpvm_requestMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; - - /* Create Node intrinsic - - * i8* llvm.hpvm.createNode(function*); - */ - def int_hpvm_createNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>; - - /* Create Node 1D array intrinsic - - * i8* llvm.hpvm.createNode1D(function*, i64); - */ - def int_hpvm_createNode1D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, - llvm_i64_ty], []>; - - /* Create Node 2D array intrinsic - - * i8* llvm.hpvm.createNode2D(function*, i64, i64); - */ - def int_hpvm_createNode2D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, - llvm_i64_ty, llvm_i64_ty], []>; - - /* Create Node 3D array intrinsic - - * i8* llvm.hpvm.createNode2D(function*, i64, i64, i64); - */ - def 
int_hpvm_createNode3D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, - llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], - []>; - - /* Create dataflow edge intrinsic - - * i8* llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1); - */ - def int_hpvm_createEdge : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty, - llvm_i1_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i1_ty], - []>; - - /* Create bind input intrinsic - - * void llvm.hpvm.bind.input(i8*, i32, i32); - */ - def int_hpvm_bind_input : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i1_ty], []>; - - /* Create bind output intrinsic - - * void llvm.hpvm.bind.output(i8*, i32, i32); - */ - def int_hpvm_bind_output : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i1_ty], []>; - - /* Find associated dataflow node intrinsic - - * i8* llvm.hpvm.getNode(); - */ - def int_hpvm_getNode : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; - - /* Find parent dataflow node intrinsic - - * i8* llvm.hpvm.getParentNode(i8*); - */ - def int_hpvm_getParentNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>; - - /* Find the number of dimensions of a dataflow node intrinsic - - * i32 llvm.hpvm.getNumDims(i8*); - */ - def int_hpvm_getNumDims : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; - - /* Find the unique indentifier of a dataflow node (with respect to his parent - * node) in the specified dimension intrinsic - - */ - - /* i64 llvm.hpvm.getNodeInstanceID.[xyz](i8*); - */ - def int_hpvm_getNodeInstanceID_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - def int_hpvm_getNodeInstanceID_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - def int_hpvm_getNodeInstanceID_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - /* Find the number of instances of a dataflow node in the specified dimension - * intrinsic - - */ - - /* i64 llvm.hpvm.getNumNodeInstances.[xyz](i8*); - */ - def int_hpvm_getNumNodeInstances_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - def 
int_hpvm_getNumNodeInstances_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - def int_hpvm_getNumNodeInstances_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], - [IntrNoMem]>; - - /* Local Barrier - * void llvm.hpvm.barrier(); - */ - def int_hpvm_barrier : Intrinsic<[], [], []>; - - /* Memory allocation inside the graph - * i8* llvm.hpvm.malloc(); - */ - def int_hpvm_malloc : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty], []>; - - /* Find the vector length supported by target architecture - * intrinsic - - * i32 llvm.hpvm.getVectorLength(); - */ - def int_hpvm_getVectorLength : Intrinsic<[llvm_i32_ty], [], []>; - - /* ============ Atomic intrinsics ============= */ - // Atomic arithmetic operations - - /* i32 llvm.hpvm.atomic.add(i32*, i32)*/ - def int_hpvm_atomic_add: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.hpvm.atomic.sub(i32*, i32)*/ - def int_hpvm_atomic_sub: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.hpvm.atomic.xchg(i32*, i32)*/ - def int_hpvm_atomic_xchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.hpvm.atomic.min(i32*, i32)*/ - def int_hpvm_atomic_min: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.hpvm.atomic.maxi32*, i32)*/ - def int_hpvm_atomic_max: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - // Atomic bitwise operations - - /* i32 llvm.hpvm.atomic.and(i32*, i32)*/ - def int_hpvm_atomic_and: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.hpvm.atomic.or(i32*, i32)*/ - def int_hpvm_atomic_or: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - - /* i32 llvm.hpvm.atomic.xor(i32*, i32)*/ - def int_hpvm_atomic_xor: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - []>; - -} diff --git a/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td b/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td new file mode 100644 index 0000000000..d5330175d8 --- /dev/null +++ 
b/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td @@ -0,0 +1,208 @@ +//===- IntrinsicsVISC.td - Defines VISC intrinsics ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the VISC-specific intrinsics. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "visc" in { + /* All intrinsics start with "llvm.visc." + * As we do not want the compiler to mess with these intrinsics, we assume + * worst memory behavior for all these intrinsics. + */ + + /* Initialization intrinsic - + * i8* llvm.visc.setup(function*); + */ + def int_visc_init : Intrinsic<[], [], []>; + + /* Launch intrinsic - with streaming argument + * i8* llvm.visc.launch(i8*, ArgList*, i1); + */ + def int_visc_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, + llvm_ptr_ty, llvm_i1_ty], []>; + + /* Push intrinsic - push data on streaming pipeline + * void llvm.visc.push(i8*, ArgList*); + */ + def int_visc_push : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>; + + /* Pop intrinsic - pop data from streaming pipeline + * i8* llvm.visc.pop(i8*); + */ + def int_visc_pop : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>; + + /* Cleanup intrinsic - + * void llvm.visc.cleanup(i8*); + */ + def int_visc_cleanup : Intrinsic<[], [], []>; + + /* Wait intrinsic - + * void llvm.visc.wait(graphID*); + */ + def int_visc_wait : Intrinsic<[], [llvm_ptr_ty], []>; + + /* Track memory intrinsic - + * void llvm.visc.trackMemory(i8*, i64); + */ + def int_visc_trackMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; + + /* Track memory intrinsic - + * void llvm.visc.untrackMemory(i8*); + */ + def int_visc_untrackMemory : Intrinsic<[], [llvm_ptr_ty], []>; + + /* Request memory intrinsic - + * void llvm.visc.requestMemory(i8*, 
i64); + */ + def int_visc_requestMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; + + /* Create Node intrinsic - + * i8* llvm.visc.createNode(function*); + */ + def int_visc_createNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>; + + /* Create Node 1D array intrinsic - + * i8* llvm.visc.createNode1D(function*, i64); + */ + def int_visc_createNode1D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, + llvm_i64_ty], []>; + + /* Create Node 2D array intrinsic - + * i8* llvm.visc.createNode2D(function*, i64, i64); + */ + def int_visc_createNode2D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, + llvm_i64_ty, llvm_i64_ty], []>; + + /* Create Node 3D array intrinsic - + * i8* llvm.visc.createNode2D(function*, i64, i64, i64); + */ + def int_visc_createNode3D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, + llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], + []>; + + /* Create dataflow edge intrinsic - + * i8* llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1); + */ + def int_visc_createEdge : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty, + llvm_i1_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i1_ty], + []>; + + /* Create bind input intrinsic - + * void llvm.visc.bind.input(i8*, i32, i32); + */ + def int_visc_bind_input : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i1_ty], []>; + + /* Create bind output intrinsic - + * void llvm.visc.bind.output(i8*, i32, i32); + */ + def int_visc_bind_output : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i1_ty], []>; + + /* Find associated dataflow node intrinsic - + * i8* llvm.visc.getNode(); + */ + def int_visc_getNode : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; + + /* Find parent dataflow node intrinsic - + * i8* llvm.visc.getParentNode(i8*); + */ + def int_visc_getParentNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>; + + /* Find the number of dimensions of a dataflow node intrinsic - + * i32 llvm.visc.getNumDims(i8*); + */ + def int_visc_getNumDims : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; + + /* Find 
the unique indentifier of a dataflow node (with respect to his parent + * node) in the specified dimension intrinsic - + */ + + /* i64 llvm.visc.getNodeInstanceID.[xyz](i8*); + */ + def int_visc_getNodeInstanceID_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + def int_visc_getNodeInstanceID_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + def int_visc_getNodeInstanceID_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + /* Find the number of instances of a dataflow node in the specified dimension + * intrinsic - + */ + + /* i64 llvm.visc.getNumNodeInstances.[xyz](i8*); + */ + def int_visc_getNumNodeInstances_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + def int_visc_getNumNodeInstances_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + def int_visc_getNumNodeInstances_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], + [IntrNoMem]>; + + /* Local Barrier + * void llvm.visc.barrier(); + */ + def int_visc_barrier : Intrinsic<[], [], []>; + + /* Memory allocation inside the graph + * i8* llvm.visc.malloc(); + */ + def int_visc_malloc : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty], []>; + + /* Find the vector length supported by target architecture + * intrinsic - + * i32 llvm.visc.getVectorLength(); + */ + def int_visc_getVectorLength : Intrinsic<[llvm_i32_ty], [], []>; + + /* ============ Atomic intrinsics ============= */ + // Atomic arithmetic operations + + /* i32 llvm.visc.atomic.add(i32*, i32)*/ + def int_visc_atomic_add: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.visc.atomic.sub(i32*, i32)*/ + def int_visc_atomic_sub: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.visc.atomic.xchg(i32*, i32)*/ + def int_visc_atomic_xchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.visc.atomic.min(i32*, i32)*/ + def int_visc_atomic_min: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 
llvm.visc.atomic.maxi32*, i32)*/ + def int_visc_atomic_max: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + // Atomic bitwise operations + + /* i32 llvm.visc.atomic.and(i32*, i32)*/ + def int_visc_atomic_and: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.visc.atomic.or(i32*, i32)*/ + def int_visc_atomic_or: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + + /* i32 llvm.visc.atomic.xor(i32*, i32)*/ + def int_visc_atomic_xor: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], + []>; + +} diff --git a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp index 2c54392f80..a924405a2c 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp +++ b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp @@ -855,7 +855,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(bit); KEYWORD(varFlags); - // HPVM parameter attributes + // VISC parameter attributes KEYWORD(in); KEYWORD(out); KEYWORD(inout); diff --git a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp index 7446ff1e32..f5ce44e2a9 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp +++ b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp @@ -1470,7 +1470,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_swiftself: case lltok::kw_immarg: - // HPVM Parameter only attributes + // VISC Parameter only attributes case lltok::kw_in: case lltok::kw_out: case lltok::kw_inout: @@ -1808,7 +1808,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { B.addAttribute(Attribute::ImmArg); break; - // HPVM parameter attributes + // VISC parameter attributes case lltok::kw_in: B.addAttribute(Attribute::In); break; @@ -1927,7 +1927,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { case lltok::kw_swiftself: case lltok::kw_immarg: - // HPVM Parameter only attributes + // VISC Parameter only attributes case lltok::kw_in: case lltok::kw_out: case lltok::kw_inout: diff 
--git a/hpvm/llvm_patches/lib/AsmParser/LLToken.h b/hpvm/llvm_patches/lib/AsmParser/LLToken.h index cb0479b41c..7f9816965b 100644 --- a/hpvm/llvm_patches/lib/AsmParser/LLToken.h +++ b/hpvm/llvm_patches/lib/AsmParser/LLToken.h @@ -351,7 +351,7 @@ enum Kind { kw_insertvalue, kw_blockaddress, - // HPVM parameter attributes + // VISC parameter attributes kw_in, kw_out, kw_inout, diff --git a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp index a1e6447285..7eb289d587 100644 --- a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1395,7 +1395,7 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) { case Attribute::NoFree: return 1ULL << 63; - // HPVM Attributes + // VISC Attributes case Attribute::In: return 3ULL << 0; case Attribute::Out: diff --git a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp index fd671c3975..55e7415efb 100644 --- a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -773,7 +773,7 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { case Attribute::SanitizeMemTag: return bitc::ATTR_KIND_SANITIZE_MEMTAG; - // HPVM Attributes + // VISC Attributes case Attribute::In: return bitc::ATTR_KIND_IN; case Attribute::Out: diff --git a/hpvm/llvm_patches/lib/IR/Attributes.cpp b/hpvm/llvm_patches/lib/IR/Attributes.cpp index 29c47a9e11..3cc95b3102 100644 --- a/hpvm/llvm_patches/lib/IR/Attributes.cpp +++ b/hpvm/llvm_patches/lib/IR/Attributes.cpp @@ -404,7 +404,7 @@ std::string Attribute::getAsString(bool InAttrGrp) const { if (hasAttribute(Attribute::ImmArg)) return "immarg"; - // HPVM attributes for arguments + // VISC attributes for arguments if (hasAttribute(Attribute::In)) return "in"; if (hasAttribute(Attribute::Out)) diff --git a/hpvm/projects/hpvm-rt/CMakeLists.txt 
b/hpvm/projects/hpvm-rt/CMakeLists.txt deleted file mode 100644 index be7f69c4bf..0000000000 --- a/hpvm/projects/hpvm-rt/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -add_definitions(-DNUM_CORES=8) - -SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang) -SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++) - -add_llvm_library(hpvm-rt.ll hpvm-rt.cpp - - DEPENDS - clang - llvm-dis - ) - - -target_compile_options(hpvm-rt.ll PUBLIC -flto ) -target_compile_options(hpvm-rt.ll PUBLIC -std=c++11) - -add_custom_target(hpvm-rt.cpp.o ALL - COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libhpvm-rt.ll.a - COMMAND mv ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.cpp.o ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc - COMMAND ${CMAKE_BINARY_DIR}/bin/llvm-dis ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc) - -add_dependencies(hpvm-rt.cpp.o hpvm-rt.ll) diff --git a/hpvm/projects/visc-rt/CMakeLists.txt b/hpvm/projects/visc-rt/CMakeLists.txt new file mode 100644 index 0000000000..5b9449bf2d --- /dev/null +++ b/hpvm/projects/visc-rt/CMakeLists.txt @@ -0,0 +1,22 @@ +add_definitions(-DNUM_CORES=8) + +SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang) +SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++) + +add_llvm_library(visc-rt.ll visc-rt.cpp + + DEPENDS + clang + llvm-dis + ) + + +target_compile_options(visc-rt.ll PUBLIC -flto ) +target_compile_options(visc-rt.ll PUBLIC -std=c++11) + +add_custom_target(visc-rt.cpp.o ALL + COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libvisc-rt.ll.a + COMMAND mv ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.cpp.o ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.bc + COMMAND ${CMAKE_BINARY_DIR}/bin/llvm-dis ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.bc) + +add_dependencies(visc-rt.cpp.o visc-rt.ll) diff --git a/hpvm/projects/hpvm-rt/deviceStatusSwitchIntervals.txt b/hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt similarity index 100% rename from 
hpvm/projects/hpvm-rt/deviceStatusSwitchIntervals.txt rename to hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt diff --git a/hpvm/projects/hpvm-rt/device_abstraction.h b/hpvm/projects/visc-rt/device_abstraction.h similarity index 96% rename from hpvm/projects/hpvm-rt/device_abstraction.h rename to hpvm/projects/visc-rt/device_abstraction.h index 4948502ce8..7e77d100de 100644 --- a/hpvm/projects/hpvm-rt/device_abstraction.h +++ b/hpvm/projects/visc-rt/device_abstraction.h @@ -27,7 +27,7 @@ void initializeDeviceStatusIntervals() { unsigned sz = 0; unsigned tmp = 0; - const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/hpvm-rt/" + const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/" "deviceStatusSwitchIntervals.txt"; std::ifstream infile; infile.open(fn); diff --git a/hpvm/projects/hpvm-rt/makefile b/hpvm/projects/visc-rt/makefile similarity index 97% rename from hpvm/projects/hpvm-rt/makefile rename to hpvm/projects/visc-rt/makefile index 927e26e254..adcc632335 100644 --- a/hpvm/projects/hpvm-rt/makefile +++ b/hpvm/projects/visc-rt/makefile @@ -9,7 +9,7 @@ ifeq ($(NUM_CORES),) endif CPP_FLAGS = -I$(LLVM_SRC_ROOT)/include -I$(LLVM_BUILD_ROOT)/include -I$(CUDA_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS -TARGET:=hpvm-rt +TARGET:=visc-rt LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++ diff --git a/hpvm/projects/hpvm-rt/policy.h b/hpvm/projects/visc-rt/policy.h similarity index 100% rename from hpvm/projects/hpvm-rt/policy.h rename to hpvm/projects/visc-rt/policy.h diff --git a/hpvm/projects/hpvm-rt/hpvm-rt.cpp b/hpvm/projects/visc-rt/visc-rt.cpp similarity index 82% rename from hpvm/projects/hpvm-rt/hpvm-rt.cpp rename to hpvm/projects/visc-rt/visc-rt.cpp index ec2534cf43..53d3b516f2 100644 --- a/hpvm/projects/hpvm-rt/hpvm-rt.cpp +++ b/hpvm/projects/visc-rt/visc-rt.cpp @@ -13,7 +13,7 @@ #if _POSIX_VERSION >= 200112L #include <sys/time.h> #endif -#include "hpvm-rt.h" +#include "visc-rt.h" 
#ifndef DEBUG_BUILD #define DEBUG(s) \ @@ -59,7 +59,7 @@ vector<DFGDepth> DStack; pthread_mutex_t ocl_mtx; #define NUM_TESTS 1 -hpvm_TimerSet kernel_timer; +visc_TimerSet kernel_timer; static inline void checkErr(cl_int err, cl_int success, const char *name) { if (err != success) { @@ -70,7 +70,7 @@ static inline void checkErr(cl_int err, cl_int success, const char *name) { } /************************* Policies *************************************/ -void llvm_hpvm_policy_init() { +void llvm_visc_policy_init() { cout << "Initializing policy object ...\n"; // policy = new NodePolicy(); // policy = new IterationPolicy(); @@ -80,19 +80,19 @@ void llvm_hpvm_policy_init() { cout << "DONE: Initializing policy object.\n"; } -void llvm_hpvm_policy_clear() { +void llvm_visc_policy_clear() { if (policy) free(policy); } -int llvm_hpvm_policy_getVersion(const char *name, int64_t i) { +int llvm_visc_policy_getVersion(const char *name, int64_t i) { return policy->getVersion(name, i); } /******************** Device Abstraction ********************************/ std::thread deviceStatusThread; -void llvm_hpvm_deviceAbstraction_start() { +void llvm_visc_deviceAbstraction_start() { cout << "Starting device status simulation ...\n"; // Initialize vector with points where ti switch device status initializeDeviceStatusIntervals(); @@ -102,7 +102,7 @@ void llvm_hpvm_deviceAbstraction_start() { return; } -void llvm_hpvm_deviceAbstraction_end() { +void llvm_visc_deviceAbstraction_end() { cout << "Ending device status simulation thread ...\n"; // Set the variable that allows the thread to know that execution has ended executionEnd = true; @@ -112,7 +112,7 @@ void llvm_hpvm_deviceAbstraction_end() { return; } -void llvm_hpvm_deviceAbstraction_waitOnDeviceStatus() { +void llvm_visc_deviceAbstraction_waitOnDeviceStatus() { while (!deviceStatus) { }; return; @@ -120,7 +120,7 @@ void llvm_hpvm_deviceAbstraction_waitOnDeviceStatus() { /************************* Depth Stack Routines 
***************************/ -void llvm_hpvm_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, +void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, uint64_t limitY, uint64_t iY, uint64_t limitZ, uint64_t iZ) { DEBUG(cout << "Pushing node information on stack:\n"); @@ -134,7 +134,7 @@ void llvm_hpvm_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, pthread_mutex_unlock(&ocl_mtx); } -void llvm_hpvm_x86_dstack_pop() { +void llvm_visc_x86_dstack_pop() { DEBUG(cout << "Popping from depth stack\n"); pthread_mutex_lock(&ocl_mtx); DStack.pop_back(); @@ -142,7 +142,7 @@ void llvm_hpvm_x86_dstack_pop() { pthread_mutex_unlock(&ocl_mtx); } -uint64_t llvm_hpvm_x86_getDimLimit(unsigned level, unsigned dim) { +uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) { DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level << flush << "\n"); pthread_mutex_lock(&ocl_mtx); @@ -154,7 +154,7 @@ uint64_t llvm_hpvm_x86_getDimLimit(unsigned level, unsigned dim) { return result; } -uint64_t llvm_hpvm_x86_getDimInstance(unsigned level, unsigned dim) { +uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) { DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level << flush << "\n"); pthread_mutex_lock(&ocl_mtx); @@ -168,7 +168,7 @@ uint64_t llvm_hpvm_x86_getDimInstance(unsigned level, unsigned dim) { /********************** Memory Tracking Routines **************************/ -void llvm_hpvm_track_mem(void *ptr, size_t size) { +void llvm_visc_track_mem(void *ptr, size_t size) { DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n"); MemTrackerEntry *MTE = MTracker.lookup(ptr); if (MTE != NULL) { @@ -180,7 +180,7 @@ void llvm_hpvm_track_mem(void *ptr, size_t size) { DEBUG(MTracker.print()); } -void llvm_hpvm_untrack_mem(void *ptr) { +void llvm_visc_untrack_mem(void *ptr) { DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n"); MemTrackerEntry *MTE = MTracker.lookup(ptr); 
if (MTE == NULL) { @@ -195,7 +195,7 @@ void llvm_hpvm_untrack_mem(void *ptr) { DEBUG(MTracker.print()); } -static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size, +static void *llvm_visc_ocl_request_mem(void *ptr, size_t size, DFNodeContext_OCL *Context, bool isInput, bool isOutput) { pthread_mutex_lock(&ocl_mtx); @@ -233,7 +233,7 @@ static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size, else clFlags = CL_MEM_READ_ONLY; - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); // pthread_mutex_lock(&ocl_mtx); cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode); @@ -249,7 +249,7 @@ static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size, checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device"); } - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); DEBUG(cout << " done\n"); MTE->update(MemTrackerEntry::DEVICE, (void *)d_input, Context); DEBUG(cout << "Updated Table\n"); @@ -258,11 +258,11 @@ static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size, return d_input; } -void *llvm_hpvm_x86_argument_ptr(void *ptr, size_t size) { - return llvm_hpvm_request_mem(ptr, size); +void *llvm_visc_x86_argument_ptr(void *ptr, size_t size) { + return llvm_visc_request_mem(ptr, size); } -void *llvm_hpvm_request_mem(void *ptr, size_t size) { +void *llvm_visc_request_mem(void *ptr, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n"); MemTrackerEntry *MTE = MTracker.lookup(ptr); @@ -283,13 +283,13 @@ void *llvm_hpvm_request_mem(void *ptr, size_t size) { DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); DEBUG(cout << "\tCopying ..."); - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); // pthread_mutex_lock(&ocl_mtx); cl_int errcode = clEnqueueReadBuffer( 
((DFNodeContext_OCL *)MTE->getContext())->clCommandQue, (cl_mem)MTE->getAddress(), CL_TRUE, 0, size, ptr, 0, NULL, NULL); // pthread_mutex_unlock(&ocl_mtx); - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); DEBUG(cout << " done\n"); checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output"); DEBUG(cout << "Free mem object on device\n"); @@ -303,25 +303,25 @@ void *llvm_hpvm_request_mem(void *ptr, size_t size) { /*************************** Timer Routines **********************************/ -static int is_async(enum hpvm_TimerID timer) { - return (timer == hpvm_TimerID_KERNEL) || (timer == hpvm_TimerID_COPY_ASYNC); +static int is_async(enum visc_TimerID timer) { + return (timer == visc_TimerID_KERNEL) || (timer == visc_TimerID_COPY_ASYNC); } -static int is_blocking(enum hpvm_TimerID timer) { - return (timer == hpvm_TimerID_COPY) || (timer == hpvm_TimerID_NONE); +static int is_blocking(enum visc_TimerID timer) { + return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE); } -#define INVALID_TIMERID hpvm_TimerID_LAST +#define INVALID_TIMERID visc_TimerID_LAST -static int asyncs_outstanding(struct hpvm_TimerSet *timers) { +static int asyncs_outstanding(struct visc_TimerSet *timers) { return (timers->async_markers != NULL) && (timers->async_markers->timerID != INVALID_TIMERID); } -static struct hpvm_async_time_marker_list * -get_last_async(struct hpvm_TimerSet *timers) { +static struct visc_async_time_marker_list * +get_last_async(struct visc_TimerSet *timers) { /* Find the last event recorded thus far */ - struct hpvm_async_time_marker_list *last_event = timers->async_markers; + struct visc_async_time_marker_list *last_event = timers->async_markers; if (last_event != NULL && last_event->timerID != INVALID_TIMERID) { while (last_event->next != NULL && last_event->next->timerID != INVALID_TIMERID) @@ -331,17 +331,17 @@ get_last_async(struct hpvm_TimerSet *timers) { return NULL; } -static 
void insert_marker(struct hpvm_TimerSet *tset, enum hpvm_TimerID timer) { +static void insert_marker(struct visc_TimerSet *tset, enum visc_TimerID timer) { cl_int ciErrNum = CL_SUCCESS; - struct hpvm_async_time_marker_list **new_event = &(tset->async_markers); + struct visc_async_time_marker_list **new_event = &(tset->async_markers); while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { new_event = &((*new_event)->next); } if (*new_event == NULL) { - *new_event = (struct hpvm_async_time_marker_list *)malloc( - sizeof(struct hpvm_async_time_marker_list)); + *new_event = (struct visc_async_time_marker_list *)malloc( + sizeof(struct visc_async_time_marker_list)); (*new_event)->marker = calloc(1, sizeof(cl_event)); /* // I don't think this is needed at all. I believe clEnqueueMarker 'creates' @@ -372,18 +372,18 @@ Event Status!\n"); } } -static void insert_submarker(struct hpvm_TimerSet *tset, char *label, - enum hpvm_TimerID timer) { +static void insert_submarker(struct visc_TimerSet *tset, char *label, + enum visc_TimerID timer) { cl_int ciErrNum = CL_SUCCESS; - struct hpvm_async_time_marker_list **new_event = &(tset->async_markers); + struct visc_async_time_marker_list **new_event = &(tset->async_markers); while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { new_event = &((*new_event)->next); } if (*new_event == NULL) { - *new_event = (struct hpvm_async_time_marker_list *)malloc( - sizeof(struct hpvm_async_time_marker_list)); + *new_event = (struct visc_async_time_marker_list *)malloc( + sizeof(struct visc_async_time_marker_list)); (*new_event)->marker = calloc(1, sizeof(cl_event)); /* #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) @@ -414,10 +414,10 @@ Event Status!\n"); } /* Assumes that all recorded events have completed */ -static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) { - struct hpvm_async_time_marker_list *next_interval = NULL; - struct hpvm_async_time_marker_list *last_marker = get_last_async(tset); 
- hpvm_Timestamp total_async_time = 0; +static visc_Timestamp record_async_times(struct visc_TimerSet *tset) { + struct visc_async_time_marker_list *next_interval = NULL; + struct visc_async_time_marker_list *last_marker = get_last_async(tset); + visc_Timestamp total_async_time = 0; for (next_interval = tset->async_markers; next_interval != last_marker; next_interval = next_interval->next) { @@ -439,11 +439,11 @@ static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) { ciErrNum); } - hpvm_Timestamp interval = - (hpvm_Timestamp)(((double)(command_end - command_start))); + visc_Timestamp interval = + (visc_Timestamp)(((double)(command_end - command_start))); tset->timers[next_interval->timerID].elapsed += interval; if (next_interval->label != NULL) { - struct hpvm_SubTimer *subtimer = + struct visc_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; while (subtimer != NULL) { if (strcmp(subtimer->label, next_interval->label) == 0) { @@ -463,8 +463,8 @@ static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) { return total_async_time; } -static void accumulate_time(hpvm_Timestamp *accum, hpvm_Timestamp start, - hpvm_Timestamp end) { +static void accumulate_time(visc_Timestamp *accum, visc_Timestamp start, + visc_Timestamp end) { #if _POSIX_VERSION >= 200112L *accum += end - start; #else @@ -473,33 +473,33 @@ static void accumulate_time(hpvm_Timestamp *accum, hpvm_Timestamp start, } #if _POSIX_VERSION >= 200112L -static hpvm_Timestamp get_time() { +static visc_Timestamp get_time() { struct timespec tv; clock_gettime(CLOCK_MONOTONIC, &tv); - return (hpvm_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec); + return (visc_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec); } #else #error "no supported time libraries are available on this platform" #endif -void hpvm_ResetTimer(struct hpvm_Timer *timer) { - timer->state = hpvm_Timer_STOPPED; +void visc_ResetTimer(struct visc_Timer *timer) { + timer->state = visc_Timer_STOPPED; 
#if _POSIX_VERSION >= 200112L timer->elapsed = 0; #else -#error "hpvm_ResetTimer: not implemented for this system" +#error "visc_ResetTimer: not implemented for this system" #endif } -void hpvm_StartTimer(struct hpvm_Timer *timer) { - if (timer->state != hpvm_Timer_STOPPED) { +void visc_StartTimer(struct visc_Timer *timer) { + if (timer->state != visc_Timer_STOPPED) { // FIXME: Removing warning statement to avoid printing this error // fputs("Ignoring attempt to start a running timer\n", stderr); return; } - timer->state = hpvm_Timer_RUNNING; + timer->state = visc_Timer_RUNNING; #if _POSIX_VERSION >= 200112L { @@ -508,19 +508,19 @@ void hpvm_StartTimer(struct hpvm_Timer *timer) { timer->init = tv.tv_sec * BILLION + tv.tv_nsec; } #else -#error "hpvm_StartTimer: not implemented for this system" +#error "visc_StartTimer: not implemented for this system" #endif } -void hpvm_StartTimerAndSubTimer(struct hpvm_Timer *timer, - struct hpvm_Timer *subtimer) { +void visc_StartTimerAndSubTimer(struct visc_Timer *timer, + struct visc_Timer *subtimer) { unsigned int numNotStopped = 0x3; // 11 - if (timer->state != hpvm_Timer_STOPPED) { + if (timer->state != visc_Timer_STOPPED) { fputs("Warning: Timer was not stopped\n", stderr); numNotStopped &= 0x1; // Zero out 2^1 } - if (subtimer->state != hpvm_Timer_STOPPED) { + if (subtimer->state != visc_Timer_STOPPED) { fputs("Warning: Subtimer was not stopped\n", stderr); numNotStopped &= 0x2; // Zero out 2^0 } @@ -529,8 +529,8 @@ void hpvm_StartTimerAndSubTimer(struct hpvm_Timer *timer, return; } - timer->state = hpvm_Timer_RUNNING; - subtimer->state = hpvm_Timer_RUNNING; + timer->state = visc_Timer_RUNNING; + subtimer->state = visc_Timer_RUNNING; #if _POSIX_VERSION >= 200112L { @@ -546,19 +546,19 @@ void hpvm_StartTimerAndSubTimer(struct hpvm_Timer *timer, } } #else -#error "hpvm_StartTimer: not implemented for this system" +#error "visc_StartTimer: not implemented for this system" #endif } -void hpvm_StopTimer(struct hpvm_Timer 
*timer) { - hpvm_Timestamp fini; +void visc_StopTimer(struct visc_Timer *timer) { + visc_Timestamp fini; - if (timer->state != hpvm_Timer_RUNNING) { + if (timer->state != visc_Timer_RUNNING) { // fputs("Ignoring attempt to stop a stopped timer\n", stderr); return; } - timer->state = hpvm_Timer_STOPPED; + timer->state = visc_Timer_STOPPED; #if _POSIX_VERSION >= 200112L { @@ -567,24 +567,24 @@ void hpvm_StopTimer(struct hpvm_Timer *timer) { fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -#error "hpvm_StopTimer: not implemented for this system" +#error "visc_StopTimer: not implemented for this system" #endif accumulate_time(&timer->elapsed, timer->init, fini); timer->init = fini; } -void hpvm_StopTimerAndSubTimer(struct hpvm_Timer *timer, - struct hpvm_Timer *subtimer) { +void visc_StopTimerAndSubTimer(struct visc_Timer *timer, + struct visc_Timer *subtimer) { - hpvm_Timestamp fini; + visc_Timestamp fini; unsigned int numNotRunning = 0x3; // 11 - if (timer->state != hpvm_Timer_RUNNING) { + if (timer->state != visc_Timer_RUNNING) { fputs("Warning: Timer was not running\n", stderr); numNotRunning &= 0x1; // Zero out 2^1 } - if (subtimer->state != hpvm_Timer_RUNNING) { + if (subtimer->state != visc_Timer_RUNNING) { fputs("Warning: Subtimer was not running\n", stderr); numNotRunning &= 0x2; // Zero out 2^0 } @@ -593,8 +593,8 @@ void hpvm_StopTimerAndSubTimer(struct hpvm_Timer *timer, return; } - timer->state = hpvm_Timer_STOPPED; - subtimer->state = hpvm_Timer_STOPPED; + timer->state = visc_Timer_STOPPED; + subtimer->state = visc_Timer_STOPPED; #if _POSIX_VERSION >= 200112L { @@ -603,7 +603,7 @@ void hpvm_StopTimerAndSubTimer(struct hpvm_Timer *timer, fini = tv.tv_sec * BILLION + tv.tv_nsec; } #else -#error "hpvm_StopTimer: not implemented for this system" +#error "visc_StopTimer: not implemented for this system" #endif if (numNotRunning & 0x2) { @@ -618,59 +618,59 @@ void hpvm_StopTimerAndSubTimer(struct hpvm_Timer *timer, } /* Get the elapsed time in seconds. 
*/ -double hpvm_GetElapsedTime(struct hpvm_Timer *timer) { +double visc_GetElapsedTime(struct visc_Timer *timer) { double ret; - if (timer->state != hpvm_Timer_STOPPED) { + if (timer->state != visc_Timer_STOPPED) { fputs("Elapsed time from a running timer is inaccurate\n", stderr); } #if _POSIX_VERSION >= 200112L ret = timer->elapsed / 1e9; #else -#error "hpvm_GetElapsedTime: not implemented for this system" +#error "visc_GetElapsedTime: not implemented for this system" #endif return ret; } -void hpvm_InitializeTimerSet(struct hpvm_TimerSet *timers) { +void visc_InitializeTimerSet(struct visc_TimerSet *timers) { int n; timers->wall_begin = get_time(); - timers->current = hpvm_TimerID_NONE; + timers->current = visc_TimerID_NONE; timers->async_markers = NULL; - for (n = 0; n < hpvm_TimerID_LAST; n++) { - hpvm_ResetTimer(&timers->timers[n]); + for (n = 0; n < visc_TimerID_LAST; n++) { + visc_ResetTimer(&timers->timers[n]); timers->sub_timer_list[n] = NULL; } } -void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label, - enum hpvm_TimerID hpvm_Category) { +void visc_AddSubTimer(struct visc_TimerSet *timers, char *label, + enum visc_TimerID visc_Category) { - struct hpvm_SubTimer *subtimer = - (struct hpvm_SubTimer *)malloc(sizeof(struct hpvm_SubTimer)); + struct visc_SubTimer *subtimer = + (struct visc_SubTimer *)malloc(sizeof(struct visc_SubTimer)); int len = strlen(label); subtimer->label = (char *)malloc(sizeof(char) * (len + 1)); sprintf(subtimer->label, "%s", label); - hpvm_ResetTimer(&subtimer->timer); + visc_ResetTimer(&subtimer->timer); subtimer->next = NULL; - struct hpvm_SubTimerList *subtimerlist = - timers->sub_timer_list[hpvm_Category]; + struct visc_SubTimerList *subtimerlist = + timers->sub_timer_list[visc_Category]; if (subtimerlist == NULL) { subtimerlist = - (struct hpvm_SubTimerList *)calloc(1, sizeof(struct hpvm_SubTimerList)); + (struct visc_SubTimerList *)calloc(1, sizeof(struct visc_SubTimerList)); subtimerlist->subtimer_list = subtimer; - 
timers->sub_timer_list[hpvm_Category] = subtimerlist; + timers->sub_timer_list[visc_Category] = subtimerlist; } else { // Append to list - struct hpvm_SubTimer *element = subtimerlist->subtimer_list; + struct visc_SubTimer *element = subtimerlist->subtimer_list; while (element->next != NULL) { element = element->next; } @@ -678,37 +678,37 @@ void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label, } } -void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) { +void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) { // cerr << "Switch to timer: " << timer << flush << "\n"; /* Stop the currently running timer */ - if (timers->current != hpvm_TimerID_NONE) { - struct hpvm_SubTimerList *subtimerlist = + if (timers->current != visc_TimerID_NONE) { + struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct hpvm_SubTimer *currSubTimer = + struct visc_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; if (!is_async(timers->current)) { if (timers->current != timer) { if (currSubTimer != NULL) { - hpvm_StopTimerAndSubTimer(&timers->timers[timers->current], + visc_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); } else { - hpvm_StopTimer(&timers->timers[timers->current]); + visc_StopTimer(&timers->timers[timers->current]); } } else { if (currSubTimer != NULL) { - hpvm_StopTimer(&currSubTimer->timer); + visc_StopTimer(&currSubTimer->timer); } } } else { insert_marker(timers, timer); if (!is_async(timer)) { // if switching to async too, keep driver going - hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]); + visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]); } } } - hpvm_Timestamp currentTime = get_time(); + visc_Timestamp currentTime = get_time(); /* The only cases we check for asynchronous task completion is * when an overlapping CPU operation completes, or the next @@ -716,7 +716,7 @@ void hpvm_SwitchToTimer(struct 
hpvm_TimerSet *timers, enum hpvm_TimerID timer) { if (asyncs_outstanding(timers) && (!is_async(timers->current) || is_blocking(timer))) { - struct hpvm_async_time_marker_list *last_event = get_last_async(timers); + struct visc_async_time_marker_list *last_event = get_last_async(timers); /* CL_COMPLETE if completed */ cl_int ciErrNum = CL_SUCCESS; @@ -736,7 +736,7 @@ void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) { // timer to switch to is COPY or NONE if (async_done != CL_COMPLETE) { - accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed), + accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), timers->async_begin, currentTime); } @@ -746,14 +746,14 @@ void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) { fprintf(stderr, "Error Waiting for Events!\n"); } - hpvm_Timestamp total_async_time = record_async_times(timers); + visc_Timestamp total_async_time = record_async_times(timers); /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ if (async_done == CL_COMPLETE) { // fprintf(stderr, "Async_done: total_async_type = %lld\n", // total_async_time); - timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time; + timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; } } else @@ -763,15 +763,15 @@ void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) { if (async_done == CL_COMPLETE) { /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - timers->timers[hpvm_TimerID_OVERLAP].elapsed += + timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers); } } /* Start the new timer */ - if (timer != hpvm_TimerID_NONE) { + if (timer != visc_TimerID_NONE) { if (!is_async(timer)) { - hpvm_StartTimer(&timers->timers[timer]); + visc_StartTimer(&timers->timers[timer]); } else { // toSwitchTo Is Async (KERNEL/COPY_ASYNC) if 
(!asyncs_outstanding(timers)) { @@ -785,48 +785,48 @@ void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) { * so we can rename that marker as the beginning of this async * operation */ - struct hpvm_async_time_marker_list *last_event = get_last_async(timers); + struct visc_async_time_marker_list *last_event = get_last_async(timers); last_event->label = NULL; last_event->timerID = timer; } if (!is_async(timers->current)) { - hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]); + visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]); } } } timers->current = timer; } -void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, - enum hpvm_TimerID category) { - struct hpvm_SubTimerList *subtimerlist = +void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, + enum visc_TimerID category) { + struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct hpvm_SubTimer *curr = + struct visc_SubTimer *curr = (subtimerlist != NULL) ? 
subtimerlist->current : NULL; - if (timers->current != hpvm_TimerID_NONE) { + if (timers->current != visc_TimerID_NONE) { if (!is_async(timers->current)) { if (timers->current != category) { if (curr != NULL) { - hpvm_StopTimerAndSubTimer(&timers->timers[timers->current], + visc_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); } else { - hpvm_StopTimer(&timers->timers[timers->current]); + visc_StopTimer(&timers->timers[timers->current]); } } else { if (curr != NULL) { - hpvm_StopTimer(&curr->timer); + visc_StopTimer(&curr->timer); } } } else { insert_submarker(timers, label, category); if (!is_async(category)) { // if switching to async too, keep driver going - hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]); + visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]); } } } - hpvm_Timestamp currentTime = get_time(); + visc_Timestamp currentTime = get_time(); /* The only cases we check for asynchronous task completion is * when an overlapping CPU operation completes, or the next @@ -834,7 +834,7 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, if (asyncs_outstanding(timers) && (!is_async(timers->current) || is_blocking(category))) { - struct hpvm_async_time_marker_list *last_event = get_last_async(timers); + struct visc_async_time_marker_list *last_event = get_last_async(timers); /* CL_COMPLETE if completed */ cl_int ciErrNum = CL_SUCCESS; @@ -858,7 +858,7 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, // because everything is being stopped to wait for synchronization it // seems that the extra sync wall time isn't being recorded anywhere if (async_done != CL_COMPLETE) - accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed), + accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), timers->async_begin, currentTime); /* Wait on async operation completion */ @@ -866,7 +866,7 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, if (ciErrNum != CL_SUCCESS) { 
fprintf(stderr, "Error Waiting for Events!\n"); } - hpvm_Timestamp total_async_time = record_async_times(timers); + visc_Timestamp total_async_time = record_async_times(timers); /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ @@ -874,7 +874,7 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, // into OVERLAP the immediately preceding EventSynchronize theoretically // didn't have any effect since it was already completed. if (async_done == CL_COMPLETE /*cudaSuccess*/) - timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time; + timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; } else /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ @@ -883,14 +883,14 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, if (async_done == CL_COMPLETE /*cudaSuccess*/) { /* Async operations completed before previous CPU operations: * overlapped time is the total async time */ - timers->timers[hpvm_TimerID_OVERLAP].elapsed += + timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers); } // else, this isn't blocking, so just check the next time around } subtimerlist = timers->sub_timer_list[category]; - struct hpvm_SubTimer *subtimer = NULL; + struct visc_SubTimer *subtimer = NULL; if (label != NULL) { subtimer = subtimerlist->subtimer_list; @@ -904,18 +904,18 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, } /* Start the new timer */ - if (category != hpvm_TimerID_NONE) { + if (category != visc_TimerID_NONE) { if (!is_async(category)) { if (subtimerlist != NULL) { subtimerlist->current = subtimer; } if (category != timers->current && subtimer != NULL) { - hpvm_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); + visc_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); } else if (subtimer != NULL) { - hpvm_StartTimer(&subtimer->timer); + 
visc_StartTimer(&subtimer->timer); } else { - hpvm_StartTimer(&timers->timers[category]); + visc_StartTimer(&timers->timers[category]); } } else { if (subtimerlist != NULL) { @@ -933,7 +933,7 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, * so we can rename that marker as the beginning of this async * operation */ - struct hpvm_async_time_marker_list *last_event = get_last_async(timers); + struct visc_async_time_marker_list *last_event = get_last_async(timers); last_event->timerID = category; last_event->label = label; } // else, marker for switchToThis was already inserted @@ -941,7 +941,7 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, // toSwitchto is already asynchronous, but if current/prev state is async // too, then DRIVER is already running if (!is_async(timers->current)) { - hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]); + visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]); } } } @@ -949,11 +949,11 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, timers->current = category; } -void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) { - hpvm_Timestamp wall_end = get_time(); +void visc_PrintTimerSet(struct visc_TimerSet *timers) { + visc_Timestamp wall_end = get_time(); - struct hpvm_Timer *t = timers->timers; - struct hpvm_SubTimer *sub = NULL; + struct visc_Timer *t = timers->timers; + struct visc_SubTimer *sub = NULL; int maxSubLength; @@ -970,13 +970,13 @@ void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) { const int maxCategoryLength = 20; int i; - for (i = 1; i < hpvm_TimerID_LAST; + for (i = 1; i < visc_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format - if (hpvm_GetElapsedTime(&t[i]) != 0 || true) { + if (visc_GetElapsedTime(&t[i]) != 0 || true) { // Print Category Timer printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1], - hpvm_GetElapsedTime(&t[i])); + visc_GetElapsedTime(&t[i])); if (timers->sub_timer_list[i] != NULL) { sub = 
timers->sub_timer_list[i]->subtimer_list; @@ -999,24 +999,24 @@ void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) { // Print SubTimers while (sub != NULL) { printf(" -%-*s: %.9f\n", maxSubLength, sub->label, - hpvm_GetElapsedTime(&sub->timer)); + visc_GetElapsedTime(&sub->timer)); sub = sub->next; } } } } - if (hpvm_GetElapsedTime(&t[hpvm_TimerID_OVERLAP]) != 0) + if (visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0) printf("CPU/Kernel Overlap: %.9f\n", - hpvm_GetElapsedTime(&t[hpvm_TimerID_OVERLAP])); + visc_GetElapsedTime(&t[visc_TimerID_OVERLAP])); float walltime = (wall_end - timers->wall_begin) / 1e9; printf("Timer Wall Time: %.9f\n", walltime); } -void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) { +void visc_DestroyTimerSet(struct visc_TimerSet *timers) { /* clean up all of the async event markers */ - struct hpvm_async_time_marker_list *event = timers->async_markers; + struct visc_async_time_marker_list *event = timers->async_markers; while (event != NULL) { cl_int ciErrNum = CL_SUCCESS; @@ -1031,7 +1031,7 @@ void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) { } free((event)->marker); - struct hpvm_async_time_marker_list *next = ((event)->next); + struct visc_async_time_marker_list *next = ((event)->next); free(event); @@ -1040,10 +1040,10 @@ void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) { } int i = 0; - for (i = 0; i < hpvm_TimerID_LAST; ++i) { + for (i = 0; i < visc_TimerID_LAST; ++i) { if (timers->sub_timer_list[i] != NULL) { - struct hpvm_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; - struct hpvm_SubTimer *prev = NULL; + struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; + struct visc_SubTimer *prev = NULL; while (subtimer != NULL) { free(subtimer->label); prev = subtimer; @@ -1059,7 +1059,7 @@ void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) { #define BUFFER_SIZE 1 // Launch API for a streaming dataflow graph -void *llvm_hpvm_streamLaunch(void (*LaunchFunc)(void *, void *), 
void *args) { +void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) { DFNodeContext_X86 *Context = (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86)); @@ -1081,7 +1081,7 @@ void *llvm_hpvm_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) { } // Push API for a streaming dataflow graph -void llvm_hpvm_streamPush(void *graphID, void *args) { +void llvm_visc_streamPush(void *graphID, void *args) { DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args << flush << "\n"); DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; @@ -1094,17 +1094,17 @@ void llvm_hpvm_streamPush(void *graphID, void *args) { if (Ctx->BindInSourcePort->at(j) == i) { // Push to all bind buffers connected to parent node at this port // DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); - llvm_hpvm_bufferPush(Ctx->BindInputBuffers->at(j), element); + llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element); } } } // Push 0 in isLastInput buffers of all child nodes for (CircularBuffer<uint64_t> *buffer : *(Ctx->isLastInputBuffers)) - llvm_hpvm_bufferPush(buffer, 0); + llvm_visc_bufferPush(buffer, 0); } // Pop API for a streaming dataflow graph -void *llvm_hpvm_streamPop(void *graphID) { +void *llvm_visc_streamPop(void *graphID) { DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n"); DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; unsigned totalBytes = 0; @@ -1113,7 +1113,7 @@ void *llvm_hpvm_streamPop(void *graphID) { void *output = malloc(totalBytes); unsigned offset = 0; for (unsigned i = 0; i < Ctx->BindOutputBuffers->size(); i++) { - uint64_t element = llvm_hpvm_bufferPop(Ctx->BindOutputBuffers->at(i)); + uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i)); // DEBUG(cout << "\tPopped Value " << element << " from buffer\n"); memcpy((char *)output + offset, &element, Ctx->BindOutSizes->at(i)); offset += Ctx->BindOutSizes->at(i); @@ -1122,24 +1122,24 @@ void *llvm_hpvm_streamPop(void 
*graphID) { } // Wait API for a streaming dataflow graph -void llvm_hpvm_streamWait(void *graphID) { +void llvm_visc_streamWait(void *graphID) { DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n"); DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID; // Push garbage to all other input buffers for (unsigned i = 0; i < Ctx->BindInputBuffers->size(); i++) { uint64_t element = 0; // DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); - llvm_hpvm_bufferPush(Ctx->BindInputBuffers->at(i), element); + llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element); } // Push 1 in isLastInput buffers of all child nodes for (unsigned i = 0; i < Ctx->isLastInputBuffers->size(); i++) - llvm_hpvm_bufferPush(Ctx->isLastInputBuffers->at(i), 1); + llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1); - llvm_hpvm_freeThreads(graphID); + llvm_visc_freeThreads(graphID); } // Create a buffer and return the bufferID -void *llvm_hpvm_createBindInBuffer(void *graphID, uint64_t size, +void *llvm_visc_createBindInBuffer(void *graphID, uint64_t size, unsigned inArgPort) { DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); @@ -1154,7 +1154,7 @@ void *llvm_hpvm_createBindInBuffer(void *graphID, uint64_t size, return bufferID; } -void *llvm_hpvm_createBindOutBuffer(void *graphID, uint64_t size) { +void *llvm_visc_createBindOutBuffer(void *graphID, uint64_t size) { DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; @@ -1166,7 +1166,7 @@ void *llvm_hpvm_createBindOutBuffer(void *graphID, uint64_t size) { Context->BindOutSizes->push_back(size); return bufferID; } -void *llvm_hpvm_createEdgeBuffer(void *graphID, uint64_t size) { +void *llvm_visc_createEdgeBuffer(void *graphID, uint64_t size) { DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); DFNodeContext_X86 *Context = 
(DFNodeContext_X86 *)graphID; @@ -1179,7 +1179,7 @@ void *llvm_hpvm_createEdgeBuffer(void *graphID, uint64_t size) { return bufferID; } -void *llvm_hpvm_createLastInputBuffer(void *graphID, uint64_t size) { +void *llvm_visc_createLastInputBuffer(void *graphID, uint64_t size) { DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; @@ -1192,7 +1192,7 @@ void *llvm_hpvm_createLastInputBuffer(void *graphID, uint64_t size) { } // Free buffers -void llvm_hpvm_freeBuffers(void *graphID) { +void llvm_visc_freeBuffers(void *graphID) { DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n"); DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID; for (CircularBuffer<uint64_t> *bufferID : *(Context->BindInputBuffers)) @@ -1206,19 +1206,19 @@ void llvm_hpvm_freeBuffers(void *graphID) { } // Pop an element from the buffer -uint64_t llvm_hpvm_bufferPop(void *bufferID) { +uint64_t llvm_visc_bufferPop(void *bufferID) { CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID; return buffer->pop(); } // Push an element into the buffer -void llvm_hpvm_bufferPush(void *bufferID, uint64_t element) { +void llvm_visc_bufferPush(void *bufferID, uint64_t element) { CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID; buffer->push(element); } // Create a thread -void llvm_hpvm_createThread(void *graphID, void *(*Func)(void *), +void llvm_visc_createThread(void *graphID, void *(*Func)(void *), void *arguments) { DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func << ", Args: " << arguments << flush << "\n"); @@ -1232,7 +1232,7 @@ void llvm_hpvm_createThread(void *graphID, void *(*Func)(void *), } // Wait for thread to finish -void llvm_hpvm_freeThreads(void *graphID) { +void llvm_visc_freeThreads(void *graphID) { DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n"); DFNodeContext_X86 *Ctx = 
(DFNodeContext_X86 *)graphID; for (pthread_t thread : *(Ctx->threads)) @@ -1241,7 +1241,7 @@ void llvm_hpvm_freeThreads(void *graphID) { /************************ OPENCL & PTHREAD API ********************************/ -void *llvm_hpvm_x86_launch(void *(*rootFunc)(void *), void *arguments) { +void *llvm_visc_x86_launch(void *(*rootFunc)(void *), void *arguments) { DFNodeContext_X86 *Context = (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86)); // int err; @@ -1252,7 +1252,7 @@ void *llvm_hpvm_x86_launch(void *(*rootFunc)(void *), void *arguments) { return Context; } -void llvm_hpvm_x86_wait(void *graphID) { +void llvm_visc_x86_wait(void *graphID) { DEBUG(cout << "Waiting for pthread to finish ...\n"); // DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; // pthread_join(Context->threadID, NULL); @@ -1260,9 +1260,9 @@ void llvm_hpvm_x86_wait(void *graphID) { DEBUG(cout << "\t... pthread Done!\n"); } -void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) { +void *llvm_visc_ocl_initContext(enum visc::Target T) { pthread_mutex_lock(&ocl_mtx); - DEBUG(std::string Target = T == hpvm::GPU_TARGET ? "GPU" : "SPIR"); + DEBUG(std::string Target = T == visc::GPU_TARGET ? "GPU" : "SPIR"); DEBUG(cout << "Initializing Context for " << Target << " device\n"); cl_uint numPlatforms; cl_int errcode; @@ -1299,10 +1299,10 @@ void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) { // assert(numPlatforms >= 2 && "Expecting two OpenCL platforms"); // Choose second one which is X86 AVX cl_context_properties properties[] = { - CL_CONTEXT_PLATFORM, (long)platforms[T == hpvm::GPU_TARGET ? 0 : 1], 0}; + CL_CONTEXT_PLATFORM, (long)platforms[T == visc::GPU_TARGET ? 0 : 1], 0}; globalOCLContext = clCreateContextFromType( properties, - T == hpvm::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL, + T == visc::GPU_TARGET ? 
CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL, NULL, &errcode); // get the list of OCL devices associated with context size_t dataBytes; @@ -1314,7 +1314,7 @@ void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) { errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); checkErr(errcode, CL_SUCCESS, "Failure to get context info"); - if (false && T == hpvm::SPIR_TARGET) { + if (false && T == visc::SPIR_TARGET) { cl_device_partition_property props[4]; props[0] = CL_DEVICE_PARTITION_BY_COUNTS; props[1] = NUM_CORES; @@ -1340,13 +1340,13 @@ void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) { checkErr(errcode, CL_SUCCESS, "Failure to create OCL context"); DEBUG(cout << "Initialize Kernel Timer\n"); - hpvm_InitializeTimerSet(&kernel_timer); + visc_InitializeTimerSet(&kernel_timer); pthread_mutex_unlock(&ocl_mtx); return globalOCLContext; } -void llvm_hpvm_ocl_clearContext(void *graphID) { +void llvm_visc_ocl_clearContext(void *graphID) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Clear Context\n"); DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; @@ -1359,12 +1359,12 @@ void llvm_hpvm_ocl_clearContext(void *graphID) { // DEBUG(cout << "Released context at: " << globalOCLContext); free(Context); DEBUG(cout << "Done with OCL kernel\n"); - cout << "Printing HPVM Timer: KernelTimer\n"; - hpvm_PrintTimerSet(&kernel_timer); + cout << "Printing VISC Timer: KernelTimer\n"; + visc_PrintTimerSet(&kernel_timer); pthread_mutex_unlock(&ocl_mtx); } -void llvm_hpvm_ocl_argument_shared(void *graphID, int arg_index, size_t size) { +void llvm_visc_ocl_argument_shared(void *graphID, int arg_index, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set Shared Memory Input:"); DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size @@ -1379,7 +1379,7 @@ void llvm_hpvm_ocl_argument_shared(void *graphID, int arg_index, size_t size) { pthread_mutex_unlock(&ocl_mtx); } -void llvm_hpvm_ocl_argument_scalar(void *graphID, 
void *input, int arg_index, +void llvm_visc_ocl_argument_scalar(void *graphID, void *input, int arg_index, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set Scalar Input:"); @@ -1395,7 +1395,7 @@ void llvm_hpvm_ocl_argument_scalar(void *graphID, void *input, int arg_index, pthread_mutex_unlock(&ocl_mtx); } -void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index, +void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index, size_t size, bool isInput, bool isOutput) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set Pointer Input:"); @@ -1409,7 +1409,7 @@ void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index, pthread_mutex_unlock(&ocl_mtx); // Check with runtime the location of this memory - cl_mem d_input = (cl_mem)llvm_hpvm_ocl_request_mem(input, size, Context, + cl_mem d_input = (cl_mem)llvm_visc_ocl_request_mem(input, size, Context, isInput, isOutput); pthread_mutex_lock(&ocl_mtx); @@ -1424,7 +1424,7 @@ void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index, return d_input; } -void *llvm_hpvm_ocl_output_ptr(void *graphID, int arg_index, size_t size) { +void *llvm_visc_ocl_output_ptr(void *graphID, int arg_index, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Set device memory for Output Struct:"); DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size @@ -1446,13 +1446,13 @@ void *llvm_hpvm_ocl_output_ptr(void *graphID, int arg_index, size_t size) { return d_output; } -void llvm_hpvm_ocl_free(void *ptr) { +void llvm_visc_ocl_free(void *ptr) { // DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n"); // cl_mem d_ptr = (cl_mem) ptr; // clReleaseMemObject(d_ptr); } -void *llvm_hpvm_ocl_getOutput(void *graphID, void *h_output, void *d_output, +void *llvm_visc_ocl_getOutput(void *graphID, void *h_output, void *d_output, size_t size) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Get Output:\n"); @@ -1471,7 +1471,7 @@ void 
*llvm_hpvm_ocl_getOutput(void *graphID, void *h_output, void *d_output, return h_output; } -void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim, +void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim, const size_t *localWorkSize, const size_t *globalWorkSize) { pthread_mutex_lock(&ocl_mtx); @@ -1517,7 +1517,7 @@ void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim, // pthread_mutex_lock(&ocl_mtx); clFinish(Context->clCommandQue); // pthread_mutex_unlock(&ocl_mtx); - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COMPUTATION); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION); // for(int i=0 ;i < NUM_TESTS; i++) { // cout << "Iteration = " << i << flush << "\n"; // pthread_mutex_lock(&ocl_mtx); @@ -1530,7 +1530,7 @@ void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim, // pthread_mutex_lock(&ocl_mtx); clFinish(Context->clCommandQue); // pthread_mutex_unlock(&ocl_mtx); - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); pthread_mutex_unlock(&ocl_mtx); return event; @@ -1579,7 +1579,7 @@ static char *LoadProgSource(const char *Filename, size_t *szFinalLength) { return cSourceString; } -void *llvm_hpvm_ocl_launch(const char *FileName, const char *KernelName) { +void *llvm_visc_ocl_launch(const char *FileName, const char *KernelName) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Launch OCL Kernel\n"); // Initialize OpenCL @@ -1649,7 +1649,7 @@ void *llvm_hpvm_ocl_launch(const char *FileName, const char *KernelName) { return Context; } -void llvm_hpvm_ocl_wait(void *graphID) { +void llvm_visc_ocl_wait(void *graphID) { pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "Wait\n"); DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; @@ -1659,27 +1659,27 @@ void llvm_hpvm_ocl_wait(void *graphID) { pthread_mutex_unlock(&ocl_mtx); } -void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID timer) { +void llvm_visc_switchToTimer(void **timerSet, enum 
visc_TimerID timer) { // cout << "Switching to timer " << timer << flush << "\n"; pthread_mutex_lock(&ocl_mtx); - // hpvm_SwitchToTimer((hpvm_TimerSet*)(*timerSet), timer); + // visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer); pthread_mutex_unlock(&ocl_mtx); } -void llvm_hpvm_printTimerSet(void **timerSet, char *timerName) { +void llvm_visc_printTimerSet(void **timerSet, char *timerName) { pthread_mutex_lock(&ocl_mtx); - cout << "Printing HPVM Timer: "; + cout << "Printing VISC Timer: "; if (timerName != NULL) cout << timerName << flush << "\n"; else cout << "Anonymous\n"; - hpvm_PrintTimerSet((hpvm_TimerSet *)(*timerSet)); + visc_PrintTimerSet((visc_TimerSet *)(*timerSet)); pthread_mutex_unlock(&ocl_mtx); } -void *llvm_hpvm_initializeTimerSet() { +void *llvm_visc_initializeTimerSet() { pthread_mutex_lock(&ocl_mtx); - hpvm_TimerSet *TS = (hpvm_TimerSet *)malloc(sizeof(hpvm_TimerSet)); - hpvm_InitializeTimerSet(TS); + visc_TimerSet *TS = (visc_TimerSet *)malloc(sizeof(visc_TimerSet)); + visc_InitializeTimerSet(TS); pthread_mutex_unlock(&ocl_mtx); return TS; } diff --git a/hpvm/projects/hpvm-rt/hpvm-rt.h b/hpvm/projects/visc-rt/visc-rt.h similarity index 72% rename from hpvm/projects/hpvm-rt/hpvm-rt.h rename to hpvm/projects/visc-rt/visc-rt.h index 2b6dafba96..3ad315768b 100644 --- a/hpvm/projects/hpvm-rt/hpvm-rt.h +++ b/hpvm/projects/visc-rt/visc-rt.h @@ -2,8 +2,8 @@ * * (c) 2010 The Board of Trustees of the University of Illinois. 
*/ -#ifndef HPVM_RT_HEADER -#define HPVM_RT_HEADER +#ifndef VISC_RT_HEADER +#define VISC_RT_HEADER #include <ctime> #include <iostream> @@ -13,8 +13,8 @@ #include <vector> //#include <condition_variable> -#include "../../include/SupportHPVM/HPVMHint.h" -#include "../../include/SupportHPVM/HPVMTimer.h" +#include "../../include/SupportVISC/VISCHint.h" +#include "../../include/SupportVISC/VISCTimer.h" #include "device_abstraction.h" #include "policy.h" @@ -31,14 +31,14 @@ extern "C" { /************************* Policies *************************************/ -void llvm_hpvm_policy_init(); -void llvm_hpvm_policy_clear(); -int llvm_hpvm_policy_getVersion(const char *, int64_t); +void llvm_visc_policy_init(); +void llvm_visc_policy_clear(); +int llvm_visc_policy_getVersion(const char *, int64_t); /******************** Device Abstraction ********************************/ -void llvm_hpvm_deviceAbstraction_start(); -void llvm_hpvm_deviceAbstraction_end(); -void llvm_hpvm_deviceAbstraction_waitOnDeviceStatus(); +void llvm_visc_deviceAbstraction_start(); +void llvm_visc_deviceAbstraction_end(); +void llvm_visc_deviceAbstraction_waitOnDeviceStatus(); /********************* DFG Depth Stack **********************************/ class DFGDepth { @@ -77,12 +77,12 @@ public: unsigned getNumDim() const { return numDim; } }; -void llvm_hpvm_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0, +void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0, uint64_t limitY = 0, uint64_t iY = 0, uint64_t limitZ = 0, uint64_t iZ = 0); -void llvm_hpvm_x86_dstack_pop(); -uint64_t llvm_hpvm_x86_getDimLimit(unsigned level, unsigned dim); -uint64_t llvm_hpvm_x86_getDimInstance(unsigned level, unsigned dim); +void llvm_visc_x86_dstack_pop(); +uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim); +uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim); /********************* Memory Tracker **********************************/ class 
MemTrackerEntry { @@ -156,32 +156,32 @@ public: } }; -void llvm_hpvm_track_mem(void *, size_t); -void llvm_hpvm_untrack_mem(void *); -void *llvm_hpvm_request_mem(void *, size_t); +void llvm_visc_track_mem(void *, size_t); +void llvm_visc_untrack_mem(void *); +void *llvm_visc_request_mem(void *, size_t); /*********************** OPENCL & PTHREAD API **************************/ -void *llvm_hpvm_x86_launch(void *(void *), void *); -void llvm_hpvm_x86_wait(void *); -void *llvm_hpvm_ocl_initContext(enum hpvm::Target); - -void *llvm_hpvm_x86_argument_ptr(void *, size_t); - -void llvm_hpvm_ocl_clearContext(void *); -void llvm_hpvm_ocl_argument_shared(void *, int, size_t); -void llvm_hpvm_ocl_argument_scalar(void *, void *, int, size_t); -void *llvm_hpvm_ocl_argument_ptr(void *, void *, int, size_t, bool, bool); -void *llvm_hpvm_ocl_output_ptr(void *, int, size_t); -void llvm_hpvm_ocl_free(void *); -void *llvm_hpvm_ocl_getOutput(void *, void *, void *, size_t); -void *llvm_hpvm_ocl_executeNode(void *, unsigned, const size_t *, +void *llvm_visc_x86_launch(void *(void *), void *); +void llvm_visc_x86_wait(void *); +void *llvm_visc_ocl_initContext(enum visc::Target); + +void *llvm_visc_x86_argument_ptr(void *, size_t); + +void llvm_visc_ocl_clearContext(void *); +void llvm_visc_ocl_argument_shared(void *, int, size_t); +void llvm_visc_ocl_argument_scalar(void *, void *, int, size_t); +void *llvm_visc_ocl_argument_ptr(void *, void *, int, size_t, bool, bool); +void *llvm_visc_ocl_output_ptr(void *, int, size_t); +void llvm_visc_ocl_free(void *); +void *llvm_visc_ocl_getOutput(void *, void *, void *, size_t); +void *llvm_visc_ocl_executeNode(void *, unsigned, const size_t *, const size_t *); -void *llvm_hpvm_ocl_launch(const char *, const char *); -void llvm_hpvm_ocl_wait(void *); +void *llvm_visc_ocl_launch(const char *, const char *); +void llvm_visc_ocl_wait(void *); -void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID); -void llvm_hpvm_printTimerSet(void 
**timerSet, char *timerName = NULL); -void *llvm_hpvm_initializeTimerSet(); +void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID); +void llvm_visc_printTimerSet(void **timerSet, char *timerName = NULL); +void *llvm_visc_initializeTimerSet(); } /*************************** Pipeline API ******************************/ @@ -262,30 +262,30 @@ template <class ElementType> ElementType CircularBuffer<ElementType>::pop() { extern "C" { // Functions to push and pop values from pipeline buffers -uint64_t llvm_hpvm_bufferPop(void *); -void llvm_hpvm_bufferPush(void *, uint64_t); +uint64_t llvm_visc_bufferPop(void *); +void llvm_visc_bufferPush(void *, uint64_t); // Functions to create and destroy buffers -void *llvm_hpvm_createBindInBuffer(void *, uint64_t, unsigned); -void *llvm_hpvm_createBindOutBuffer(void *, uint64_t); -void *llvm_hpvm_createEdgeBuffer(void *, uint64_t); -void *llvm_hpvm_createLastInputBuffer(void *, uint64_t); +void *llvm_visc_createBindInBuffer(void *, uint64_t, unsigned); +void *llvm_visc_createBindOutBuffer(void *, uint64_t); +void *llvm_visc_createEdgeBuffer(void *, uint64_t); +void *llvm_visc_createLastInputBuffer(void *, uint64_t); -void llvm_hpvm_freeBuffers(void *); +void llvm_visc_freeBuffers(void *); // Functions to create and destroy threads -void llvm_hpvm_createThread(void *graphID, void *(*Func)(void *), void *); -void llvm_hpvm_freeThreads(void *); +void llvm_visc_createThread(void *graphID, void *(*Func)(void *), void *); +void llvm_visc_freeThreads(void *); // Launch API for a streaming graph. 
// Arguments: // (1) Launch Function: void* (void*, void*) // (2) Push Function: void (void*, std::vector<uint64_t>**, unsgined) // (3) Pop Function: void* (std::vector<uint64_t>**, unsigned) -void *llvm_hpvm_streamLaunch(void (*LaunchFunc)(void *, void *), void *); -void llvm_hpvm_streamPush(void *graphID, void *args); -void *llvm_hpvm_streamPop(void *graphID); -void llvm_hpvm_streamWait(void *graphID); +void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *); +void llvm_visc_streamPush(void *graphID, void *args); +void *llvm_visc_streamPop(void *graphID); +void llvm_visc_streamWait(void *graphID); } -#endif // HPVM_RT_HEADER +#endif // VISC_RT_HEADER diff --git a/hpvm/test/CTestSuite/Makefile b/hpvm/test/CTestSuite/Makefile index 1169e4e896..226a83287d 100644 --- a/hpvm/test/CTestSuite/Makefile +++ b/hpvm/test/CTestSuite/Makefile @@ -9,7 +9,7 @@ LLVM_CC:=$(LLVM_INSTALL)/bin/clang LLVM_OPT:=$(LLVM_INSTALL)/bin/opt BUILD_DIR:=build -all: $(BUILD_DIR) $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll) +all: $(BUILD_DIR) $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll) $(BUILD_DIR): mkdir -p $(BUILD_DIR) @@ -17,10 +17,10 @@ $(BUILD_DIR): $(HOST:%=$(BUILD_DIR)/%.ll):$(BUILD_DIR)/%.ll:%.c $(LLVM_CC) -S -emit-llvm $< -O3 -o $@ -$(HOST:%=$(BUILD_DIR)/%.hpvm.ll):$(BUILD_DIR)/%.hpvm.ll:$(BUILD_DIR)/%.ll - $(LLVM_OPT) -load $(LLVM_SRC_ROOT)/Release+Asserts/lib/LLVMGenHPVM.so -genhpvm -globaldce $< -S -o $@ +$(HOST:%=$(BUILD_DIR)/%.visc.ll):$(BUILD_DIR)/%.visc.ll:$(BUILD_DIR)/%.ll + $(LLVM_OPT) -load $(LLVM_SRC_ROOT)/Release+Asserts/lib/LLVMGenVISC.so -genvisc -globaldce $< -S -o $@ @cat RUN.script $@ > $@.tmp @mv $@.tmp $@ clean : - rm -f $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll.kernels.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll.nvptx.s) $(BUILD_DIR)/DataflowGraph.dot* + rm -f $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll.kernels.ll) 
$(HOST:%=$(BUILD_DIR)/%.visc.ll.nvptx.s) $(BUILD_DIR)/DataflowGraph.dot* diff --git a/hpvm/test/CTestSuite/RUN.script b/hpvm/test/CTestSuite/RUN.script index 23fa1694eb..10bf667818 100644 --- a/hpvm/test/CTestSuite/RUN.script +++ b/hpvm/test/CTestSuite/RUN.script @@ -1,6 +1,6 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s ; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc ; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s -; RUN: llvm-link %t.ll %llvm_src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin ; RUN: %t.bin diff --git a/hpvm/test/CTestSuite/gemm.c b/hpvm/test/CTestSuite/gemm.c index eb0a3c5e92..d0a69ba25c 100644 --- a/hpvm/test/CTestSuite/gemm.c +++ b/hpvm/test/CTestSuite/gemm.c @@ -54,14 +54,14 @@ __attribute__((noinline)) int checkResults(float *A, float *B, float *C) { return 1; // Success } -// Dummy hpvm node execution call -// void __hpvm__node(void kernel (float*, float*, float*, unsigned, unsigned), +// Dummy visc node execution call +// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), // int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* // outputs); void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) { - __hpvm__attributes(2, A, B, 1, C); + __visc__attributes(2, A, B, 1, C); // printf("Entered function\n"); int tx = get_local_id(0); // 2D Global Thread ID x int ty = get_local_id(1); // 2D Global Thread ID y @@ -130,10 +130,10 @@ int main(int argc, char **argv) { // Compute using OpenCL // matrixMul(h_A, h_B, h_C, WA, WB); - //__hpvm__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); - unsigned graphMM = __hpvm__node(matrixMul, 1, 2, WB, HA, 8, 
h_A, bytes_A, h_B, + //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); + unsigned graphMM = __visc__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, 0); - __hpvm__wait(graphMM); + __visc__wait(graphMM); if (checkResults(h_A, h_B, h_C)) printf("\nPass!\n"); else diff --git a/hpvm/test/CTestSuite/gemm_2.c b/hpvm/test/CTestSuite/gemm_2.c index df45559363..bd7ab27fc0 100644 --- a/hpvm/test/CTestSuite/gemm_2.c +++ b/hpvm/test/CTestSuite/gemm_2.c @@ -54,13 +54,13 @@ __attribute__((noinline)) int checkResults(float *A, float *B, float *C) { return 1; // Success } -// Dummy hpvm node execution call -// void __hpvm__node(void kernel (float*, float*, float*, unsigned, unsigned), +// Dummy visc node execution call +// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), // int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* // outputs); void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) { - __hpvm__attributes(2, A, B, 1, C); + __visc__attributes(2, A, B, 1, C); // printf("Entered function\n"); int tx = get_global_id(0); // 2D Global Thread ID x @@ -130,11 +130,11 @@ int main(int argc, char **argv) { // Compute using OpenCL // matrixMul(h_A, h_B, h_C, WA, WB); - //__hpvm__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); + //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0); unsigned graphMM = - __hpvm__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A, + __visc__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, 0); - __hpvm__wait(graphMM); + __visc__wait(graphMM); if (checkResults(h_A, h_B, h_C)) printf("\nPass!\n"); else diff --git a/hpvm/test/hpvm-cava/.gitignore b/hpvm/test/hpvm-cava/.gitignore index f08b880bf9..2fc1b23564 100644 --- a/hpvm/test/hpvm-cava/.gitignore +++ b/hpvm/test/hpvm-cava/.gitignore @@ -1,5 +1,5 @@ build/ -cava-hpvm +cava-visc Makefile.config example-face/*.bin diff --git 
a/hpvm/test/hpvm-cava/Makefile b/hpvm/test/hpvm-cava/Makefile index 7530477f3d..62219a1cb0 100644 --- a/hpvm/test/hpvm-cava/Makefile +++ b/hpvm/test/hpvm-cava/Makefile @@ -26,21 +26,21 @@ CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) INCLUDES += -I$(SRC_DIR) -I$(CAM_PIPE_SRC_DIR) -INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(HPVM_BUILD_DIR)/include +INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include ifneq ($(CONFUSE_ROOT),) INCLUDES += -I$(CONFUSE_ROOT)/include LFLAGS += -L$(CONFUSE_ROOT)/lib endif -EXE = cava-hpvm-$(VERSION)-$(TARGET) +EXE = cava-visc-$(VERSION)-$(TARGET) LFLAGS += -pthread ## BEGIN HPVM MAKEFILE -LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS= load_cam_model.ll cam_pipe_utility.ll dma_interface.ll utility.ll OBJS_SRC=src/cam_pipe.c src/pipe_stages.c src/load_cam_model.c src/cam_pipe_utility.c src/dma_interface.c src/utility.c -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP = $(EXE) APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3 @@ -52,23 +52,23 @@ OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS) CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS) LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS) -HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt +VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt -HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.ll +VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll -TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce +TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce ifeq ($(TARGET),seq) DEVICE = CPU_TARGET - HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG - HPVM_OPTFLAGS += -hpvm-timers-x86 + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS += -visc-timers-x86 else DEVICE = GPU_TARGET - HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load 
LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG - HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx endif - TESTGEN_OPTFLAGS += -hpvm-timers-gen + TESTGEN_OPTFLAGS += -visc-timers-gen CFLAGS += -DDEVICE=$(DEVICE) CXXFLAGS += -DDEVICE=$(DEVICE) @@ -79,7 +79,7 @@ INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1)) .PRECIOUS: $(BUILD_DIR)/%.ll OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS)) -TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS)) +TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) KERNEL = $(TEST_OBJS).kernels.ll ifeq ($(TARGET),seq) @@ -107,14 +107,14 @@ $(KERNEL_OCL) : $(KERNEL) $(EXE) : $(HOST_LINKED) $(CXX) -O3 $(LDFLAGS) $< -o $@ -$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB) +$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB) $(LLVM_LINK) $^ -S -o $@ -$(HPVM_RT_LIB) : $(HPVM_RT_PATH)/hpvm-rt.cpp +$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp make -C $(LLVM_LIB_PATH) -$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS) - $(OPT) -debug $(HPVM_OPTFLAGS) -S $< -o $(HOST) +$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS) + $(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST) $(BUILD_DIR): mkdir -p $(BUILD_DIR) @@ -125,7 +125,7 @@ $(BUILD_DIR)/%.ll : $(SRC_DIR)/%.c $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c $(CC) $(CFLAGS) -emit-llvm -S -o $@ $< -$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll - $(OPT) -debug-only=genhpvm $(TESTGEN_OPTFLAGS) $< -S -o $@ +$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll + $(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@ ## END HPVM MAKEFILE diff --git a/hpvm/test/hpvm-cava/Makefile.config.example b/hpvm/test/hpvm-cava/Makefile.config.example index 8cbe04af78..269f0b7df2 100644 --- a/hpvm/test/hpvm-cava/Makefile.config.example +++ b/hpvm/test/hpvm-cava/Makefile.config.example @@ 
-4,20 +4,20 @@ OPENCL_PATH=/opt/intelFPGA_pro/18.0/hld/host/linux64 OPENCL_LIB_PATH=$(OPENCL_PATH)/lib # NOTE: You may need to configure this based on your root path. -HPVM_SRC_ROOT=$(LLVM_SRC_ROOT) +VISC_SRC_ROOT=$(LLVM_SRC_ROOT) -HPVM_BUILD_DIR =$(HPVM_SRC_ROOT)/build -CC = $(HPVM_BUILD_DIR)/bin/clang -PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include +VISC_BUILD_DIR =$(VISC_SRC_ROOT)/build +CC = $(VISC_BUILD_DIR)/bin/clang +PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -CXX = $(HPVM_BUILD_DIR)/bin/clang++ -PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include +CXX = $(VISC_BUILD_DIR)/bin/clang++ +PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -LINKER = $(HPVM_BUILD_DIR)/bin/clang++ +LINKER = $(VISC_BUILD_DIR)/bin/clang++ PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH) -LLVM_LIB_PATH = $(HPVM_BUILD_DIR)/lib -LLVM_BIN_PATH = $(HPVM_BUILD_DIR)/bin +LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib +LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin OPT = $(LLVM_BIN_PATH)/opt LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link diff --git a/hpvm/test/hpvm-cava/README.md b/hpvm/test/hpvm-cava/README.md index 1106c4781b..890b629d17 100644 --- a/hpvm/test/hpvm-cava/README.md +++ b/hpvm/test/hpvm-cava/README.md @@ -12,7 +12,7 @@ See the original camera/vision pipeline repo (repo: `yaoyuannnn/cava`) for detai After building HPVM, the following steps are required to build and run the camera pipeline: 1. Build with `make TARGET=seq` for CPU and `make TARGET=gpu` for gpu. -2. Run with `./cava-hpvm-<Target> example-tulip-small/raw_tulip-small.bin example-tulip-small/tulip-small`. +2. Run with `./cava-visc-<Target> example-tulip-small/raw_tulip-small.bin example-tulip-small/tulip-small`. * `<Target>` can be either `seq` or `gpu` depending on what target is used to build. * This processes the raw image `example-tulip-small/raw_tulip-small.bin`. 
Note that raw images are different from bitmaps, so you might need to obtain them using special software. * This generates: `tulip-small.bin` and `tulip-small-<stage>.bin` where `<stage>` represents the stage of the pipeline. diff --git a/hpvm/test/hpvm-cava/src/main.c b/hpvm/test/hpvm-cava/src/main.c index 4188c9e860..e43bbb4f25 100644 --- a/hpvm/test/hpvm-cava/src/main.c +++ b/hpvm/test/hpvm-cava/src/main.c @@ -1,154 +1,136 @@ -#include "utility.h" #include <argp.h> -#include <assert.h> -#include <math.h> #include <stdio.h> #include <stdlib.h> +#include <assert.h> #include <string.h> +#include <math.h> +#include "utility.h" #include "cam_pipe_utility.h" -#include "load_cam_model.h" #include "pipe_stages.h" +#include "load_cam_model.h" -#include "hpvm.h" +#include "visc.h" int NUM_TEST_CASES; int NUM_CLASSES; int INPUT_DIM; int NUM_WORKER_THREADS; -// Type of struct holding the return value from the last node. -struct RetStruct { - size_t bytesRet; -}; - // Type of struct that is used to pass arguments to the HPVM dataflow graph // using the hpvm launch operation typedef struct __attribute__((__packed__)) { - uint8_t *input; - size_t bytes_input; - uint8_t *result; - size_t bytes_result; - float *input_scaled; - size_t bytes_input_scaled; - float *result_scaled; - size_t bytes_result_scaled; - float *demosaic_out; - size_t bytes_demosaic_out; - float *denoise_out; - size_t bytes_denoise_out; - float *transform_out; - size_t bytes_transform_out; - float *gamut_out; - size_t bytes_gamut_out; - float *TsTw; - size_t bytes_TsTw; - float *ctrl_pts; - size_t bytes_ctrl_pts; - float *weights; - size_t bytes_weights; - float *coefs; - size_t bytes_coefs; - float *l2_dist; - size_t bytes_l2_dist; - float *tone_map; - size_t bytes_tone_map; - int row_size; - int col_size; - struct RetStruct ret; // Instance of RetStruct holding the return value. 
-} RootIn; + uint8_t *input; size_t bytes_input; + uint8_t *result; size_t bytes_result; + float *input_scaled; size_t bytes_input_scaled; + float *result_scaled; size_t bytes_result_scaled; + float *demosaic_out; size_t bytes_demosaic_out; + float *denoise_out; size_t bytes_denoise_out; + float *transform_out; size_t bytes_transform_out; + float *gamut_out;size_t bytes_gamut_out; + float *TsTw; size_t bytes_TsTw; + float *ctrl_pts; size_t bytes_ctrl_pts; + float *weights; size_t bytes_weights; + float*coefs; size_t bytes_coefs; + float *l2_dist; size_t bytes_l2_dist; + float *tone_map; size_t bytes_tone_map; + size_t row_size; size_t col_size; +} +RootIn; typedef enum _argnum { - RAW_IMAGE_BIN, - OUTPUT_IMAGE_BIN, - NUM_REQUIRED_ARGS, - DATA_FILE = NUM_REQUIRED_ARGS, - NUM_ARGS, + RAW_IMAGE_BIN, + OUTPUT_IMAGE_BIN, + NUM_REQUIRED_ARGS, + DATA_FILE = NUM_REQUIRED_ARGS, + NUM_ARGS, } argnum; typedef struct _arguments { - char *args[NUM_ARGS]; - int num_inputs; - int num_threads; + char* args[NUM_ARGS]; + int num_inputs; + int num_threads; } arguments; static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n"; static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary"; static struct argp_option options[] = { - {"num-inputs", 'n', "N", 0, "Number of input images"}, - {0}, - {"data-file", 'f', "F", 0, - "File to read data and weights from (if data-init-mode == READ_FILE or " - "save-params is true). *.txt files are decoded as text files, while " - "*.bin files are decoded as binary files."}, + { "num-inputs", 'n', "N", 0, "Number of input images" }, { 0 }, + { "data-file", 'f', "F", 0, + "File to read data and weights from (if data-init-mode == READ_FILE or " + "save-params is true). *.txt files are decoded as text files, while " + "*.bin files are decoded as binary files." 
}, }; -static error_t parse_opt(int key, char *arg, struct argp_state *state) { - arguments *args = (arguments *)(state->input); - switch (key) { - case 'n': { - args->num_inputs = strtol(arg, NULL, 10); - break; - } - case 'f': { - args->args[DATA_FILE] = arg; - break; - } - case 't': { - args->num_threads = strtol(arg, NULL, 10); - break; - } - case ARGP_KEY_ARG: { - if (state->arg_num >= NUM_REQUIRED_ARGS) - argp_usage(state); - args->args[state->arg_num] = arg; - break; - } - case ARGP_KEY_END: { - if (state->arg_num < NUM_REQUIRED_ARGS) { - fprintf(stderr, "Not enough arguments! Got %d, require %d.\n", - state->arg_num, NUM_REQUIRED_ARGS); - argp_usage(state); +static error_t parse_opt(int key, char* arg, struct argp_state* state) { + arguments* args = (arguments*)(state->input); + switch (key) { + case 'n': { + args->num_inputs = strtol(arg, NULL, 10); + break; + } + case 'f': { + args->args[DATA_FILE] = arg; + break; + } + case 't': { + args->num_threads = strtol(arg, NULL, 10); + break; + } + case ARGP_KEY_ARG: { + if (state->arg_num >= NUM_REQUIRED_ARGS) + argp_usage(state); + args->args[state->arg_num] = arg; + break; + } + case ARGP_KEY_END: { + if (state->arg_num < NUM_REQUIRED_ARGS) { + fprintf(stderr, + "Not enough arguments! 
Got %d, require %d.\n", + state->arg_num, + NUM_REQUIRED_ARGS); + argp_usage(state); + } + break; + } + default: + return ARGP_ERR_UNKNOWN; } - break; - } - default: - return ARGP_ERR_UNKNOWN; - } - return 0; + return 0; } -void set_default_args(arguments *args) { - args->num_inputs = 1; - args->num_threads = 0; - for (int i = 0; i < NUM_ARGS; i++) { - args->args[i] = NULL; - } +void set_default_args(arguments* args) { + args->num_inputs = 1; + args->num_threads = 0; + for (int i = 0; i < NUM_ARGS; i++) { + args->args[i] = NULL; + } } -static struct argp parser = {options, parse_opt, args_doc, prog_doc}; +static struct argp parser = { options, parse_opt, args_doc, prog_doc }; // Helper function for printing intermediate results -void descale_cpu(float *input, size_t bytes_input, uint8_t *output, - size_t bytes_result, size_t row_size, size_t col_size) { - +void descale_cpu(float *input, size_t bytes_input, + uint8_t *output, size_t bytes_result, + size_t row_size, size_t col_size) { + for (int chan = 0; chan < CHAN_SIZE; chan++) for (int row = 0; row < row_size; row++) for (int col = 0; col < col_size; col++) { - int index = (chan * row_size + row) * col_size + col; + int index = (chan*row_size + row) * col_size + col; output[index] = min(max(input[index] * 255, 0), 255); } } static void sort(float arr[], int n) { - int i, j; - for (i = 0; i < n - 1; i++) - for (j = 0; j < n - i - 1; j++) - if (arr[j] > arr[j + 1]) { - float temp = arr[j]; - arr[j] = arr[j + 1]; - arr[j + 1] = temp; - } + int i, j; + for (i = 0; i < n - 1; i++) + for (j = 0; j < n - i - 1; j++) + if (arr[j] > arr[j + 1]) { + float temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } } /**************************************************************/ @@ -158,259 +140,256 @@ static void sort(float arr[], int n) { // In this benchmark, no use of HPVM query intrinsics in the leaf node functions // Leaf HPVM node function for scale -void scale_fxp(uint8_t *input, size_t bytes_input, float 
*output, - size_t bytes_output, size_t row_size, size_t col_size) { +void scale_fxp(uint8_t *input, size_t bytes_input, + float *output, size_t bytes_output, + size_t row_size, size_t col_size) { - // Specifies compilation target for current node - __hpvm__hint(CPU_TARGET); + //Specifies compilation target for current node + __visc__hint(CPU_TARGET); // Specifies pointer arguments that will be used as "in" and "out" arguments // - count of "in" arguments // - list of "in" argument , and similar for "out" - __hpvm__attributes(2, input, output, 1, output); - void *thisNode = __hpvm__getNode(); - int row = __hpvm__getNodeInstanceID_x(thisNode); + __visc__attributes(2, input, output, 1, output); + void* thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) - // for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) { - int index = (chan * row_size + row) * col_size + col; - output[index] = input[index] * 1.0 / 255; - } - __hpvm__return(1, bytes_output); +// for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++){ + int index = (chan*row_size + row) * col_size + col; + output[index] = input[index] * 1.0 / 255; + } + __visc__return(1, bytes_output); } // Leaf HPVM node function for descale -void descale_fxp(float *input, size_t bytes_input, uint8_t *output, - size_t bytes_result, size_t row_size, size_t col_size) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(2, input, output, 1, output); - +void descale_fxp(float *input, size_t bytes_input, + uint8_t *output, size_t bytes_result, + size_t row_size, size_t col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, output, 1, output); + for (int chan = 0; chan < CHAN_SIZE; chan++) for (int row = 0; row < row_size; row++) for (int col = 0; col < col_size; col++) { - int index = (chan * row_size + row) * col_size + col; + int index = (chan*row_size + row) * col_size + col; 
output[index] = min(max(input[index] * 255, 0), 255); } - __hpvm__return(1, bytes_result); + __visc__return(1, bytes_result); } // Leaf HPVM node function for demosaicing -void demosaic_fxp(float *input, size_t bytes_input, float *result, - size_t bytes_result, size_t row_size, size_t col_size) { - __hpvm__hint(DEVICE); - __hpvm__attributes(2, input, result, 1, result); - - void *thisNode = __hpvm__getNode(); - int row = __hpvm__getNodeInstanceID_x(thisNode); - // for (int row = 1; row < row_size - 1; row++) - for (int col = 1; col < col_size - 1; col++) { - int index_0 = (0 * row_size + row) * col_size + col; - int index_1 = (1 * row_size + row) * col_size + col; - int index_2 = (2 * row_size + row) * col_size + col; - if (row % 2 == 0 && col % 2 == 0) { - // Green pixel - // Getting the R values - float R1 = input[index_0 - 1]; - float R2 = input[index_0 + 1]; - // Getting the B values - float B1 = input[index_2 - col_size]; - float B2 = input[index_2 + col_size]; - // R - result[index_0] = (R1 + R2) / 2; - // G - result[index_1] = input[index_1] * 2; - // B - result[index_2] = (B1 + B2) / 2; - } else if (row % 2 == 0 && col % 2 == 1) { - // Red pixel - // Getting the G values - float G1 = input[index_1 - col_size]; - float G2 = input[index_1 + col_size]; - float G3 = input[index_1 - 1]; - float G4 = input[index_1 + 1]; - // Getting the B values - float B1 = input[index_2 - col_size - 1]; - float B2 = input[index_2 - col_size + 1]; - float B3 = input[index_2 + col_size - 1]; - float B4 = input[index_2 + col_size + 1]; - // R - result[index_0] = input[index_0]; - // G - result[index_1] = (G1 + G2 + G3 + G4) / 2; - // B (center pixel) - result[index_2] = (B1 + B2 + B3 + B4) / 4; - } else if (row % 2 == 1 && col % 2 == 0) { - // Blue pixel - // Getting the R values - float R1 = input[index_0 - col_size - 1]; - float R2 = input[index_0 + col_size - 1]; - float R3 = input[index_0 - col_size + 1]; - float R4 = input[index_0 + col_size + 1]; - // Getting the G values - 
float G1 = input[index_1 - col_size]; - float G2 = input[index_1 + col_size]; - float G3 = input[index_1 - 1]; - float G4 = input[index_1 + 1]; - // R - result[index_0] = (R1 + R2 + R3 + R4) / 4; - // G - result[index_1] = (G1 + G2 + G3 + G4) / 2; - // B - result[index_2] = input[index_2]; - } else { - // Bottom Green pixel - // Getting the R values - float R1 = input[index_0 - col_size]; - float R2 = input[index_0 + col_size]; - // Getting the B values - float B1 = input[index_2 - 1]; - float B2 = input[index_2 + 1]; - // R - result[index_0] = (R1 + R2) / 2; - // G - result[index_1] = input[index_1] * 2; - // B - result[index_2] = (B1 + B2) / 2; - } - } - __hpvm__return(1, bytes_result); +void demosaic_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + size_t row_size, size_t col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, result, 1, result); + + void* thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); +// for (int row = 1; row < row_size - 1; row++) + for (int col = 1; col < col_size - 1; col++) { + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + if (row % 2 == 0 && col % 2 == 0) { + // Green pixel + // Getting the R values + float R1 = input[index_0 - 1]; + float R2 = input[index_0 + 1]; + // Getting the B values + float B1 = input[index_2 - col_size]; + float B2 = input[index_2 + col_size]; + // R + result[index_0] = (R1 + R2) / 2; + // G + result[index_1] = input[index_1] * 2; + // B + result[index_2] = (B1 + B2) / 2; + } else if (row % 2 == 0 && col % 2 == 1) { + // Red pixel + // Getting the G values + float G1 = input[index_1 - col_size]; + float G2 = input[index_1 + col_size]; + float G3 = input[index_1 - 1]; + float G4 = input[index_1 + 1]; + // Getting the B values + float B1 = input[index_2 - col_size - 1]; + float B2 = input[index_2 - col_size + 1]; + float B3 = 
input[index_2 + col_size - 1]; + float B4 = input[index_2 + col_size + 1]; + // R + result[index_0] = input[index_0]; + // G + result[index_1] = (G1 + G2 + G3 + G4) / 2; + // B (center pixel) + result[index_2] = (B1 + B2 + B3 + B4) / 4; + } else if (row % 2 == 1 && col % 2 == 0) { + // Blue pixel + // Getting the R values + float R1 = input[index_0 - col_size - 1]; + float R2 = input[index_0 + col_size - 1]; + float R3 = input[index_0 - col_size + 1]; + float R4 = input[index_0 + col_size + 1]; + // Getting the G values + float G1 = input[index_1 - col_size]; + float G2 = input[index_1 + col_size]; + float G3 = input[index_1 - 1]; + float G4 = input[index_1 + 1]; + // R + result[index_0] = (R1 + R2 + R3 + R4) / 4; + // G + result[index_1] = (G1 + G2 + G3 + G4) / 2; + // B + result[index_2] = input[index_2]; + } else { + // Bottom Green pixel + // Getting the R values + float R1 = input[index_0 - col_size]; + float R2 = input[index_0 + col_size]; + // Getting the B values + float B1 = input[index_2 - 1]; + float B2 = input[index_2 + 1]; + // R + result[index_0] = (R1 + R2) / 2; + // G + result[index_1] = input[index_1] * 2; + // B + result[index_2] = (B1 + B2) / 2; + } + } + __visc__return(1, bytes_result); } // Leaf HPVM node function for denoise -void denoise_fxp(float *input, size_t bytes_input, float *result, - size_t bytes_result, size_t row_size, size_t col_size) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(2, input, result, 1, result); - - void *thisNode = __hpvm__getNode(); - int row = __hpvm__getNodeInstanceID_x(thisNode); +void denoise_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + size_t row_size, size_t col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, result, 1, result); + + void* thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) - // for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) - if 
(row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { - float filter[9]; - for (int i = -1; i < 2; i++) - for (int j = -1; j < 2; j++) { - int index = ((i + row) - row + 1) * 3 + (j + col) - col + 1; - filter[index] = - input[(chan * row_size + (i + row)) * col_size + (j + col)]; - } - sort(filter, 9); - result[(chan * row_size + row) * col_size + col] = filter[4]; - } else { - result[(chan * row_size + row) * col_size + col] = - input[(chan * row_size + row) * col_size + col]; - } - __hpvm__return(1, bytes_result); +// for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) + if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { + float filter[9]; + for (int i = -1; i < 2; i++) + for (int j = -1; j < 2; j++) { + int index = ((i+row) - row + 1) * 3 + (j+col) - col + 1; + filter[index] = input[(chan * row_size + (i + row)) * col_size + (j + col)]; + } + sort(filter, 9); + result[(chan * row_size + row) * col_size + col] = filter[4]; + } else { + result[(chan * row_size + row) * col_size + col] = input[(chan * row_size + row) * col_size + col]; + } + __visc__return(1, bytes_result); } // Leaf HPVM node function, for color map and white balance transform -void transform_fxp(float *input, size_t bytes_input, float *result, - size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw, +void transform_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *TsTw_tran, size_t bytes_TsTw, size_t row_size, size_t col_size) { - __hpvm__hint(DEVICE); - __hpvm__attributes(3, input, result, TsTw_tran, 1, result); - - void *thisNode = __hpvm__getNode(); - int row = __hpvm__getNodeInstanceID_x(thisNode); + __visc__hint(DEVICE); + __visc__attributes(3, input, result, TsTw_tran, 1, result); + + void* thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) - // for (int row = 0; row < row_size; row++) - for (int col = 0; col < 
col_size; col++) { - int index = (chan * row_size + row) * col_size + col; - int index_0 = (0 * row_size + row) * col_size + col; - int index_1 = (1 * row_size + row) * col_size + col; - int index_2 = (2 * row_size + row) * col_size + col; - int index_2d_0 = 0 * CHAN_SIZE + chan; - int index_2d_1 = 1 * CHAN_SIZE + chan; - int index_2d_2 = 2 * CHAN_SIZE + chan; - result[index] = max(input[index_0] * TsTw_tran[index_2d_0] + - input[index_1] * TsTw_tran[index_2d_1] + - input[index_2] * TsTw_tran[index_2d_2], - 0); - } - __hpvm__return(1, bytes_result); +// for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + int index_2d_0 = 0 * CHAN_SIZE + chan; + int index_2d_1 = 1 * CHAN_SIZE + chan; + int index_2d_2 = 2 * CHAN_SIZE + chan; + result[index] = + max(input[index_0] * TsTw_tran[index_2d_0] + + input[index_1] * TsTw_tran[index_2d_1] + + input[index_2] * TsTw_tran[index_2d_2], + 0); + } + __visc__return(1, bytes_result); } // Leaf HPVM node function, for gamut mapping -void gamut_map_fxp(float *input, size_t bytes_input, float *result, - size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts, - float *weights, size_t bytes_weights, float *coefs, - size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist, +void gamut_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, + float *coefs, size_t bytes_coefs, + float *l2_dist, size_t bytes_l2_dist, size_t row_size, size_t col_size) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, - result, l2_dist); - - // First, get the L2 norm from every pixel to the control points, - // Then, sum it and weight it. 
Finally, add the bias. - void *thisNode = __hpvm__getNode(); - int row = __hpvm__getNodeInstanceID_x(thisNode); - // for (int row = 0; row < row_size; row++) - for (int col = 0; col < col_size; col++) { - float chan_val_0 = 0.0; - float chan_val_1 = 0.0; - float chan_val_2 = 0.0; - for (int cp = 0; cp < 3702; cp++) { - int index_0 = (0 * row_size + row) * col_size + col; - int index_1 = (1 * row_size + row) * col_size + col; - int index_2 = (2 * row_size + row) * col_size + col; - float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); - float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]); - float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); - float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); - float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); - float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]); - float val = val1 * val2 + val3 * val4 + val5 * val6; - float sqrt_val = sqrt(val); - chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0]; - chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1]; - chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2]; + __visc__hint(CPU_TARGET); + __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist); + + // First, get the L2 norm from every pixel to the control points, + // Then, sum it and weight it. Finally, add the bias. 
+ void* thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); +// for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + float chan_val_0 = 0.0; + float chan_val_1 = 0.0; + float chan_val_2 = 0.0; + for (int cp = 0; cp < 3702; cp++) { + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); + float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]); + float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); + float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); + float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); + float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]); + float val = val1 * val2 + val3 * val4 + val5 * val6; + float sqrt_val = sqrt(val); + chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0]; + chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1]; + chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2]; + } + chan_val_0 += coefs[0 * CHAN_SIZE + 0] + + coefs[1 * CHAN_SIZE + 0] * input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + 0] * input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col]; + chan_val_1 += coefs[0 * CHAN_SIZE + 1] + + coefs[1 * CHAN_SIZE + 1] * input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + 1] * input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col]; + chan_val_2 += coefs[0 * CHAN_SIZE + 2] + + coefs[1 * CHAN_SIZE + 2] * input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + 2] * input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col]; + result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0); + result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 
0); + result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0); } - chan_val_0 += - coefs[0 * CHAN_SIZE + 0] + - coefs[1 * CHAN_SIZE + 0] * - input[(0 * row_size + row) * col_size + col] + - coefs[2 * CHAN_SIZE + 0] * - input[(1 * row_size + row) * col_size + col] + - coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col]; - chan_val_1 += - coefs[0 * CHAN_SIZE + 1] + - coefs[1 * CHAN_SIZE + 1] * - input[(0 * row_size + row) * col_size + col] + - coefs[2 * CHAN_SIZE + 1] * - input[(1 * row_size + row) * col_size + col] + - coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col]; - chan_val_2 += - coefs[0 * CHAN_SIZE + 2] + - coefs[1 * CHAN_SIZE + 2] * - input[(0 * row_size + row) * col_size + col] + - coefs[2 * CHAN_SIZE + 2] * - input[(1 * row_size + row) * col_size + col] + - coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col]; - result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0); - result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0); - result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0); - } - __hpvm__return(1, bytes_result); + __visc__return(1, bytes_result); } // HPVM leaf node function, for tone mapping -void tone_map_fxp(float *input, size_t bytes_input, float *result, - size_t bytes_result, float *tone_map, size_t bytes_tone_map, +void tone_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *tone_map, size_t bytes_tone_map, size_t row_size, size_t col_size) { - __hpvm__hint(DEVICE); - __hpvm__attributes(3, input, result, tone_map, 1, result); - - void *thisNode = __hpvm__getNode(); - int row = __hpvm__getNodeInstanceID_x(thisNode); + __visc__hint(DEVICE); + __visc__attributes(3, input, result, tone_map, 1, result); + + void* thisNode = __visc__getNode(); + int row = __visc__getNodeInstanceID_x(thisNode); for (int chan = 0; chan < CHAN_SIZE; chan++) - // for (int row = 0; row < row_size; row++) - for (int col = 0; 
col < col_size; col++) { - int index = (chan * row_size + row) * col_size + col; - uint8_t x = input[index] * 255; - result[index] = tone_map[x * CHAN_SIZE + chan]; - } - __hpvm__return(1, bytes_result); +// for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + uint8_t x = input[index] * 255; + result[index] = tone_map[x * CHAN_SIZE + chan]; + } + __visc__return(1, bytes_result); } /********************************************************************/ @@ -421,184 +400,185 @@ void tone_map_fxp(float *input, size_t bytes_input, float *result, // requirement for the FPGA backend . The CPU backend also supports this, // so it does not cause a portability issue. -void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, float *result, - size_t bytes_result, size_t row_size, size_t col_size) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(2, input, result, 1, result); +void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, + float *result, size_t bytes_result, + size_t row_size, size_t col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, result, 1, result); // Create an 1D (specified by 1st argument) HPVM node with 1 dynamic // instance (last argument) associated with node function scale_fxp - void *ScaleNode = __hpvm__createNodeND(1, scale_fxp, row_size); + void *ScaleNode = __visc__createNodeND(1, scale_fxp, row_size); // Binds inputs of current node with specified node // - destination node // - argument position in argument list of function of source node // - argument position in argument list of function of destination node // - streaming (1) or non-streaming (0) - __hpvm__bindIn(ScaleNode, 0, 0, 0); // bind input - __hpvm__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input - __hpvm__bindIn(ScaleNode, 2, 2, 0); // bind result - __hpvm__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result - __hpvm__bindIn(ScaleNode, 4, 4, 0); // bind row_size - 
__hpvm__bindIn(ScaleNode, 5, 5, 0); // bind col_size + __visc__bindIn(ScaleNode, 0, 0, 0); // bind input + __visc__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(ScaleNode, 2, 2, 0); // bind result + __visc__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(ScaleNode, 4, 4, 0); // bind row_size + __visc__bindIn(ScaleNode, 5, 5, 0); // bind col_size // Similar to bindIn, but for the output. Output of a node is a struct, and // we consider the fields in increasing ordering. - __hpvm__bindOut(ScaleNode, 0, 0, 0); + __visc__bindOut(ScaleNode, 0, 0, 0); } -void descale_fxp_wrapper(float *input, size_t bytes_input, uint8_t *result, - size_t bytes_result, size_t row_size, - size_t col_size) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(2, input, result, 1, result); - void *DescaleNode = __hpvm__createNodeND(1, descale_fxp, row_size); - __hpvm__bindIn(DescaleNode, 0, 0, 0); // bind input - __hpvm__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input - __hpvm__bindIn(DescaleNode, 2, 2, 0); // bind result - __hpvm__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result - __hpvm__bindIn(DescaleNode, 4, 4, 0); // bind row_size - __hpvm__bindIn(DescaleNode, 5, 5, 0); // bind col_size - - __hpvm__bindOut(DescaleNode, 0, 0, 0); +void descale_fxp_wrapper(float *input, size_t bytes_input, + uint8_t *result, size_t bytes_result, + size_t row_size, size_t col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, result, 1, result); + void *DescaleNode = __visc__createNodeND(1, descale_fxp, row_size); + __visc__bindIn(DescaleNode, 0, 0, 0); // bind input + __visc__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(DescaleNode, 2, 2, 0); // bind result + __visc__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(DescaleNode, 4, 4, 0); // bind row_size + __visc__bindIn(DescaleNode, 5, 5, 0); // bind col_size + + __visc__bindOut(DescaleNode, 0, 0, 0); } -void demosaic_fxp_wrapper(float *input, size_t 
bytes_input, float *result, - size_t bytes_result, size_t row_size, - size_t col_size) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(2, input, result, 1, result); - void *DemosaicNode = __hpvm__createNodeND(1, demosaic_fxp, row_size); - __hpvm__bindIn(DemosaicNode, 0, 0, 0); // bind input - __hpvm__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input - __hpvm__bindIn(DemosaicNode, 2, 2, 0); // bind result - __hpvm__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result - __hpvm__bindIn(DemosaicNode, 4, 4, 0); // bind row_size - __hpvm__bindIn(DemosaicNode, 5, 5, 0); // bind col_size - - __hpvm__bindOut(DemosaicNode, 0, 0, 0); +void demosaic_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + size_t row_size, size_t col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, result, 1, result); + void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, row_size); + __visc__bindIn(DemosaicNode, 0, 0, 0); // bind input + __visc__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(DemosaicNode, 2, 2, 0); // bind result + __visc__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(DemosaicNode, 4, 4, 0); // bind row_size + __visc__bindIn(DemosaicNode, 5, 5, 0); // bind col_size + + __visc__bindOut(DemosaicNode, 0, 0, 0); } -void denoise_fxp_wrapper(float *input, size_t bytes_input, float *result, - size_t bytes_result, size_t row_size, - size_t col_size) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(2, input, result, 1, result); - void *DenoiseNode = __hpvm__createNodeND(1, denoise_fxp, row_size); - __hpvm__bindIn(DenoiseNode, 0, 0, 0); // bind input - __hpvm__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input - __hpvm__bindIn(DenoiseNode, 2, 2, 0); // bind result - __hpvm__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result - __hpvm__bindIn(DenoiseNode, 4, 4, 0); // bind row_size - __hpvm__bindIn(DenoiseNode, 5, 5, 0); // bind col_size - - __hpvm__bindOut(DenoiseNode, 0, 0, 0); +void 
denoise_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + size_t row_size, size_t col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, result, 1, result); + void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, row_size); + __visc__bindIn(DenoiseNode, 0, 0, 0); // bind input + __visc__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(DenoiseNode, 2, 2, 0); // bind result + __visc__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(DenoiseNode, 4, 4, 0); // bind row_size + __visc__bindIn(DenoiseNode, 5, 5, 0); // bind col_size + + __visc__bindOut(DenoiseNode, 0, 0, 0); } -void transform_fxp_wrapper(float *input, size_t bytes_input, float *result, - size_t bytes_result, float *TsTw_tran, - size_t bytes_TsTw, size_t row_size, - size_t col_size) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(3, input, result, TsTw_tran, 1, result); - void *TransformNode = __hpvm__createNodeND(1, transform_fxp, row_size); - __hpvm__bindIn(TransformNode, 0, 0, 0); // bind input - __hpvm__bindIn(TransformNode, 1, 1, 0); // bind bytes_input - __hpvm__bindIn(TransformNode, 2, 2, 0); // bind result - __hpvm__bindIn(TransformNode, 3, 3, 0); // bind bytes_result - __hpvm__bindIn(TransformNode, 4, 4, 0); // bind tstw - __hpvm__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw - __hpvm__bindIn(TransformNode, 6, 6, 0); // bind row_size - __hpvm__bindIn(TransformNode, 7, 7, 0); // bind col_size - - __hpvm__bindOut(TransformNode, 0, 0, 0); +void transform_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *TsTw_tran, size_t bytes_TsTw, + size_t row_size, size_t col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(3, input, result, TsTw_tran, 1, result); + void *TransformNode = __visc__createNodeND(1, transform_fxp, row_size); + __visc__bindIn(TransformNode, 0, 0, 0); // bind input + __visc__bindIn(TransformNode, 1, 1, 0); // bind bytes_input + 
__visc__bindIn(TransformNode, 2, 2, 0); // bind result + __visc__bindIn(TransformNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(TransformNode, 4, 4, 0); // bind tstw + __visc__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw + __visc__bindIn(TransformNode, 6, 6, 0); // bind row_size + __visc__bindIn(TransformNode, 7, 7, 0); // bind col_size + + __visc__bindOut(TransformNode, 0, 0, 0); } -void gamut_fxp_wrapper(float *input, size_t bytes_input, float *result, - size_t bytes_result, float *ctrl_pts, - size_t bytes_ctrl_pts, float *weights, - size_t bytes_weights, float *coefs, size_t bytes_coefs, - float *l2_dist, size_t bytes_l2_dist, size_t row_size, - size_t col_size) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, - result); - void *GamutNode = __hpvm__createNodeND(1, gamut_map_fxp, row_size); - __hpvm__bindIn(GamutNode, 0, 0, 0); // bind input - __hpvm__bindIn(GamutNode, 1, 1, 0); // bind bytes_input - __hpvm__bindIn(GamutNode, 2, 2, 0); // bind result - __hpvm__bindIn(GamutNode, 3, 3, 0); // bind bytes_result - __hpvm__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts - __hpvm__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts - __hpvm__bindIn(GamutNode, 6, 6, 0); // bind weights - __hpvm__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights - __hpvm__bindIn(GamutNode, 8, 8, 0); // bind coefs - __hpvm__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs - __hpvm__bindIn(GamutNode, 10, 10, 0); // bind l2_dist - __hpvm__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist - __hpvm__bindIn(GamutNode, 12, 12, 0); // bind row_size - __hpvm__bindIn(GamutNode, 13, 13, 0); // bind col_size - - __hpvm__bindOut(GamutNode, 0, 0, 0); +void gamut_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, + float *coefs, size_t bytes_coefs, + float *l2_dist, size_t bytes_l2_dist, + size_t row_size, size_t col_size) 
{ + __visc__hint(CPU_TARGET); + __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result); + void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, row_size); + __visc__bindIn(GamutNode, 0, 0, 0); // bind input + __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(GamutNode, 2, 2, 0); // bind result + __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts + __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts + __visc__bindIn(GamutNode, 6, 6, 0); // bind weights + __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights + __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs + __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs + __visc__bindIn(GamutNode, 10, 10, 0); // bind l2_dist + __visc__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist + __visc__bindIn(GamutNode, 12, 12, 0); // bind row_size + __visc__bindIn(GamutNode, 13, 13, 0); // bind col_size + + __visc__bindOut(GamutNode, 0, 0, 0); } -void tone_map_fxp_wrapper(float *input, size_t bytes_input, float *result, - size_t bytes_result, float *tone_map, - size_t bytes_tone_map, size_t row_size, - size_t col_size) { - - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(3, input, result, tone_map, 1, result); - void *ToneMapNode = __hpvm__createNodeND(1, tone_map_fxp, row_size); - __hpvm__bindIn(ToneMapNode, 0, 0, 0); // bind input - __hpvm__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input - __hpvm__bindIn(ToneMapNode, 2, 2, 0); // bind result - __hpvm__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result - __hpvm__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map - __hpvm__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map - __hpvm__bindIn(ToneMapNode, 6, 6, 0); // bind row_size - __hpvm__bindIn(ToneMapNode, 7, 7, 0); // bind col_size - - __hpvm__bindOut(ToneMapNode, 0, 0, 0); +void tone_map_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *tone_map, size_t 
bytes_tone_map, + size_t row_size, size_t col_size) { + + __visc__hint(CPU_TARGET); + __visc__attributes(3, input, result, tone_map, 1, result); + void *ToneMapNode = __visc__createNodeND(1, tone_map_fxp, row_size); + __visc__bindIn(ToneMapNode, 0, 0, 0); // bind input + __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result + __visc__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map + __visc__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map + __visc__bindIn(ToneMapNode, 6, 6, 0); // bind row_size + __visc__bindIn(ToneMapNode, 7, 7, 0); // bind col_size + + __visc__bindOut(ToneMapNode, 0, 0, 0); } + /*** ROOT Node - Top Level of the Graph Hierarchy ***/ -void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, - /*2*/ uint8_t *result, /*3*/ size_t bytes_result, - /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled, - /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled, - /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out, - /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out, - /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out, - /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out, - /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw, - /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts, - /*20*/ float *weights, /*21*/ size_t bytes_weights, - /*22*/ float *coefs, /*23*/ size_t bytes_coefs, - /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist, - /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map, - /*28*/ size_t row_size, /*29*/ size_t col_size) { - - // Specifies compilation target for current node - __hpvm__hint(CPU_TARGET); +void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, + /*2*/ uint8_t *result, /*3*/ size_t bytes_result, + /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled, + /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled, + /*8*/ float *demosaic_out, 
/*9*/ size_t bytes_demosaic_out, + /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out, + /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out, + /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out, + /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw, + /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts, + /*20*/ float *weights, /*21*/ size_t bytes_weights, + /*22*/ float*coefs, /*23*/ size_t bytes_coefs, + /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist, + /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map, + /*28*/ size_t row_size, /*29*/ size_t col_size) { + + //Specifies compilation target for current node + __visc__hint(CPU_TARGET); // Specifies pointer arguments that will be used as "in" and "out" arguments // - count of "in" arguments // - list of "in" argument , and similar for "out" - __hpvm__attributes(14, input, result, input_scaled, result_scaled, - demosaic_out, denoise_out, transform_out, gamut_out, TsTw, - ctrl_pts, weights, coefs, tone_map, l2_dist, 5, result, - demosaic_out, denoise_out, transform_out, gamut_out); + __visc__attributes(14, input, result, input_scaled, result_scaled, demosaic_out, denoise_out, + transform_out, gamut_out, TsTw, ctrl_pts, weights, coefs, tone_map, l2_dist, + 5, result, demosaic_out, denoise_out, transform_out, gamut_out); // Create an 0D (specified by 1st argument) HPVM node - so a single node // associated with node function ---_fxp_wrapper - void *ScNode = __hpvm__createNodeND(0, scale_fxp_wrapper); - void *DmNode = __hpvm__createNodeND(0, demosaic_fxp_wrapper); - void *DnNode = __hpvm__createNodeND(0, denoise_fxp_wrapper); - void *TrNode = __hpvm__createNodeND(0, transform_fxp_wrapper); - void *GmNode = __hpvm__createNodeND(0, gamut_fxp_wrapper); - void *TnNode = __hpvm__createNodeND(0, tone_map_fxp_wrapper); - void *DsNode = __hpvm__createNodeND(0, descale_fxp_wrapper); - + void* ScNode = __visc__createNodeND(0, scale_fxp_wrapper); + void* DmNode = __visc__createNodeND(0, 
demosaic_fxp_wrapper); + void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper); + void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper); + void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper); + void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper); + void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper); + // BindIn binds inputs of current node with specified node // - destination node // - argument position in argument list of function of source node @@ -612,283 +592,268 @@ void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, // - destination position (in argument list of destination node) // - streaming (1) or non-streaming (0) - // scale_fxp inputs - __hpvm__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input - __hpvm__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input - __hpvm__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result - __hpvm__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result - __hpvm__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size - __hpvm__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size - - // demosaic_fxp inputs - __hpvm__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input - __hpvm__edge(ScNode, DmNode, 1, 0, 1, - 0); // SCNode:bytes_result -> DmNode:bytes_input - __hpvm__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result - __hpvm__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result - __hpvm__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size - __hpvm__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size - - // denoise_fxp inputs - __hpvm__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input - __hpvm__edge(DmNode, DnNode, 1, 0, 1, - 0); // DMNode:bytes_result -> DnNode:bytes_input - __hpvm__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result - __hpvm__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result - __hpvm__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size - 
__hpvm__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size - - // transform_fxp inputs - __hpvm__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input - __hpvm__edge(DnNode, TrNode, 1, 0, 1, - 0); // DnNode:bytes_result -> TrNode:bytes_input - __hpvm__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result - __hpvm__bindIn(TrNode, 13, 3, - 0); // bytes_result_scaled -> TrNode:bytes_result - __hpvm__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann - __hpvm__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw - __hpvm__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size - __hpvm__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size - - // gamut_fxp inputs - __hpvm__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input - __hpvm__edge(TrNode, GmNode, 1, 0, 1, - 0); // TrNode:bytes_result -> GmNode:bytes_input - __hpvm__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result - __hpvm__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result - __hpvm__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts - __hpvm__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts - __hpvm__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights - __hpvm__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights - __hpvm__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs - __hpvm__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs - __hpvm__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist - __hpvm__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist - __hpvm__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size - __hpvm__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size - - // tone_map_fxp inputs - __hpvm__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input - __hpvm__edge(GmNode, TnNode, 1, 0, 1, - 0); // GmNode:bytes_result -> TnNode:bytes_input - __hpvm__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result - __hpvm__bindIn(TnNode, 7, 3, 0); // 
bytes_result_scaled -> TnNode:bytes_result - __hpvm__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map - __hpvm__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map - __hpvm__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size - __hpvm__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size - - // descale_fxp inputs - __hpvm__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input - __hpvm__edge(TnNode, DsNode, 1, 0, 1, - 0); // TnNode:bytes_result -> DsNode:bytes_input - __hpvm__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result - __hpvm__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result - __hpvm__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size - __hpvm__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size + // scale_fxp inputs + __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input + __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input + __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result + __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result + __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size + __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size + + // demosaic_fxp inputs + __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input + __visc__edge(ScNode, DmNode, 1, 0, 1, 0); // SCNode:bytes_result -> DmNode:bytes_input + __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result + __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result + __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size + __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size + + // denoise_fxp inputs + __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input + __visc__edge(DmNode, DnNode, 1, 0, 1, 0); // DMNode:bytes_result -> DnNode:bytes_input + __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result + __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> 
DnNode:bytes_result + __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size + __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size + + // transform_fxp inputs + __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input + __visc__edge(DnNode, TrNode, 1, 0, 1, 0); // DnNode:bytes_result -> TrNode:bytes_input + __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result + __visc__bindIn(TrNode, 13, 3, 0); // bytes_result_scaled -> TrNode:bytes_result + __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann + __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw + __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size + __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size + + // gamut_fxp inputs + __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input + __visc__edge(TrNode, GmNode, 1, 0, 1, 0); // TrNode:bytes_result -> GmNode:bytes_input + __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result + __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result + __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts + __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts + __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights + __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights + __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs + __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs + __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist + __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist + __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size + __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size + + // tone_map_fxp inputs + __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input + __visc__edge(GmNode, TnNode, 1, 0, 1, 0); // GmNode:bytes_result -> TnNode:bytes_input + __visc__bindIn(TnNode, 6, 
2, 0); // result_scaled -> TnNode:result + __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result + __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map + __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map + __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size + __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size + + // descale_fxp inputs + __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input + __visc__edge(TnNode, DsNode, 1, 0, 1, 0); // TnNode:bytes_result -> DsNode:bytes_input + __visc__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result + __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result + __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size + __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size // Similar to bindIn, but for the output. Output of a node is a struct, and // we consider the fields in increasing ordering. - __hpvm__bindOut(DsNode, 0, 0, 0); + __visc__bindOut(DsNode, 0, 0, 0); + } -int main(int argc, char *argv[]) { - // Parse the arguments. - arguments args; - set_default_args(&args); - argp_parse(&parser, argc, argv, 0, 0, &args); - - // Read a raw image. - // NOTE: We deliberately perform this file I/O outside of the kernel. - printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]); - size_t row_size, col_size; - uint8_t *image_in = - read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size); - - printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE); - - // Allocate a buffer for storing the output image data. - // (This is currently the same size as the input image data.) 
- size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE; - size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE; - uint8_t *image_out = (uint8_t *)malloc_aligned(bytes_image); - uint8_t *image_out_gamut = (uint8_t *)malloc_aligned(bytes_image); - uint8_t *image_out_demosaic = (uint8_t *)malloc_aligned(bytes_image); - uint8_t *image_out_denoise = (uint8_t *)malloc_aligned(bytes_image); - uint8_t *image_out_transform = (uint8_t *)malloc_aligned(bytes_image); - - __hpvm__init(); - - /////////////////////////////////////////////////////////////// - // Camera Model Parameters - /////////////////////////////////////////////////////////////// - // Path to the camera model to be used - // char cam_model_path[100]; - // char cam_model_path = "cam_models/NikonD7000/"; - // White balance index (select white balance from transform file) - // The first white balance in the file has a wb_index of 1 - // For more information on model format see the readme - int wb_index = 6; - - // Number of control points - int num_ctrl_pts = 3702; - uint8_t *input, *result; - float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, - *transform_out, *gamut_out; - float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist; - - TsTw = get_TsTw("cam_models/NikonD7000/", wb_index); - float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE); - free(TsTw); - TsTw = trans; - ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts); - weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts); - coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts); - tone_map = get_tone_map("cam_models/NikonD7000/"); - - input_scaled = (float *)malloc_aligned(bytes_fimage); - result_scaled = (float *)malloc_aligned(bytes_fimage); - demosaic_out = (float *)malloc_aligned(bytes_fimage); - denoise_out = (float *)malloc_aligned(bytes_fimage); - transform_out = (float *)malloc_aligned(bytes_fimage); - gamut_out = (float *)malloc_aligned(bytes_fimage); - l2_dist = 
(float *)malloc_aligned(sizeof(float) * num_ctrl_pts); - - // This is host_input in cam_pipe() - input = (uint8_t *)malloc_aligned(bytes_image); - convert_hwc_to_chw(image_in, row_size, col_size, &input); - - // This is host_result in cam_pipe() - result = (uint8_t *)malloc_aligned(bytes_image); - - // Allocate struct to pass DFG inputs - RootIn *rootArgs = (RootIn *)malloc(sizeof(RootIn)); - - // Set up HPVM DFG inputs in the rootArgs struct. - rootArgs->input = input; - rootArgs->bytes_input = bytes_image; - - rootArgs->result = result; - rootArgs->bytes_result = bytes_image; - - rootArgs->input_scaled = input_scaled; - rootArgs->bytes_input_scaled = bytes_fimage; - - rootArgs->result_scaled = result_scaled; - rootArgs->bytes_result_scaled = bytes_fimage; - - rootArgs->demosaic_out = demosaic_out; - rootArgs->bytes_demosaic_out = bytes_fimage; - - rootArgs->denoise_out = denoise_out; - rootArgs->bytes_denoise_out = bytes_fimage; - - rootArgs->transform_out = transform_out; - rootArgs->bytes_transform_out = bytes_fimage; - - rootArgs->gamut_out = gamut_out; - rootArgs->bytes_gamut_out = bytes_fimage; - - rootArgs->TsTw = TsTw; - rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float); - - rootArgs->ctrl_pts = ctrl_pts; - rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float); - - rootArgs->weights = weights; - rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float); - - rootArgs->coefs = coefs; - rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float); - - rootArgs->tone_map = tone_map; - rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float); - - rootArgs->l2_dist = l2_dist; - rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float); - - rootArgs->row_size = row_size; - rootArgs->col_size = col_size; - - // Memory tracking is required for pointer arguments. - // Nodes can be scheduled on different targets, and - // dataflow edge implementation needs to request data. 
- // The pair (pointer, size) is inserted in memory tracker using this call - llvm_hpvm_track_mem(input, bytes_image); - llvm_hpvm_track_mem(result, bytes_image); - llvm_hpvm_track_mem(input_scaled, bytes_fimage); - llvm_hpvm_track_mem(result_scaled, bytes_fimage); - llvm_hpvm_track_mem(demosaic_out, bytes_fimage); - llvm_hpvm_track_mem(denoise_out, bytes_fimage); - llvm_hpvm_track_mem(transform_out, bytes_fimage); - llvm_hpvm_track_mem(gamut_out, bytes_fimage); - llvm_hpvm_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); - llvm_hpvm_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float)); - llvm_hpvm_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float)); - llvm_hpvm_track_mem(coefs, 4 * CHAN_SIZE * sizeof(float)); - llvm_hpvm_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float)); - llvm_hpvm_track_mem(l2_dist, num_ctrl_pts * sizeof(float)); - - printf("\n\nLaunching CAVA pipeline!\n"); - - void *camPipeDFG = __hpvm__launch(0, CamPipeRoot, (void *)rootArgs); - __hpvm__wait(camPipeDFG); - - printf("\n\nPipeline execution completed!\n"); - printf("Pipeline final stage returned %lu; should be %lu\n", - rootArgs->ret.bytesRet, bytes_image); - printf("\n\nRequesting memory!\n"); - - // Request data from graph. 
- llvm_hpvm_request_mem(result, bytes_image); - llvm_hpvm_request_mem(demosaic_out, bytes_fimage); - llvm_hpvm_request_mem(denoise_out, bytes_fimage); - llvm_hpvm_request_mem(transform_out, bytes_fimage); - llvm_hpvm_request_mem(gamut_out, bytes_fimage); - printf("\n\nDone requesting memory!\n"); - - uint8_t *gamut_out_descaled = (uint8_t *)malloc_aligned(bytes_image); - uint8_t *demosaic_out_descaled = (uint8_t *)malloc_aligned(bytes_image); - uint8_t *transform_out_descaled = (uint8_t *)malloc_aligned(bytes_image); - uint8_t *denoise_out_descaled = (uint8_t *)malloc_aligned(bytes_image); - - descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, - row_size, col_size); - descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, - row_size, col_size); - descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, - row_size, col_size); - descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, - row_size, col_size); - - convert_chw_to_hwc(result, row_size, col_size, &image_out); - convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut); - convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, - &image_out_demosaic); - convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, - &image_out_denoise); - convert_chw_to_hwc(transform_out_descaled, row_size, col_size, - &image_out_transform); - - // Remove tracked pointers. - llvm_hpvm_untrack_mem(input); - llvm_hpvm_untrack_mem(result); - llvm_hpvm_untrack_mem(input_scaled); - llvm_hpvm_untrack_mem(result_scaled); - llvm_hpvm_untrack_mem(demosaic_out); - llvm_hpvm_untrack_mem(denoise_out); - llvm_hpvm_untrack_mem(transform_out); - llvm_hpvm_untrack_mem(gamut_out); - - llvm_hpvm_untrack_mem(TsTw); - llvm_hpvm_untrack_mem(ctrl_pts); - llvm_hpvm_untrack_mem(weights); - llvm_hpvm_untrack_mem(coefs); - llvm_hpvm_untrack_mem(tone_map); - llvm_hpvm_untrack_mem(l2_dist); - - // Output the image. 
- // NOTE: We deliberately perform this file I/O outside of the kernel. +int main(int argc, char* argv[]) { + // Parse the arguments. + arguments args; + set_default_args(&args); + argp_parse(&parser, argc, argv, 0, 0, &args); + + // Read a raw image. + // NOTE: We deliberately perform this file I/O outside of the kernel. + printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]); + size_t row_size, col_size; + uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size); + + printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE); + + // Allocate a buffer for storing the output image data. + // (This is currently the same size as the input image data.) + size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE; + size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE; + uint8_t *image_out = (uint8_t*) malloc_aligned(bytes_image); + uint8_t *image_out_gamut = (uint8_t*) malloc_aligned(bytes_image); + uint8_t *image_out_demosaic = (uint8_t*) malloc_aligned(bytes_image); + uint8_t *image_out_denoise = (uint8_t*) malloc_aligned(bytes_image); + uint8_t *image_out_transform = (uint8_t*) malloc_aligned(bytes_image); + + __visc__init(); + + /////////////////////////////////////////////////////////////// + // Camera Model Parameters + /////////////////////////////////////////////////////////////// + // Path to the camera model to be used +// char cam_model_path[100]; +// char cam_model_path = "cam_models/NikonD7000/"; + // White balance index (select white balance from transform file) + // The first white balance in the file has a wb_index of 1 + // For more information on model format see the readme + int wb_index = 6; + + // Number of control points + int num_ctrl_pts = 3702; + uint8_t *input, *result; + float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, *transform_out, *gamut_out; + float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist; + + TsTw = 
get_TsTw("cam_models/NikonD7000/", wb_index); + float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE); + free(TsTw); + TsTw = trans; + ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts); + weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts); + coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts); + tone_map = get_tone_map("cam_models/NikonD7000/"); + + input_scaled = (float*) malloc_aligned(bytes_fimage); + result_scaled = (float*) malloc_aligned(bytes_fimage); + demosaic_out = (float*) malloc_aligned(bytes_fimage); + denoise_out = (float*) malloc_aligned(bytes_fimage); + transform_out = (float*) malloc_aligned(bytes_fimage); + gamut_out = (float*) malloc_aligned(bytes_fimage); + l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts); + + // This is host_input in cam_pipe() + input = (uint8_t*) malloc_aligned(bytes_image); + convert_hwc_to_chw(image_in, row_size, col_size, &input); + + // This is host_result in cam_pipe() + result = (uint8_t*) malloc_aligned(bytes_image); + + // Allocate struct to pass DFG inputs + RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn)); + + // Set up HPVM DFG inputs in the rootArgs struct. 
+ rootArgs->input = input; + rootArgs->bytes_input = bytes_image; + + rootArgs->result = result; + rootArgs->bytes_result = bytes_image; + + rootArgs->input_scaled = input_scaled; + rootArgs->bytes_input_scaled = bytes_fimage; + + rootArgs->result_scaled = result_scaled; + rootArgs->bytes_result_scaled = bytes_fimage; + + rootArgs->demosaic_out = demosaic_out; + rootArgs->bytes_demosaic_out = bytes_fimage; + + rootArgs->denoise_out = denoise_out; + rootArgs->bytes_denoise_out = bytes_fimage; + + rootArgs->transform_out = transform_out; + rootArgs->bytes_transform_out = bytes_fimage; + + rootArgs->gamut_out = gamut_out; + rootArgs->bytes_gamut_out = bytes_fimage; + + rootArgs->TsTw = TsTw; + rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float); + + rootArgs->ctrl_pts = ctrl_pts; + rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float); + + rootArgs->weights = weights; + rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float); + + rootArgs->coefs = coefs; + rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float); + + rootArgs->tone_map = tone_map; + rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float); + + rootArgs->l2_dist = l2_dist; + rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float); + + rootArgs->row_size = row_size; + rootArgs->col_size = col_size; + + // Memory tracking is required for pointer arguments. + // Nodes can be scheduled on different targets, and + // dataflow edge implementation needs to request data. 
+ // The pair (pointer, size) is inserted in memory tracker using this call + llvm_visc_track_mem(input, bytes_image); + llvm_visc_track_mem(result, bytes_image); + llvm_visc_track_mem(input_scaled, bytes_fimage); + llvm_visc_track_mem(result_scaled, bytes_fimage); + llvm_visc_track_mem(demosaic_out, bytes_fimage); + llvm_visc_track_mem(denoise_out, bytes_fimage); + llvm_visc_track_mem(transform_out, bytes_fimage); + llvm_visc_track_mem(gamut_out, bytes_fimage); + llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(coefs, 4 * CHAN_SIZE *sizeof(float)); + llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float)); + + printf("\n\nLaunching CAVA pipeline!\n"); + + void* camPipeDFG = __visc__launch(0, CamPipeRoot, (void*) rootArgs); + __visc__wait(camPipeDFG); + + printf("\n\nPipeline execution completed!\n"); + printf("\n\nRequesting memory!\n"); + + // Request data from graph. 
+ llvm_visc_request_mem(result, bytes_image); + llvm_visc_request_mem(demosaic_out, bytes_fimage); + llvm_visc_request_mem(denoise_out, bytes_fimage); + llvm_visc_request_mem(transform_out, bytes_fimage); + llvm_visc_request_mem(gamut_out, bytes_fimage); + printf("\n\nDone requesting memory!\n"); + + + uint8_t* gamut_out_descaled = (uint8_t*) malloc_aligned(bytes_image); + uint8_t* demosaic_out_descaled = (uint8_t*) malloc_aligned(bytes_image); + uint8_t* transform_out_descaled = (uint8_t*) malloc_aligned(bytes_image); + uint8_t* denoise_out_descaled = (uint8_t*) malloc_aligned(bytes_image); + + descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, row_size, col_size); + descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, row_size, col_size); + descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, row_size, col_size); + descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, row_size, col_size); + + convert_chw_to_hwc(result, row_size, col_size, &image_out); + convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut); + convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, &image_out_demosaic); + convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, &image_out_denoise); + convert_chw_to_hwc(transform_out_descaled, row_size, col_size, &image_out_transform); + + + // Remove tracked pointers. + llvm_visc_untrack_mem(input); + llvm_visc_untrack_mem(result); + llvm_visc_untrack_mem(input_scaled); + llvm_visc_untrack_mem(result_scaled); + llvm_visc_untrack_mem(demosaic_out); + llvm_visc_untrack_mem(denoise_out); + llvm_visc_untrack_mem(transform_out); + llvm_visc_untrack_mem(gamut_out); + + llvm_visc_untrack_mem(TsTw); + llvm_visc_untrack_mem(ctrl_pts); + llvm_visc_untrack_mem(weights); + llvm_visc_untrack_mem(coefs); + llvm_visc_untrack_mem(tone_map); + llvm_visc_untrack_mem(l2_dist); + + // Output the image. 
+ // NOTE: We deliberately perform this file I/O outside of the kernel. char str[50], base_str[50]; strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]); strcpy(str, base_str); @@ -912,7 +877,8 @@ int main(int argc, char *argv[]) { printf("Writing output image to %s\n", str); write_image_to_binary(str, image_out_transform, row_size, col_size); - __hpvm__cleanup(); + __visc__cleanup(); - return 0; + return 0; } + diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.c b/hpvm/test/hpvm-cava/src/pipe_stages.c index 05bb06697f..2ebedec936 100644 --- a/hpvm/test/hpvm-cava/src/pipe_stages.c +++ b/hpvm/test/hpvm-cava/src/pipe_stages.c @@ -1,169 +1,172 @@ +#include <stdio.h> +#include <math.h> #include "pipe_stages.h" #include "cam_pipe_utility.h" -#include <math.h> -#include <stdio.h> - -// void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) { -void scale_fxp(uint8_t *input, size_t bytes_input, float *output, - size_t bytes_output, int row_size, int col_size) { - __hpvm__hint(DEVICE); - __hpvm__attributes(2, input, output, 1, output); +//void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) { +void scale_fxp(uint8_t *input, size_t bytes_input, + float *output, size_t bytes_output, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, output, 1, output); + ARRAY_3D(uint8_t, _input, input, row_size, col_size); ARRAY_3D(float, _output, output, row_size, col_size); -sl_chan: + sl_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - sl_row: + sl_row: for (int row = 0; row < row_size; row++) - sl_col: + sl_col: for (int col = 0; col < col_size; col++) _output[chan][row][col] = _input[chan][row][col] * 1.0 / 255; - __hpvm__return(1, bytes_output); + __visc__return(1, bytes_output); } -// void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) { -void descale_fxp(float *input, size_t bytes_input, uint8_t *output, - size_t bytes_result, int row_size, int col_size) { - __hpvm__hint(DEVICE); - 
__hpvm__attributes(2, input, output, 1, output); - +//void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) { +void descale_fxp(float *input, size_t bytes_input, + uint8_t *output, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, output, 1, output); + ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(uint8_t, _output, output, row_size, col_size); -dsl_chan: + dsl_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - dsl_row: + dsl_row: for (int row = 0; row < row_size; row++) - dsl_col: + dsl_col: for (int col = 0; col < col_size; col++) - _output[chan][row][col] = - min(max(_input[chan][row][col] * 255, 0), 255); + _output[chan][row][col] = min(max(_input[chan][row][col] * 255, 0), 255); - __hpvm__return(1, bytes_output); + __visc__return(1, bytes_output); } // Demosaicing stage // G R // B G -// void demosaic_fxp(float *input, int row_size, int col_size, float *result) { -void demosaic_fxp(float *input, size_t bytes_input, float *result, - size_t bytes_result, int row_size, int col_size) { - __hpvm__hint(DEVICE); - __hpvm__attributes(2, input, result, 1, result); - +//void demosaic_fxp(float *input, int row_size, int col_size, float *result) { +void demosaic_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, result, 1, result); + printf("Demosaicing.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); -dm_row: + dm_row: for (int row = 1; row < row_size - 1; row++) - dm_col: + dm_col: for (int col = 1; col < col_size - 1; col++) - if (row % 2 == 0 && col % 2 == 0) { - // Green pixel - // Getting the R values - float R1 = _input[0][row][col - 1]; - float R2 = _input[0][row][col + 1]; - // Getting the B values - float B1 = _input[2][row - 1][col]; - float B2 = _input[2][row + 1][col]; - // R - 
_result[0][row][col] = (R1 + R2) / 2; - // G - _result[1][row][col] = _input[1][row][col] * 2; - // B - _result[2][row][col] = (B1 + B2) / 2; - } else if (row % 2 == 0 && col % 2 == 1) { - // Red pixel - // Getting the G values - float G1 = _input[1][row - 1][col]; - float G2 = _input[1][row + 1][col]; - float G3 = _input[1][row][col - 1]; - float G4 = _input[1][row][col + 1]; - // Getting the B values - float B1 = _input[2][row - 1][col - 1]; - float B2 = _input[2][row - 1][col + 1]; - float B3 = _input[2][row + 1][col - 1]; - float B4 = _input[2][row + 1][col + 1]; - // R - _result[0][row][col] = _input[0][row][col]; - // G - _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; - // B (center pixel) - _result[2][row][col] = (B1 + B2 + B3 + B4) / 4; - } else if (row % 2 == 1 && col % 2 == 0) { - // Blue pixel - // Getting the R values - float R1 = _input[0][row - 1][col - 1]; - float R2 = _input[0][row + 1][col - 1]; - float R3 = _input[0][row - 1][col + 1]; - float R4 = _input[0][row + 1][col + 1]; - // Getting the G values - float G1 = _input[1][row - 1][col]; - float G2 = _input[1][row + 1][col]; - float G3 = _input[1][row][col - 1]; - float G4 = _input[1][row][col + 1]; - // R - _result[0][row][col] = (R1 + R2 + R3 + R4) / 4; - // G - _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; - // B - _result[2][row][col] = _input[2][row][col]; - } else { - // Bottom Green pixel - // Getting the R values - float R1 = _input[0][row - 1][col]; - float R2 = _input[0][row + 1][col]; - // Getting the B values - float B1 = _input[2][row][col - 1]; - float B2 = _input[2][row][col + 1]; - // R - _result[0][row][col] = (R1 + R2) / 2; - // G - _result[1][row][col] = _input[1][row][col] * 2; - // B - _result[2][row][col] = (B1 + B2) / 2; - } + if (row % 2 == 0 && col % 2 == 0) { + // Green pixel + // Getting the R values + float R1 = _input[0][row][col - 1]; + float R2 = _input[0][row][col + 1]; + // Getting the B values + float B1 = _input[2][row - 1][col]; + float B2 = _input[2][row + 
1][col]; + // R + _result[0][row][col] = (R1 + R2) / 2; + // G + _result[1][row][col] = _input[1][row][col] * 2; + // B + _result[2][row][col] = (B1 + B2) / 2; + } else if (row % 2 == 0 && col % 2 == 1) { + // Red pixel + // Getting the G values + float G1 = _input[1][row - 1][col]; + float G2 = _input[1][row + 1][col]; + float G3 = _input[1][row][col - 1]; + float G4 = _input[1][row][col + 1]; + // Getting the B values + float B1 = _input[2][row - 1][col - 1]; + float B2 = _input[2][row - 1][col + 1]; + float B3 = _input[2][row + 1][col - 1]; + float B4 = _input[2][row + 1][col + 1]; + // R + _result[0][row][col] = _input[0][row][col]; + // G + _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; + // B (center pixel) + _result[2][row][col] = (B1 + B2 + B3 + B4) / 4; + } else if (row % 2 == 1 && col % 2 == 0) { + // Blue pixel + // Getting the R values + float R1 = _input[0][row - 1][col - 1]; + float R2 = _input[0][row + 1][col - 1]; + float R3 = _input[0][row - 1][col + 1]; + float R4 = _input[0][row + 1][col + 1]; + // Getting the G values + float G1 = _input[1][row - 1][col]; + float G2 = _input[1][row + 1][col]; + float G3 = _input[1][row][col - 1]; + float G4 = _input[1][row][col + 1]; + // R + _result[0][row][col] = (R1 + R2 + R3 + R4) / 4; + // G + _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; + // B + _result[2][row][col] = _input[2][row][col]; + } else { + // Bottom Green pixel + // Getting the R values + float R1 = _input[0][row - 1][col]; + float R2 = _input[0][row + 1][col]; + // Getting the B values + float B1 = _input[2][row][col - 1]; + float B2 = _input[2][row][col + 1]; + // R + _result[0][row][col] = (R1 + R2) / 2; + // G + _result[1][row][col] = _input[1][row][col] * 2; + // B + _result[2][row][col] = (B1 + B2) / 2; + } - __hpvm__return(1, bytes_result); + __visc__return(1, bytes_result); } static void sort(float arr[], int n) { - int i, j; -dn_sort_i: - for (i = 0; i < n - 1; i++) - dn_sort_j: - for (j = 0; j < n - i - 1; j++) - if (arr[j] > 
arr[j + 1]) { - float temp = arr[j]; - arr[j] = arr[j + 1]; - arr[j + 1] = temp; - } + int i, j; + dn_sort_i: + for (i = 0; i < n - 1; i++) + dn_sort_j: + for (j = 0; j < n - i - 1; j++) + if (arr[j] > arr[j + 1]) { + float temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } } // Simple denoise -// void denoise_fxp(float *input, int row_size, int col_size, float *result) { -void denoise_fxp(float *input, size_t bytes_input, float *result, - size_t bytes_result, int row_size, int col_size) { - __hpvm__hint(DEVICE); - __hpvm__attributes(2, input, result, 1, result); - +//void denoise_fxp(float *input, int row_size, int col_size, float *result) { +void denoise_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, result, 1, result); + printf("Denoising.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); -dn_chan: + dn_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - dn_row: + dn_row: for (int row = 0; row < row_size; row++) - dn_col: + dn_col: for (int col = 0; col < col_size; col++) if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { float filter[9]; - dn_slide_row: - for (int i = row - 1; i < row + 2; i++) - dn_slide_col: - for (int j = col - 1; j < col + 2; j++) { + dn_slide_row: + for (int i = row-1; i < row+2; i++) + dn_slide_col: + for (int j = col-1; j < col+2; j++) { int index = (i - row + 1) * 3 + j - col + 1; filter[index] = _input[chan][i][j]; } @@ -172,52 +175,53 @@ dn_chan: } else { _result[chan][row][col] = _input[chan][row][col]; } - __hpvm__return(1, bytes_result); + __visc__return(1, bytes_result); } // Color map and white balance transform -// void transform_fxp(float *input, int row_size, int col_size, float *result, +//void transform_fxp(float *input, int row_size, int col_size, float *result, // float *TsTw_tran) { -void transform_fxp(float 
*input, size_t bytes_input, float *result, - size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw, +void transform_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *TsTw_tran, size_t bytes_TsTw, int row_size, int col_size) { - __hpvm__hint(DEVICE); - __hpvm__attributes(3, input, result, TsTw_tran, 1, result); - + __visc__hint(DEVICE); + __visc__attributes(3, input, result, TsTw_tran, 1, result); + printf("Color mapping.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); ARRAY_2D(float, _TsTw_tran, TsTw_tran, 3); -tr_chan: + tr_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - tr_row: + tr_row: for (int row = 0; row < row_size; row++) - tr_col: + tr_col: for (int col = 0; col < col_size; col++) _result[chan][row][col] = max(_input[0][row][col] * _TsTw_tran[0][chan] + _input[1][row][col] * _TsTw_tran[1][chan] + _input[2][row][col] * _TsTw_tran[2][chan], 0); - __hpvm__return(1, bytes_result); + __visc__return(1, bytes_result); } // // Weighted radial basis function for gamut mapping // -// void gamut_map_fxp(float *input, int row_size, int col_size, float *result, -// float *ctrl_pts, float *weights, float *coefs, float -// *l2_dist) { -void gamut_map_fxp(float *input, size_t bytes_input, float *result, - size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts, - float *weights, size_t bytes_weights, float *coefs, - size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist, +//void gamut_map_fxp(float *input, int row_size, int col_size, float *result, +// float *ctrl_pts, float *weights, float *coefs, float *l2_dist) { +void gamut_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, + float *coefs, size_t bytes_coefs, + float *l2_dist, size_t bytes_l2_dist, int row_size, int col_size) { - __hpvm__hint(DEVICE); - __hpvm__attributes(6, input, result, ctrl_pts, 
weights, coefs, l2_dist, 1, - result); - + __visc__hint(DEVICE); + __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result); + printf("Gamut mapping.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); @@ -225,25 +229,26 @@ void gamut_map_fxp(float *input, size_t bytes_input, float *result, ARRAY_2D(float, _weights, weights, 3); ARRAY_2D(float, _coefs, coefs, 3); -// First, get the L2 norm from every pixel to the control points, -// Then, sum it and weight it. Finally, add the bias. -gm_rbf_row: + // First, get the L2 norm from every pixel to the control points, + // Then, sum it and weight it. Finally, add the bias. + gm_rbf_row: for (int row = 0; row < row_size; row++) - gm_rbf_col: + gm_rbf_col: for (int col = 0; col < col_size; col++) { - gm_rbf_cp0: + gm_rbf_cp0: for (int cp = 0; cp < num_ctrl_pts; cp++) { - l2_dist[cp] = sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) * - (_input[0][row][col] - _ctrl_pts[cp][0]) + - (_input[1][row][col] - _ctrl_pts[cp][1]) * - (_input[1][row][col] - _ctrl_pts[cp][1]) + - (_input[2][row][col] - _ctrl_pts[cp][2]) * - (_input[2][row][col] - _ctrl_pts[cp][2])); + l2_dist[cp] = + sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) * + (_input[0][row][col] - _ctrl_pts[cp][0]) + + (_input[1][row][col] - _ctrl_pts[cp][1]) * + (_input[1][row][col] - _ctrl_pts[cp][1]) + + (_input[2][row][col] - _ctrl_pts[cp][2]) * + (_input[2][row][col] - _ctrl_pts[cp][2])); } - gm_rbf_chan: + gm_rbf_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) { float chan_val = 0.0; - gm_rbf_cp1: + gm_rbf_cp1: for (int cp = 0; cp < num_ctrl_pts; cp++) { chan_val += l2_dist[cp] * _weights[cp][chan]; } @@ -254,31 +259,32 @@ gm_rbf_row: _result[chan][row][col] = max(chan_val, 0); } } - __hpvm__return(1, bytes_result); + __visc__return(1, bytes_result); } // Tone mapping -// void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map, +//void tone_map_fxp(float *input, int 
row_size, int col_size, float *tone_map, // float *result) { -void tone_map_fxp(float *input, size_t bytes_input, float *result, - size_t bytes_result, float *tone_map, size_t bytes_tone_map, +void tone_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *tone_map, size_t bytes_tone_map, int row_size, int col_size) { - __hpvm__hint(DEVICE); - __hpvm__attributes(3, input, result, tone_map, 1, result); - + __visc__hint(DEVICE); + __visc__attributes(3, input, result, tone_map, 1, result); + printf("Tone mapping.\n"); ARRAY_3D(float, _input, input, row_size, col_size); ARRAY_3D(float, _result, result, row_size, col_size); ARRAY_2D(float, _tone_map, tone_map, 3); -tm_chan: + tm_chan: for (int chan = 0; chan < CHAN_SIZE; chan++) - tm_row: + tm_row: for (int row = 0; row < row_size; row++) - tm_col: + tm_col: for (int col = 0; col < col_size; col++) { uint8_t x = _input[chan][row][col] * 255; _result[chan][row][col] = _tone_map[x][chan]; } - __hpvm__return(1, bytes_result); + __visc__return(1, bytes_result); } diff --git a/hpvm/test/include/hpvm.h b/hpvm/test/include/hpvm.h deleted file mode 100644 index 1e31c98946..0000000000 --- a/hpvm/test/include/hpvm.h +++ /dev/null @@ -1,73 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -#ifndef DEVICE -#define DEVICE GPU_TARGET -#endif - -#include "../../include/SupportHPVM/HPVMHint.h" - -#ifndef __cplusplus -#define noexcept -#endif - -#ifdef __cplusplus -extern "C" { -void __hpvm__hint(hpvm::Target) noexcept; -#else -void __hpvm__hint(enum Target) noexcept; -#endif - -void *__hpvm__createNodeND(unsigned, ...) noexcept; -void __hpvm__return(unsigned, ...) noexcept; - -void __hpvm__attributes(unsigned, ...) 
noexcept; -void __hpvm__init() noexcept; -void __hpvm__cleanup() noexcept; - -void __hpvm__bindIn(void *, unsigned, unsigned, unsigned) noexcept; -void __hpvm__bindOut(void *, unsigned, unsigned, unsigned) noexcept; -void *__hpvm__edge(void *, void *, unsigned, unsigned, unsigned, - unsigned) noexcept; - -void __hpvm__push(void *, void *) noexcept; -void *__hpvm__pop(void *) noexcept; -void *__hpvm__launch(unsigned, ...) noexcept; -void __hpvm__wait(void *) noexcept; - -void *__hpvm__getNode() noexcept; -void *__hpvm__getParentNode(void *) noexcept; -void __hpvm__barrier() noexcept; -void *__hpvm__malloc(long) noexcept; -long __hpvm__getNodeInstanceID_x(void *) noexcept; -long __hpvm__getNodeInstanceID_y(void *) noexcept; -long __hpvm__getNodeInstanceID_z(void *) noexcept; -long __hpvm__getNumNodeInstances_x(void *) noexcept; -long __hpvm__getNumNodeInstances_y(void *) noexcept; -long __hpvm__getNumNodeInstances_z(void *) noexcept; - -// Atomic -// signed int -int __hpvm__atomic_add(int *, int) noexcept; -int __hpvm__atomic_sub(int *, int) noexcept; -int __hpvm__atomic_xchg(int *, int) noexcept; -int __hpvm__atomic_inc(int *) noexcept; -int __hpvm__atomic_dec(int *) noexcept; -int __hpvm__atomic_min(int *, int) noexcept; -int __hpvm__atomic_max(int *, int) noexcept; -int __hpvm__atomic_and(int *, int) noexcept; -int __hpvm__atomic_or(int *, int) noexcept; -int __hpvm__atomic_xor(int *, int) noexcept; - -void llvm_hpvm_track_mem(void *, size_t) noexcept; -void llvm_hpvm_untrack_mem(void *) noexcept; -void llvm_hpvm_request_mem(void *, size_t) noexcept; - -#ifdef __cplusplus -} -#endif diff --git a/hpvm/test/include/visc.h b/hpvm/test/include/visc.h new file mode 100644 index 0000000000..18b2950026 --- /dev/null +++ b/hpvm/test/include/visc.h @@ -0,0 +1,73 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + 
***************************************************************************/ + +#ifndef DEVICE +#define DEVICE GPU_TARGET +#endif + +#include "../../include/SupportVISC/VISCHint.h" + +#ifndef __cplusplus +#define noexcept +#endif + +#ifdef __cplusplus +extern "C" { +void __visc__hint(visc::Target) noexcept; +#else +void __visc__hint(enum Target) noexcept; +#endif + +void *__visc__createNodeND(unsigned, ...) noexcept; +void __visc__return(unsigned, ...) noexcept; + +void __visc__attributes(unsigned, ...) noexcept; +void __visc__init() noexcept; +void __visc__cleanup() noexcept; + +void __visc__bindIn(void *, unsigned, unsigned, unsigned) noexcept; +void __visc__bindOut(void *, unsigned, unsigned, unsigned) noexcept; +void *__visc__edge(void *, void *, unsigned, unsigned, unsigned, + unsigned) noexcept; + +void __visc__push(void *, void *) noexcept; +void *__visc__pop(void *) noexcept; +void *__visc__launch(unsigned, ...) noexcept; +void __visc__wait(void *) noexcept; + +void *__visc__getNode() noexcept; +void *__visc__getParentNode(void *) noexcept; +void __visc__barrier() noexcept; +void *__visc__malloc(long) noexcept; +long __visc__getNodeInstanceID_x(void *) noexcept; +long __visc__getNodeInstanceID_y(void *) noexcept; +long __visc__getNodeInstanceID_z(void *) noexcept; +long __visc__getNumNodeInstances_x(void *) noexcept; +long __visc__getNumNodeInstances_y(void *) noexcept; +long __visc__getNumNodeInstances_z(void *) noexcept; + +// Atomic +// signed int +int __visc__atomic_add(int *, int) noexcept; +int __visc__atomic_sub(int *, int) noexcept; +int __visc__atomic_xchg(int *, int) noexcept; +int __visc__atomic_inc(int *) noexcept; +int __visc__atomic_dec(int *) noexcept; +int __visc__atomic_min(int *, int) noexcept; +int __visc__atomic_max(int *, int) noexcept; +int __visc__atomic_and(int *, int) noexcept; +int __visc__atomic_or(int *, int) noexcept; +int __visc__atomic_xor(int *, int) noexcept; + +void llvm_visc_track_mem(void *, size_t) noexcept; +void 
llvm_visc_untrack_mem(void *) noexcept; +void llvm_visc_request_mem(void *, size_t) noexcept; + +#ifdef __cplusplus +} +#endif diff --git a/hpvm/test/parboil/RUN.parboil.script b/hpvm/test/parboil/RUN.parboil.script index 5cedcf480d..7f8c01ede7 100644 --- a/hpvm/test/parboil/RUN.parboil.script +++ b/hpvm/test/parboil/RUN.parboil.script @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s ; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc ; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s -; RUN: llvm-link %t.ll %llvm_src/projects/hpvm-rt/hpvm-rt.ll parboil.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll parboil.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin diff --git a/hpvm/test/parboil/benchmarks/bfs/Makefile b/hpvm/test/parboil/benchmarks/bfs/Makefile index e40a8484a3..cc6db67829 100644 --- a/hpvm/test/parboil/benchmarks/bfs/Makefile +++ b/hpvm/test/parboil/benchmarks/bfs/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = bfs -# Default compile hpvm +# Default compile visc ifeq ($(VERSION),) - VERSION = hpvm + VERSION = visc endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll index aca5667b70..9abdb29a3c 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/bfs/src/opencl_base/kernel.cl' +; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/bfs/src/opencl_base/kernel.cl' target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp index 8e0d34c4b8..9b8b502688 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp @@ -237,7 +237,7 @@ int main(int argc, char **argv) { NULL, NULL)); printf("Starting GPU kernel\n"); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); int num_of_blocks; int num_of_threads_per_block; @@ -272,7 +272,7 @@ int main(int argc, char **argv) { OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0, sizeof(int), &zero, 0, NULL, NULL)); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); if (num_t == 0) { // frontier is empty break; } diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp index cfd0bf870a..3f9bc77557 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp @@ -428,7 +428,7 @@ int main(int argc, char **argv) { OCL_ERRCK_RETVAL(clSetKernelArg( BFS_kernel_S, 14, MAX_THREADS_PER_BLOCK * sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S, 15, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_S, 1, 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); @@ -458,7 +458,7 @@ int main(int argc, char **argv) { 
OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1, 0, grid, block, 0, 0, 0)); @@ -490,7 +490,7 @@ int main(int argc, char **argv) { OCL_ERRCK_RETVAL( clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1, 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); @@ -542,7 +542,7 @@ int main(int argc, char **argv) { OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 16, sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1, 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); @@ -572,7 +572,7 @@ int main(int argc, char **argv) { clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL)); OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL)); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1, 0, grid, block, 0, 0, 0)); OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); diff --git a/hpvm/test/parboil/benchmarks/bfs/src/hpvm/Makefile b/hpvm/test/parboil/benchmarks/bfs/src/visc/Makefile similarity index 81% rename from hpvm/test/parboil/benchmarks/bfs/src/hpvm/Makefile rename to 
hpvm/test/parboil/benchmarks/bfs/src/visc/Makefile index 27cde148f7..a459707110 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/hpvm/Makefile +++ b/hpvm/test/parboil/benchmarks/bfs/src/visc/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS= -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/bfs/src/hpvm/config.h b/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h similarity index 100% rename from hpvm/test/parboil/benchmarks/bfs/src/hpvm/config.h rename to hpvm/test/parboil/benchmarks/bfs/src/visc/config.h diff --git a/hpvm/test/parboil/benchmarks/bfs/src/hpvm/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp similarity index 70% rename from hpvm/test/parboil/benchmarks/bfs/src/hpvm/main.cpp rename to hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp index 0fa9a60df8..9491218e5e 100644 --- a/hpvm/test/parboil/benchmarks/bfs/src/hpvm/main.cpp +++ b/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp @@ -26,11 +26,11 @@ */ #include "config.h" #include "parboil.h" -#include <hpvm.h> #include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <visc.h> /********** Define colors for BFS @@ -113,11 +113,11 @@ void packData(RootIn *args, int *q1, size_t bytesq1, int *q2, size_t bytesq2, void Allocation(long block) { // Memory shared between threadblocks - void *local_q_tail = __hpvm__malloc(sizeof(int)); - void *local_q = __hpvm__malloc(LOCAL_MEM_SIZE * sizeof(int)); - void *shift = __hpvm__malloc(sizeof(int)); + void *local_q_tail = __visc__malloc(sizeof(int)); + void *local_q = __visc__malloc(LOCAL_MEM_SIZE * sizeof(int)); + void *shift = __visc__malloc(sizeof(int)); - __hpvm__return(6, local_q_tail, sizeof(int), local_q, + __visc__return(6, local_q_tail, sizeof(int), local_q, LOCAL_MEM_SIZE * sizeof(int), shift, sizeof(int)); } 
@@ -133,21 +133,21 @@ void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2, int *local_q_tail, size_t byteslocal_q_tail, int *local_q, size_t byteslocal_q, int *shift, size_t bytesshift) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + __visc__hint(visc::DEVICE); + __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, 4, q2, g_color, g_cost, tail); - void *thisNode = __hpvm__getNode(); - void *parentNode = __hpvm__getParentNode(thisNode); - int lx = __hpvm__getNodeInstanceID_x(thisNode); - int gx = __hpvm__getNodeInstanceID_x(parentNode); - int dimx = __hpvm__getNumNodeInstances_x(thisNode); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int dimx = __visc__getNumNodeInstances_x(thisNode); if (lx == 0) { *local_q_tail = 0; // initialize the tail of w-queue } - __hpvm__barrier(); + __visc__barrier(); // first, propagate and add the new frontier elements into w-queues // int tid = get_group_id(0)*MAX_THREADS_PER_BLOCK + get_local_id(0); @@ -170,16 +170,16 @@ void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2, int cost = cur_edge.y; cost += cur_cost; - int orig_cost = __hpvm__atomic_min(&g_cost[id], cost); + int orig_cost = __visc__atomic_min(&g_cost[id], cost); if (orig_cost > cost) { // the node should be visited if (g_color[id] > UP_LIMIT) { - int old_color = __hpvm__atomic_xchg(&g_color[id], gray_shade); + int old_color = __visc__atomic_xchg(&g_color[id], gray_shade); // this guarantees that only one thread will push this node // into a queue if (old_color != gray_shade) { // atomic operation guarantees the correctness // even if multiple warps are executing simultaneously - int index = __hpvm__atomic_add(local_q_tail, 1); + int index = __visc__atomic_add(local_q_tail, 1); local_q[index] = id; } } @@ -187,16 
+187,16 @@ void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2, } } - __hpvm__barrier(); + __visc__barrier(); if (lx == 0) { int tot_sum = *local_q_tail; // the offset or "shift" of the block-level queue within the grid-level // queue is determined by atomic operation - *shift = __hpvm__atomic_add(tail, tot_sum); + *shift = __visc__atomic_add(tail, tot_sum); } - __hpvm__barrier(); + __visc__barrier(); // shift within a w-queue int local_shift = lx; @@ -220,41 +220,41 @@ void BlockingBFS(int *q1, size_t bytesq1, int *q2, size_t bytesq2, // ideally be placed in local memory int *local_q_tail, size_t byteslocal_q_tail, int *local_q, size_t byteslocal_q, int *shift, size_t bytesshift) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + __visc__hint(visc::CPU_TARGET); + __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, 4, q2, g_color, g_cost, tail); - void *AllocationNode = __hpvm__createNodeND(0, Allocation); - void *BFSLeafNode = __hpvm__createNodeND(1, BFSLeaf, block); + void *AllocationNode = __visc__createNodeND(0, Allocation); + void *BFSLeafNode = __visc__createNodeND(1, BFSLeaf, block); // Bind edges - __hpvm__bindIn(AllocationNode, 17, 0, 0); // Bind block - __hpvm__bindIn(BFSLeafNode, 0, 0, 0); // Bind q1 - __hpvm__bindIn(BFSLeafNode, 1, 1, 0); // Bind bytes_q1 - __hpvm__bindIn(BFSLeafNode, 2, 2, 0); // Bind q2 - __hpvm__bindIn(BFSLeafNode, 3, 3, 0); // Bind bytes_q2 - __hpvm__bindIn(BFSLeafNode, 4, 4, 0); // Bind graph_nodes - __hpvm__bindIn(BFSLeafNode, 5, 5, 0); // Bind bytes_graph_nodes - __hpvm__bindIn(BFSLeafNode, 6, 6, 0); // Bind graph_edges - __hpvm__bindIn(BFSLeafNode, 7, 7, 0); // Bind bytes_graph_edges - __hpvm__bindIn(BFSLeafNode, 8, 8, 0); // Bind color - __hpvm__bindIn(BFSLeafNode, 9, 9, 0); // Bind bytes_color - __hpvm__bindIn(BFSLeafNode, 10, 10, 0); // Bind cost - __hpvm__bindIn(BFSLeafNode, 11, 11, 0); // Bind bytes_cost - 
__hpvm__bindIn(BFSLeafNode, 12, 12, 0); // Bind tail - __hpvm__bindIn(BFSLeafNode, 13, 13, 0); // Bind bytes_tail - __hpvm__bindIn(BFSLeafNode, 14, 14, 0); // Bind no_of_nodes - __hpvm__bindIn(BFSLeafNode, 15, 15, 0); // Bind gray_shade - __hpvm__bindIn(BFSLeafNode, 16, 16, 0); // Bind k + __visc__bindIn(AllocationNode, 17, 0, 0); // Bind block + __visc__bindIn(BFSLeafNode, 0, 0, 0); // Bind q1 + __visc__bindIn(BFSLeafNode, 1, 1, 0); // Bind bytes_q1 + __visc__bindIn(BFSLeafNode, 2, 2, 0); // Bind q2 + __visc__bindIn(BFSLeafNode, 3, 3, 0); // Bind bytes_q2 + __visc__bindIn(BFSLeafNode, 4, 4, 0); // Bind graph_nodes + __visc__bindIn(BFSLeafNode, 5, 5, 0); // Bind bytes_graph_nodes + __visc__bindIn(BFSLeafNode, 6, 6, 0); // Bind graph_edges + __visc__bindIn(BFSLeafNode, 7, 7, 0); // Bind bytes_graph_edges + __visc__bindIn(BFSLeafNode, 8, 8, 0); // Bind color + __visc__bindIn(BFSLeafNode, 9, 9, 0); // Bind bytes_color + __visc__bindIn(BFSLeafNode, 10, 10, 0); // Bind cost + __visc__bindIn(BFSLeafNode, 11, 11, 0); // Bind bytes_cost + __visc__bindIn(BFSLeafNode, 12, 12, 0); // Bind tail + __visc__bindIn(BFSLeafNode, 13, 13, 0); // Bind bytes_tail + __visc__bindIn(BFSLeafNode, 14, 14, 0); // Bind no_of_nodes + __visc__bindIn(BFSLeafNode, 15, 15, 0); // Bind gray_shade + __visc__bindIn(BFSLeafNode, 16, 16, 0); // Bind k // Create Edges between AllocationNode and BFSLeafNodeNode - __hpvm__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail - __hpvm__edge(AllocationNode, BFSLeafNode, 1, 1, 18, + __visc__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail + __visc__edge(AllocationNode, BFSLeafNode, 1, 1, 18, 0); // Edge bytes_local_q_tail - __hpvm__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q - __hpvm__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q - __hpvm__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); // Edge shift - __hpvm__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift + 
__visc__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q + __visc__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q + __visc__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); // Edge shift + __visc__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift } // VoidRetTy @@ -264,30 +264,30 @@ void BFS_Root(int *q1, size_t bytesq1, int *q2, size_t bytesq2, int *g_color, size_t bytesg_color, int *g_cost, size_t bytesg_cost, int *tail, size_t bytestail, int no_of_nodes, int gray_shade, int k, long block, long grid) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + __visc__hint(visc::CPU_TARGET); + __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, 4, q2, g_color, g_cost, tail); - void *BlockingBFSNode = __hpvm__createNodeND(1, BlockingBFS, grid); + void *BlockingBFSNode = __visc__createNodeND(1, BlockingBFS, grid); // Bind edges - __hpvm__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 - __hpvm__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 - __hpvm__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 - __hpvm__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 - __hpvm__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes - __hpvm__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes - __hpvm__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges - __hpvm__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges - __hpvm__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color - __hpvm__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color - __hpvm__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost - __hpvm__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost - __hpvm__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail - __hpvm__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail - __hpvm__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes - __hpvm__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade - 
__hpvm__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k - __hpvm__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block + __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 + __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 + __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 + __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 + __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes + __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes + __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges + __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges + __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color + __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color + __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost + __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost + __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail + __visc__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail + __visc__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes + __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade + __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k + __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block } void BFS_Wrapper(int *q1, size_t bytesq1, // 0, 1 @@ -300,31 +300,31 @@ void BFS_Wrapper(int *q1, size_t bytesq1, // 0, 1 int no_of_nodes, int gray_shade, // 14, 15 int k, long block, long grid // 16 - 18 ) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, + __visc__hint(visc::CPU_TARGET); + __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail, 4, q2, g_color, g_cost, tail); - void *BlockingBFSNode = __hpvm__createNodeND(0, BFS_Root); + void *BlockingBFSNode = __visc__createNodeND(0, BFS_Root); // Bind edges - __hpvm__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 - __hpvm__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 - __hpvm__bindIn(BlockingBFSNode, 2, 2, 0); 
// Bind q2 - __hpvm__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 - __hpvm__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes - __hpvm__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes - __hpvm__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges - __hpvm__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges - __hpvm__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color - __hpvm__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color - __hpvm__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost - __hpvm__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost - __hpvm__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail - __hpvm__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail - __hpvm__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes - __hpvm__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade - __hpvm__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k - __hpvm__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block - __hpvm__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid + __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1 + __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1 + __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2 + __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 + __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes + __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes + __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges + __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges + __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color + __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color + __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost + __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost + __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail + __visc__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail + __visc__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes + __visc__bindIn(BlockingBFSNode, 15, 15, 0); 
// Bind gray_shade + __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k + __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block + __visc__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid } FILE *fp; @@ -415,7 +415,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // allocate mem for the result on host side @@ -433,15 +433,15 @@ int main(int argc, char **argv) { int *q2 = (int *)malloc(sizeof(int) * num_of_nodes); int *tail = (int *)malloc(sizeof(int)); - llvm_hpvm_track_mem(graph_nodes, bytes_graph_nodes); - llvm_hpvm_track_mem(graph_edges, bytes_graph_edges); - llvm_hpvm_track_mem(cost, bytes_cost); - llvm_hpvm_track_mem(color, bytes_cost); + llvm_visc_track_mem(graph_nodes, bytes_graph_nodes); + llvm_visc_track_mem(graph_edges, bytes_graph_edges); + llvm_visc_track_mem(cost, bytes_cost); + llvm_visc_track_mem(color, bytes_cost); // Allocating stuff on host side, but these can also be allocated in the graph - llvm_hpvm_track_mem(q1, bytes_cost); - llvm_hpvm_track_mem(q2, bytes_cost); + llvm_visc_track_mem(q1, bytes_cost); + llvm_visc_track_mem(q2, bytes_cost); // Scalar variable read/written by both graph and host. 
- llvm_hpvm_track_mem(tail, sizeof(int)); + llvm_visc_track_mem(tail, sizeof(int)); int num_of_blocks; int num_of_threads_per_block; @@ -466,9 +466,9 @@ int main(int argc, char **argv) { graph_edges, bytes_graph_edges, color, bytes_cost, cost, bytes_cost, tail, sizeof(int), num_of_nodes, gray, k, block, grid); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); do { - llvm_hpvm_request_mem(tail, sizeof(int)); + llvm_visc_request_mem(tail, sizeof(int)); num_t = *tail; // printf("tail for iteration %d = %d\n",k, num_t); *tail = 0; @@ -493,7 +493,7 @@ int main(int argc, char **argv) { } else { args->gray_shade = GRAY1; } - // void* bfsDFG = __hpvm__node(BFS_kernel, 2, 1, block, grid, 17, + // void* bfsDFG = __visc__node(BFS_kernel, 2, 1, block, grid, 17, // q1, bytes_cost, // q2, bytes_cost, // graph_nodes, bytes_graph_nodes, @@ -505,8 +505,8 @@ int main(int argc, char **argv) { // gray, // k, // 0); - void *bfsDFG = __hpvm__launch(0, BFS_Wrapper, (void *)args); - __hpvm__wait(bfsDFG); + void *bfsDFG = __visc__launch(0, BFS_Wrapper, (void *)args); + __visc__wait(bfsDFG); // Swap q1 and q2 // Swap q1 and q2 int *temp = args->q1; @@ -518,22 +518,22 @@ int main(int argc, char **argv) { // copy result from device to host pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(cost, bytes_cost); - llvm_hpvm_request_mem(color, bytes_cost); + llvm_visc_request_mem(cost, bytes_cost); + llvm_visc_request_mem(color, bytes_cost); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - llvm_hpvm_untrack_mem(graph_nodes); - llvm_hpvm_untrack_mem(graph_edges); - llvm_hpvm_untrack_mem(cost); - llvm_hpvm_untrack_mem(color); - llvm_hpvm_untrack_mem(q1); - llvm_hpvm_untrack_mem(q2); - llvm_hpvm_untrack_mem(tail); + llvm_visc_untrack_mem(graph_nodes); + llvm_visc_untrack_mem(graph_edges); + llvm_visc_untrack_mem(cost); + llvm_visc_untrack_mem(color); + llvm_visc_untrack_mem(q1); + llvm_visc_untrack_mem(q2); + 
llvm_visc_untrack_mem(tail); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); // Store the result into a file // FIXME: color is not even printed. Why are we reading it back?? diff --git a/hpvm/test/parboil/benchmarks/cutcp/Makefile b/hpvm/test/parboil/benchmarks/cutcp/Makefile index e8edc6e731..5e56793360 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/Makefile +++ b/hpvm/test/parboil/benchmarks/cutcp/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = cutcp -# Default compile hpvm +# Default compile visc ifeq ($(VERSION),) - VERSION = hpvm + VERSION = visc endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c index dfd7f1ff38..06f856c1a0 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c @@ -427,7 +427,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( /*CHECK_ERROR("clCreateCommandQueue")*/ /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ - pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); if (verbose) printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm.cl b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm.cl rename to hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc.cl diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.ll 
b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.ll similarity index 99% rename from hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.ll rename to hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.ll index 85a73b291f..7f614e66ff 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.ll +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel_hpvm.cl' +; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel_visc.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.spir b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.spir similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.spir rename to hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.spir diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll index 5a3c1fcd5d..370e3c0f8f 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel.cl' +; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel.cl' target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c index 076532b709..96ebeafbdf 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c +++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c @@ -423,7 +423,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( /*CHECK_ERROR("clCreateCommandQueue")*/ /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ - pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); if (verbose) printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) { diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/Makefile b/hpvm/test/parboil/benchmarks/cutcp/src/visc/Makefile similarity index 85% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/Makefile rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/Makefile index 43a175b947..d4c650a17e 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/Makefile +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=excl.ll cutcpu.ll cutoff6overlap.ll output.ll readatom.ll ocl.ll -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/atom.h rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutcpu.c rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff.h rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff6overlap.c rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/excl.c rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/kernel.cl b/hpvm/test/parboil/benchmarks/cutcp/src/visc/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/kernel.cl rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h 
similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/macros.h rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/main.cpp b/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp similarity index 82% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/main.cpp rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp index 0a36196619..caf99a5b37 100644 --- a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/main.cpp +++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp @@ -16,7 +16,7 @@ #include "cutoff.h" #include "macros.h" #include "output.h" -#include <hpvm.h> +#include <visc.h> #define ERRTOL 1e-4f @@ -54,11 +54,11 @@ extern float rsqrt(float x); void Allocation(long block) { // Memory shared between threadblocks size_t bytes_AtomBinCache = sizeof(float) * BIN_CACHE_MAXLEN * BIN_DEPTH * 4; - void *AtomBinCache = __hpvm__malloc(bytes_AtomBinCache); + void *AtomBinCache = __visc__malloc(bytes_AtomBinCache); size_t bytes_myBinIndex = sizeof(xyz); - void *myBinIndex = __hpvm__malloc(bytes_myBinIndex); - __hpvm__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex, + void *myBinIndex = __visc__malloc(bytes_myBinIndex); + __visc__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex, bytes_myBinIndex); } @@ -76,21 +76,21 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, // local memory args float *AtomBinCache, size_t bytes_AtomBinCache, int *myBinIndex, size_t bytes_myBinIndex) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + __visc__hint(visc::DEVICE); + __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - void *thisNode = __hpvm__getNode(); - void *parentNode = __hpvm__getParentNode(thisNode); - int lx = __hpvm__getNodeInstanceID_x(thisNode); - int ly = __hpvm__getNodeInstanceID_y(thisNode); - int lz = __hpvm__getNodeInstanceID_z(thisNode); - 
int gx = __hpvm__getNodeInstanceID_x(parentNode); - int gy = __hpvm__getNodeInstanceID_y(parentNode); - int dimx = __hpvm__getNumNodeInstances_x(thisNode); - int dimy = __hpvm__getNumNodeInstances_y(thisNode); - int gdimx = __hpvm__getNumNodeInstances_x(parentNode); - int gdimy = __hpvm__getNumNodeInstances_y(parentNode); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int ly = __visc__getNodeInstanceID_y(thisNode); + int lz = __visc__getNodeInstanceID_z(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int gy = __visc__getNodeInstanceID_y(parentNode); + int dimx = __visc__getNumNodeInstances_x(thisNode); + int dimy = __visc__getNumNodeInstances_y(thisNode); + int gdimx = __visc__getNumNodeInstances_x(parentNode); + int gdimy = __visc__getNumNodeInstances_y(parentNode); float *binZeroAddr = binBaseAddr + 4 * offset; @@ -168,7 +168,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, AtomBinCache[binIndex + tidmask + 16] = p_global[tidmask + 16]; } - __hpvm__barrier(); + __visc__barrier(); /* no warp divergence */ if (totalbins + BIN_CACHE_MAXLEN > *NbrListLen) { numbins = *NbrListLen - totalbins; @@ -196,7 +196,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, if (r2 < cutoff2) { float s = (1.f - r2 * inv_cutoff2); energy0 += aq * rsqrt(r2) * s * s; - // energy0 += aq * (1.0/__hpvm__sqrt(r2)) * s * s; + // energy0 += aq * (1.0/__visc__sqrt(r2)) * s * s; } #else energy0 += (r2 < cutoff2); @@ -208,7 +208,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, if (r2 < cutoff2) { float s = (1.f - r2 * inv_cutoff2); energy1 += aq * rsqrt(r2) * s * s; - // energy1 += aq * (1.0/__hpvm__sqrt(r2)) * s * s; + // energy1 += aq * (1.0/__visc__sqrt(r2)) * s * s; } #else energy1 += (r2 < cutoff2); @@ -219,7 +219,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, if (r2 < cutoff2) { float s = (1.f - r2 * 
inv_cutoff2); energy2 += aq * rsqrt(r2) * s * s; - // energy2 += aq * (1.0/__hpvm__sqrt(r2)) * s * s; + // energy2 += aq * (1.0/__visc__sqrt(r2)) * s * s; } #else energy2 += (r2 < cutoff2); @@ -237,7 +237,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr, #endif } /* end loop over atoms in bin */ } /* end loop over cached atom bins */ - __hpvm__barrier(); + __visc__barrier(); } /* end loop over neighbor list */ /* store into global memory */ @@ -260,38 +260,38 @@ void BlockingCUTCP(int binDim_x, int binDim_y, float4 *binBaseAddr, size_t bytes_NbrList, long blockx, long blocky, long blockz) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + __visc__hint(visc::CPU_TARGET); + __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - void *AllocationNode = __hpvm__createNodeND(0, Allocation); + void *AllocationNode = __visc__createNodeND(0, Allocation); void *CUTCPLeafNode = - __hpvm__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz); + __visc__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz); // Bind Inputs - __hpvm__bindIn(AllocationNode, 15, 0, 0); // Bind blockx - __hpvm__bindIn(CUTCPLeafNode, 0, 0, 0); // Bind binDim_x - __hpvm__bindIn(CUTCPLeafNode, 1, 1, 0); // Bind binDim_y - __hpvm__bindIn(CUTCPLeafNode, 2, 2, 0); // Bind binBaseAddr - __hpvm__bindIn(CUTCPLeafNode, 3, 3, 0); // Bind bytes_binBaseAddr - __hpvm__bindIn(CUTCPLeafNode, 4, 4, 0); // Bind offset - __hpvm__bindIn(CUTCPLeafNode, 5, 5, 0); // Bind h - __hpvm__bindIn(CUTCPLeafNode, 6, 6, 0); // Bind cutoff2 - __hpvm__bindIn(CUTCPLeafNode, 7, 7, 0); // Bind inv_cutoff2 - __hpvm__bindIn(CUTCPLeafNode, 8, 8, 0); // Bind regionZeroAddr - __hpvm__bindIn(CUTCPLeafNode, 9, 9, 0); // Bind bytes_regionZeroAddr - __hpvm__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex - __hpvm__bindIn(CUTCPLeafNode, 11, 11, 0); // Bind NbrListLen - __hpvm__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind 
bytes_NbrListLen - __hpvm__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList - __hpvm__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList + __visc__bindIn(AllocationNode, 15, 0, 0); // Bind blockx + __visc__bindIn(CUTCPLeafNode, 0, 0, 0); // Bind binDim_x + __visc__bindIn(CUTCPLeafNode, 1, 1, 0); // Bind binDim_y + __visc__bindIn(CUTCPLeafNode, 2, 2, 0); // Bind binBaseAddr + __visc__bindIn(CUTCPLeafNode, 3, 3, 0); // Bind bytes_binBaseAddr + __visc__bindIn(CUTCPLeafNode, 4, 4, 0); // Bind offset + __visc__bindIn(CUTCPLeafNode, 5, 5, 0); // Bind h + __visc__bindIn(CUTCPLeafNode, 6, 6, 0); // Bind cutoff2 + __visc__bindIn(CUTCPLeafNode, 7, 7, 0); // Bind inv_cutoff2 + __visc__bindIn(CUTCPLeafNode, 8, 8, 0); // Bind regionZeroAddr + __visc__bindIn(CUTCPLeafNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __visc__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex + __visc__bindIn(CUTCPLeafNode, 11, 11, 0); // Bind NbrListLen + __visc__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind bytes_NbrListLen + __visc__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList + __visc__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList // Create Edges - __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache - __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16, + __visc__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache + __visc__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16, 0); // Edge bytes_AtomBinCache - __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex - __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18, + __visc__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex + __visc__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18, 0); // Edge bytes_myBinIndex } @@ -370,32 +370,32 @@ void CUTCPRoot(int binDim_x, int binDim_y, float4 *binBaseAddr, int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList, size_t bytes_NbrList, long blockx, long blocky, long blockz, long gridx, long gridy, long gridz) 
{ - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + __visc__hint(visc::CPU_TARGET); + __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); void *BlockingCUTCPNode = - __hpvm__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz); + __visc__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz); // Bind Inputs - __hpvm__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x - __hpvm__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y - __hpvm__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr - __hpvm__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr - __hpvm__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset - __hpvm__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h - __hpvm__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 - __hpvm__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 - __hpvm__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr - __hpvm__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr - __hpvm__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex - __hpvm__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen - __hpvm__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen - __hpvm__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList - __hpvm__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList - __hpvm__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx - __hpvm__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky - __hpvm__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz + __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x + __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y + __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr + __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr + __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset + __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h + 
__visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 + __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 + __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex + __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen + __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen + __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList + __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList + __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx + __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky + __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz } void CUTCPWrapper(int binDim_x, int binDim_y, float4 *binBaseAddr, @@ -410,34 +410,34 @@ void CUTCPWrapper(int binDim_x, int binDim_y, float4 *binBaseAddr, int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList, size_t bytes_NbrList, long blockx, long blocky, long blockz, long gridx, long gridy, long gridz) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, + __visc__hint(visc::CPU_TARGET); + __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr); - void *BlockingCUTCPNode = __hpvm__createNodeND(0, CUTCPRoot); + void *BlockingCUTCPNode = __visc__createNodeND(0, CUTCPRoot); // Bind Inputs - __hpvm__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x - __hpvm__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y - __hpvm__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr - __hpvm__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr - __hpvm__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset - __hpvm__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h - __hpvm__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 - __hpvm__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind 
inv_cutoff2 - __hpvm__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr - __hpvm__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr - __hpvm__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex - __hpvm__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen - __hpvm__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen - __hpvm__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList - __hpvm__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList - __hpvm__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx - __hpvm__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky - __hpvm__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz - __hpvm__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx - __hpvm__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy - __hpvm__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz + __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x + __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y + __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr + __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr + __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset + __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h + __visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2 + __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2 + __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr + __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex + __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen + __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen + __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList + __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList + __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx + __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky + 
__visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz + __visc__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx + __visc__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy + __visc__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz } // ==================== Host Code ============================== @@ -546,7 +546,7 @@ int main(int argc, char *argv[]) { } pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -586,7 +586,7 @@ int main(int argc, char *argv[]) { pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); /* Print output */ // pb_SwitchToTimer(&timers, pb_TimerID_IO); @@ -873,11 +873,11 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf("\n"); } - // Track hpvm data - llvm_hpvm_track_mem(regionZeroAddr, bytes_regionZeroAddr); - llvm_hpvm_track_mem(binBaseAddr, bytes_binBaseAddr); - llvm_hpvm_track_mem(nbrlistlen, sizeof(int)); - llvm_hpvm_track_mem(nbrlist, bytes_nbrlist); + // Track visc data + llvm_visc_track_mem(regionZeroAddr, bytes_regionZeroAddr); + llvm_visc_track_mem(binBaseAddr, bytes_binBaseAddr); + llvm_visc_track_mem(nbrlistlen, sizeof(int)); + llvm_visc_track_mem(nbrlist, bytes_nbrlist); /* setup OpenCL kernel parameters */ blockDim[0] = 8; @@ -914,7 +914,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( gridDim[1], gridDim[2]); /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */ - pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); void *CUTCP_DFG; if (verbose) printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim); @@ -926,9 +926,9 @@ int gpu_compute_cutoff_potential_lattice6overlap( args->zRegionIndex = zRegionIndex; - CUTCP_DFG = __hpvm__launch(0, CUTCPWrapper, (void *)args); - __hpvm__wait(CUTCP_DFG); - // llvm_hpvm_request_mem(regionZeroAddr, lnall*sizeof(ener_t)); + CUTCP_DFG = __visc__launch(0, 
CUTCPWrapper, (void *)args); + __visc__wait(CUTCP_DFG); + // llvm_visc_request_mem(regionZeroAddr, lnall*sizeof(ener_t)); } /* @@ -941,14 +941,14 @@ int gpu_compute_cutoff_potential_lattice6overlap( printf("computing extra atoms on CPU\n"); } - pb_SwitchToTimer(timers, hpvm_TimerID_MISC); + pb_SwitchToTimer(timers, visc_TimerID_MISC); if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) { fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed " "for extra atoms\n"); return -1; } - pb_SwitchToTimer(timers, hpvm_TimerID_MISC); + pb_SwitchToTimer(timers, visc_TimerID_MISC); printf("\n"); } if (verbose) @@ -957,7 +957,7 @@ int gpu_compute_cutoff_potential_lattice6overlap( /* copy result regions from OpenCL device */ pb_SwitchToTimer(timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(regionZeroAddr, lnall * sizeof(ener_t)); + llvm_visc_request_mem(regionZeroAddr, lnall * sizeof(ener_t)); /* * transpose on CPU, updating, producing the final lattice diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.c rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.h rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.c rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.h rename to 
hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c similarity index 100% rename from hpvm/test/parboil/benchmarks/cutcp/src/hpvm/readatom.c rename to hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c diff --git a/hpvm/test/parboil/benchmarks/lbm/Makefile b/hpvm/test/parboil/benchmarks/lbm/Makefile index af7215ff70..4ebf6fc0af 100644 --- a/hpvm/test/parboil/benchmarks/lbm/Makefile +++ b/hpvm/test/parboil/benchmarks/lbm/Makefile @@ -5,9 +5,9 @@ ifeq ($(NUM_CORES),) NUM_CORES=8 endif -# Default compile hpvm +# Default compile visc ifeq ($(VERSION),) - VERSION = hpvm + VERSION = visc endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c index a55f0ce785..59aa8daf9a 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c @@ -79,7 +79,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); clFinish(prm.clCommandQueue); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -101,7 +101,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c index 64fe482b81..d93a919df3 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c @@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) { LBM_showGridStatistics(TEMP_srcGrid); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_initialize(&prm); @@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); @@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // Setup DEVICE datastructures OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); @@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c index 64fe482b81..d93a919df3 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c @@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) { LBM_showGridStatistics(TEMP_srcGrid); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_initialize(&prm); @@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); @@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // Setup DEVICE datastructures OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); @@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c index 54399ee119..18320b7394 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c +++ 
b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c @@ -79,7 +79,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); for (t = 1; t <= param.nTimeSteps; t++) { - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c index 6d682e98e6..5e43b75427 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c @@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) { LBM_showGridStatistics(TEMP_srcGrid); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_initialize(&prm); @@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int i = 0; i < 1; i++) { for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); @@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - 
pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // Setup DEVICE datastructures OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); @@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c index 9dc95e7d85..e66cb2c47c 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c +++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c @@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) { LBM_showGridStatistics(TEMP_srcGrid); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_initialize(&prm); @@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) { OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int i = 0; i < 4; i++) { for (t = 1; t <= param.nTimeSteps; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); @@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_initializeSpecialCellsForLDC(TEMP_srcGrid); 
LBM_initializeSpecialCellsForLDC(TEMP_dstGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // Setup DEVICE datastructures OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); @@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) { LBM_freeGrid((float **)&TEMP_srcGrid); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCL_LBM_freeGrid(OpenCL_srcGrid); OpenCL_LBM_freeGrid(OpenCL_dstGrid); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile b/hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile similarity index 85% rename from hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile rename to hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile index 5aa206f758..d1664ee988 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile +++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=lbm.ll -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O3 -DNUM_CORES=$(NUM_CORES) APP_CXXFLAGS=-ffast-math -O3 -DNUM_CORES=$(NUM_CORES) diff --git a/hpvm/test/parboil/benchmarks/lbm/src/hpvm/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/hpvm/layout_config.h rename to hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h diff --git a/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.cpp b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.cpp rename to hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp diff --git a/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.h rename to hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h diff --git a/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm_macros.h rename to hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h diff --git a/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp similarity index 86% rename from hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp rename to hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp index 32db8e9b2c..b51864366b 100644 --- a/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp +++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp @@ -8,11 +8,11 @@ /*############################################################################*/ -#include <hpvm.h> #include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <sys/stat.h> +#include <visc.h> #include "layout_config.h" #include "lbm.h" @@ -92,18 +92,18 @@ 
typedef struct __attribute__((__packed__)) { void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(2, srcG, dstG, 1, dstG); + __visc__hint(visc::DEVICE); + __visc__attributes(2, srcG, dstG, 1, dstG); - void *thisNode = __hpvm__getNode(); - void *parentNode = __hpvm__getParentNode(thisNode); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); srcG += MARGIN; dstG += MARGIN; - int lx = __hpvm__getNodeInstanceID_x(thisNode); - int gx = __hpvm__getNodeInstanceID_x(parentNode); - int gy = __hpvm__getNodeInstanceID_y(parentNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int gy = __visc__getNodeInstanceID_y(parentNode); // Using some predefined macros here. Consider this the declaration // and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z @@ -274,40 +274,40 @@ void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG, void lbmLvl1(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG, size_t dim_X1) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(2, srcG, dstG, 1, dstG); + __visc__hint(visc::DEVICE); + __visc__attributes(2, srcG, dstG, 1, dstG); void *lbm_node = - __hpvm__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1); - __hpvm__bindIn(lbm_node, 0, 0, 0); - __hpvm__bindIn(lbm_node, 1, 1, 0); - __hpvm__bindIn(lbm_node, 2, 2, 0); - __hpvm__bindIn(lbm_node, 3, 3, 0); + __visc__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1); + __visc__bindIn(lbm_node, 0, 0, 0); + __visc__bindIn(lbm_node, 1, 1, 0); + __visc__bindIn(lbm_node, 2, 2, 0); + __visc__bindIn(lbm_node, 3, 3, 0); } void lbmLvl2(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, srcG, dstG, 1, dstG); - void *lbm_node = 
__hpvm__createNodeND(2, lbmLvl1, dim_X2, dim_Y2); - __hpvm__bindIn(lbm_node, 0, 0, 0); - __hpvm__bindIn(lbm_node, 1, 1, 0); - __hpvm__bindIn(lbm_node, 2, 2, 0); - __hpvm__bindIn(lbm_node, 3, 3, 0); - __hpvm__bindIn(lbm_node, 4, 4, 0); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, srcG, dstG, 1, dstG); + void *lbm_node = __visc__createNodeND(2, lbmLvl1, dim_X2, dim_Y2); + __visc__bindIn(lbm_node, 0, 0, 0); + __visc__bindIn(lbm_node, 1, 1, 0); + __visc__bindIn(lbm_node, 2, 2, 0); + __visc__bindIn(lbm_node, 3, 3, 0); + __visc__bindIn(lbm_node, 4, 4, 0); } void lbmLvl3(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, srcG, dstG, 1, dstG); - void *lbm_node = __hpvm__createNodeND(0, lbmLvl2); - __hpvm__bindIn(lbm_node, 0, 0, 0); - __hpvm__bindIn(lbm_node, 1, 1, 0); - __hpvm__bindIn(lbm_node, 2, 2, 0); - __hpvm__bindIn(lbm_node, 3, 3, 0); - __hpvm__bindIn(lbm_node, 4, 4, 0); - __hpvm__bindIn(lbm_node, 5, 5, 0); - __hpvm__bindIn(lbm_node, 6, 6, 0); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, srcG, dstG, 1, dstG); + void *lbm_node = __visc__createNodeND(0, lbmLvl2); + __visc__bindIn(lbm_node, 0, 0, 0); + __visc__bindIn(lbm_node, 1, 1, 0); + __visc__bindIn(lbm_node, 2, 2, 0); + __visc__bindIn(lbm_node, 3, 3, 0); + __visc__bindIn(lbm_node, 4, 4, 0); + __visc__bindIn(lbm_node, 5, 5, 0); + __visc__bindIn(lbm_node, 6, 6, 0); } __attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src, @@ -321,9 +321,9 @@ __attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src, RootIn root_in_local = {src - MARGIN, size, dst - MARGIN, size, SIZE_X, SIZE_Y, SIZE_Z}; *(RootIn *)root_in = root_in_local; - void *lbmDFG = __hpvm__launch(0, lbmLvl3, root_in); + void *lbmDFG = __visc__launch(0, lbmLvl3, root_in); - __hpvm__wait(lbmDFG); + __visc__wait(lbmDFG); } void MAIN_initialize(const MAIN_Param *param) { @@ -379,12 +379,12 @@ int 
main(int nArgs, char *arg[]) { MAIN_initialize(&param); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); - llvm_hpvm_track_mem(srcGrid - MARGIN, size); - llvm_hpvm_track_mem(dstGrid - MARGIN, size); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(srcGrid - MARGIN, size); + llvm_visc_track_mem(dstGrid - MARGIN, size); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); for (t = 1; t <= param.nTimeSteps; t++) { @@ -404,15 +404,15 @@ int main(int nArgs, char *arg[]) { } pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(srcGrid - MARGIN, size); + llvm_visc_request_mem(srcGrid - MARGIN, size); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(srcGrid - MARGIN); - llvm_hpvm_untrack_mem(dstGrid - MARGIN); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(srcGrid - MARGIN); + llvm_visc_untrack_mem(dstGrid - MARGIN); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ MAIN_finalize(&param); diff --git a/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h similarity index 100% rename from hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.h rename to hpvm/test/parboil/benchmarks/lbm/src/visc/main.h diff --git a/hpvm/test/parboil/benchmarks/sgemm/Makefile b/hpvm/test/parboil/benchmarks/sgemm/Makefile index 4757432d22..ace9ded22b 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = sgemm -# Default compile hpvm +# Default compile visc ifeq ($(VERSION),) - VERSION = hpvm_sh + VERSION = visc_sh endif # Default use small test case diff --git 
a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile deleted file mode 100644 index 2234bf54e1..0000000000 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# (c) 2010 The Board of Trustees of the University of Illinois. - -LANGUAGE=hpvm -SRCDIR_OBJS=io.ll #compute_gold.o -HPVM_OBJS=main.hpvm.ll -APP_CUDALDFLAGS=-lm -lstdc++ -APP_CFLAGS=-ffast-math -O3 -APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc index e8d1c69ec9..5489f6a55c 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc @@ -109,7 +109,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -212,7 +212,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc index 4285a52a01..105baf590d 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc @@ -120,7 +120,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = 
clGetPlatformIDs(1, &clPlatform, NULL); @@ -208,7 +208,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc index 7edbf05a4b..f72c18c293 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc index cccec04beb..744ee40966 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc @@ -110,7 +110,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -254,7 +254,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - 
pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc index 36e7b93571..45ed8e942a 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc @@ -83,7 +83,7 @@ void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int i = 0; i < 1; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -247,7 +247,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc index 2cc311d1ef..d8275be777 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc @@ -83,7 +83,7 @@ void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void 
*)&beta); CHECK_ERROR("clSetKernelArg") - // pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + // pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); // for(int i=0; i<15; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -212,7 +212,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc index 678b4d8131..b4e561ded6 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc @@ -83,7 +83,7 @@ void basicSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); // for(int i=0; i<15; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -247,7 +247,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + 
pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); clStatus = clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll index 9b4cf7702d..ca9fcca060 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel.cl' +; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc index 79fecfb84b..8de437a4f8 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc @@ -195,7 +195,7 @@ int main(int argc, char *argv[]) { &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); // Use standard sgemm interface regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll index 2f72a6ceba..908c7104bb 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll @@ -1,4 +1,4 @@ -; 
ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test.cl' +; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc index 22f66ca0a8..06f5da5c31 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc @@ -190,7 +190,7 @@ int main(int argc, char *argv[]) { &matC.front(), 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); // Use standard sgemm interface regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB, diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc index 10e0445453..b22ebd8804 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc @@ -119,7 +119,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -214,7 +214,7 @@ int main(int argc, char *argv[]) { pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); 
clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc index 59da9562a1..a7cb9793e8 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc @@ -79,7 +79,7 @@ void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int i = 0; i < 4; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, 0, NULL, NULL); @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -211,7 +211,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc index 5069484492..713fd9e889 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc @@ -79,7 +79,7 @@ void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha, clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta); CHECK_ERROR("clSetKernelArg") - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, 
visc_TimerID_COMPUTATION); for (int i = 0; i < 200; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db, 0, NULL, NULL); @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -211,7 +211,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc index bad8253870..7d5d75c533 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc @@ -120,7 +120,7 @@ int main(int argc, char *argv[]) { pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -208,7 +208,7 @@ int main(int argc, char *argv[]) { clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0, NULL, NULL); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseMemObject(dA); diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile similarity index 83% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile index 6e63f83841..d1f6c96d0c 100644 --- 
a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O1 APP_CXXFLAGS=-ffast-math -O1 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/visc/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm/kernel.cl rename to hpvm/test/parboil/benchmarks/sgemm/src/visc/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc similarity index 69% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc index de36705707..627f5a8241 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc @@ -10,7 +10,6 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -20,6 +19,7 @@ #include <string.h> #include <sys/time.h> #include <vector> +#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -57,17 +57,17 @@ typedef struct __attribute__((__packed__)) { void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha, float beta) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(3, A, B, C, 1, C); - - void *thisNode = 
__hpvm__getNode(); - void *parentNode = __hpvm__getParentNode(thisNode); - int lx = __hpvm__getNodeInstanceID_x(thisNode); - int ly = __hpvm__getNodeInstanceID_y(thisNode); - int gx = __hpvm__getNodeInstanceID_x(parentNode); - int gy = __hpvm__getNodeInstanceID_y(parentNode); - int gridx = __hpvm__getNumNodeInstances_x(thisNode); - int gridy = __hpvm__getNumNodeInstances_y(thisNode); + __visc__hint(visc::DEVICE); + __visc__attributes(3, A, B, C, 1, C); + + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int ly = __visc__getNodeInstanceID_y(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int gy = __visc__getNodeInstanceID_y(parentNode); + int gridx = __visc__getNumNodeInstances_x(thisNode); + int gridy = __visc__getNumNodeInstances_y(thisNode); int m = gx * gridx + lx; int n = gy * gridy + ly; @@ -83,46 +83,46 @@ void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, void basicSgemmLvl1(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha, float beta, size_t dim_X1, size_t dim_Y1) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(3, A, B, C, 1, C); + __visc__hint(visc::DEVICE); + __visc__attributes(3, A, B, C, 1, C); void *sgemm_node = - __hpvm__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1); - __hpvm__bindIn(sgemm_node, 0, 0, 0); - __hpvm__bindIn(sgemm_node, 1, 1, 0); - __hpvm__bindIn(sgemm_node, 2, 2, 0); - __hpvm__bindIn(sgemm_node, 3, 3, 0); - __hpvm__bindIn(sgemm_node, 4, 4, 0); - __hpvm__bindIn(sgemm_node, 5, 5, 0); - __hpvm__bindIn(sgemm_node, 6, 6, 0); - __hpvm__bindIn(sgemm_node, 7, 7, 0); - __hpvm__bindIn(sgemm_node, 8, 8, 0); - __hpvm__bindIn(sgemm_node, 9, 9, 0); - __hpvm__bindIn(sgemm_node, 10, 10, 0); - __hpvm__bindIn(sgemm_node, 11, 11, 0); + __visc__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1); + __visc__bindIn(sgemm_node, 
0, 0, 0); + __visc__bindIn(sgemm_node, 1, 1, 0); + __visc__bindIn(sgemm_node, 2, 2, 0); + __visc__bindIn(sgemm_node, 3, 3, 0); + __visc__bindIn(sgemm_node, 4, 4, 0); + __visc__bindIn(sgemm_node, 5, 5, 0); + __visc__bindIn(sgemm_node, 6, 6, 0); + __visc__bindIn(sgemm_node, 7, 7, 0); + __visc__bindIn(sgemm_node, 8, 8, 0); + __visc__bindIn(sgemm_node, 9, 9, 0); + __visc__bindIn(sgemm_node, 10, 10, 0); + __visc__bindIn(sgemm_node, 11, 11, 0); } void basicSgemmLvl2(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha, float beta, size_t dim_X1, size_t dim_Y1, size_t dim_X2, size_t dim_Y2) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(3, A, B, C, 1, C); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); void *sgemm_node = - __hpvm__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2); - __hpvm__bindIn(sgemm_node, 0, 0, 0); - __hpvm__bindIn(sgemm_node, 1, 1, 0); - __hpvm__bindIn(sgemm_node, 2, 2, 0); - __hpvm__bindIn(sgemm_node, 3, 3, 0); - __hpvm__bindIn(sgemm_node, 4, 4, 0); - __hpvm__bindIn(sgemm_node, 5, 5, 0); - __hpvm__bindIn(sgemm_node, 6, 6, 0); - __hpvm__bindIn(sgemm_node, 7, 7, 0); - __hpvm__bindIn(sgemm_node, 8, 8, 0); - __hpvm__bindIn(sgemm_node, 9, 9, 0); - __hpvm__bindIn(sgemm_node, 10, 10, 0); - __hpvm__bindIn(sgemm_node, 11, 11, 0); - __hpvm__bindIn(sgemm_node, 12, 12, 0); - __hpvm__bindIn(sgemm_node, 13, 13, 0); + __visc__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2); + __visc__bindIn(sgemm_node, 0, 0, 0); + __visc__bindIn(sgemm_node, 1, 1, 0); + __visc__bindIn(sgemm_node, 2, 2, 0); + __visc__bindIn(sgemm_node, 3, 3, 0); + __visc__bindIn(sgemm_node, 4, 4, 0); + __visc__bindIn(sgemm_node, 5, 5, 0); + __visc__bindIn(sgemm_node, 6, 6, 0); + __visc__bindIn(sgemm_node, 7, 7, 0); + __visc__bindIn(sgemm_node, 8, 8, 0); + __visc__bindIn(sgemm_node, 9, 9, 0); + __visc__bindIn(sgemm_node, 10, 10, 0); + __visc__bindIn(sgemm_node, 11, 
11, 0); + __visc__bindIn(sgemm_node, 12, 12, 0); + __visc__bindIn(sgemm_node, 13, 13, 0); } // A wrapper level used in codegen for some backends @@ -130,25 +130,25 @@ void basicSgemmLvl3(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B, int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha, float beta, size_t dim_X1, size_t dim_Y1, size_t dim_X2, size_t dim_Y2) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(3, A, B, C, 1, C); - void *sgemm_node = __hpvm__createNodeND(0, basicSgemmLvl2); - __hpvm__bindIn(sgemm_node, 0, 0, 0); - __hpvm__bindIn(sgemm_node, 1, 1, 0); - __hpvm__bindIn(sgemm_node, 2, 2, 0); - __hpvm__bindIn(sgemm_node, 3, 3, 0); - __hpvm__bindIn(sgemm_node, 4, 4, 0); - __hpvm__bindIn(sgemm_node, 5, 5, 0); - __hpvm__bindIn(sgemm_node, 6, 6, 0); - __hpvm__bindIn(sgemm_node, 7, 7, 0); - __hpvm__bindIn(sgemm_node, 8, 8, 0); - __hpvm__bindIn(sgemm_node, 9, 9, 0); - __hpvm__bindIn(sgemm_node, 10, 10, 0); - __hpvm__bindIn(sgemm_node, 11, 11, 0); - __hpvm__bindIn(sgemm_node, 12, 12, 0); - __hpvm__bindIn(sgemm_node, 13, 13, 0); - __hpvm__bindIn(sgemm_node, 14, 14, 0); - __hpvm__bindIn(sgemm_node, 15, 15, 0); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void *sgemm_node = __visc__createNodeND(0, basicSgemmLvl2); + __visc__bindIn(sgemm_node, 0, 0, 0); + __visc__bindIn(sgemm_node, 1, 1, 0); + __visc__bindIn(sgemm_node, 2, 2, 0); + __visc__bindIn(sgemm_node, 3, 3, 0); + __visc__bindIn(sgemm_node, 4, 4, 0); + __visc__bindIn(sgemm_node, 5, 5, 0); + __visc__bindIn(sgemm_node, 6, 6, 0); + __visc__bindIn(sgemm_node, 7, 7, 0); + __visc__bindIn(sgemm_node, 8, 8, 0); + __visc__bindIn(sgemm_node, 9, 9, 0); + __visc__bindIn(sgemm_node, 10, 10, 0); + __visc__bindIn(sgemm_node, 11, 11, 0); + __visc__bindIn(sgemm_node, 12, 12, 0); + __visc__bindIn(sgemm_node, 13, 13, 0); + __visc__bindIn(sgemm_node, 14, 14, 0); + __visc__bindIn(sgemm_node, 15, 15, 0); } __attribute__((noinline)) void basicSgemm(char transa, char transb, 
int m, @@ -194,8 +194,8 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, dg[0] / db[0], dg[1] / db[1]}; *(RootIn *)root_in = root_in_local; - void *sgemmDFG = __hpvm__launch(0, basicSgemmLvl3, root_in); - __hpvm__wait(sgemmDFG); + void *sgemmDFG = __visc__launch(0, basicSgemmLvl3, root_in); + __visc__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -233,7 +233,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory @@ -246,9 +246,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_hpvm_track_mem(&matA.front(), A_sz); - llvm_hpvm_track_mem(&matBT.front(), B_sz); - llvm_hpvm_track_mem(&matC.front(), C_sz); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -263,16 +263,16 @@ int main(int argc, char *argv[]) { matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(&matC.front(), C_sz); + llvm_visc_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(&matA.front()); - llvm_hpvm_untrack_mem(&matBT.front()); - llvm_hpvm_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); if (params->outFile) { diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/Makefile similarity index 83% rename from 
hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/Makefile index 2234bf54e1..f74ee8921a 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc similarity index 90% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc index a1db2e56a5..62f9285e8a 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc @@ -10,7 +10,6 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -20,6 +19,7 @@ #include <string.h> #include <sys/time.h> #include <vector> +#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -42,8 +42,8 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int 
lda, float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __hpvm__hint(hpvm::GPU_TARGET); - __hpvm__attributes(3, A, B, C, 1, C); + __visc__hint(visc::GPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); float c[TILE_N]; for (int i = 0; i < TILE_N; i++) @@ -96,10 +96,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, // unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N}; unsigned dg[2] = {m * db[0] / TILE_M, n * db[1] / TILE_N}; - unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __hpvm__wait(sgemmDFG); + __visc__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory @@ -142,9 +142,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_hpvm_track_mem(&matA.front(), A_sz); - llvm_hpvm_track_mem(&matBT.front(), B_sz); - llvm_hpvm_track_mem(&matC.front(), C_sz); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -159,16 +159,16 @@ int main(int argc, char *argv[]) { matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(&matC.front(), C_sz); + llvm_visc_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(&matA.front()); - llvm_hpvm_untrack_mem(&matBT.front()); - llvm_hpvm_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, 
visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); if (params->outFile) { diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/Makefile similarity index 86% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/Makefile index f81bac4707..a0fd0e9575 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc similarity index 65% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc index de0d473ed6..05d143b588 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc @@ -10,7 +10,6 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -20,6 +19,7 @@ #include <string.h> #include <sys/time.h> #include <vector> +#include <visc.h> // I/O routines extern bool 
readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -82,29 +82,29 @@ void packData(RootIn *args, float *A, size_t bytesA, int lda, float *B, } void Allocation(long block_x, long block_y) { - void *shB = __hpvm__malloc(block_x * block_y * sizeof(float)); - __hpvm__return(2, shB, block_x * block_y * sizeof(float)); + void *shB = __visc__malloc(block_x * block_y * sizeof(float)); + __visc__return(2, shB, block_x * block_y * sizeof(float)); } void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb, float *C, size_t bytesC, int ldc, int k, float alpha, float beta, float *shB, size_t bytesshB) { - __hpvm__hint(hpvm::DEVICE); - //__hpvm__hint(hpvm::SPIR_TARGET); - //__hpvm__hint(hpvm::GPU_TARGET); + __visc__hint(visc::DEVICE); + //__visc__hint(visc::SPIR_TARGET); + //__visc__hint(visc::GPU_TARGET); - __hpvm__attributes(3, A, B, C, 1, C); + __visc__attributes(3, A, B, C, 1, C); - void *thisNode = __hpvm__getNode(); - void *parentNode = __hpvm__getParentNode(thisNode); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); - long lx = __hpvm__getNodeInstanceID_x(thisNode); - long ly = __hpvm__getNodeInstanceID_y(thisNode); + long lx = __visc__getNodeInstanceID_x(thisNode); + long ly = __visc__getNodeInstanceID_y(thisNode); - long gx = __hpvm__getNodeInstanceID_x(parentNode); - long gy = __hpvm__getNodeInstanceID_y(parentNode); + long gx = __visc__getNodeInstanceID_x(parentNode); + long gy = __visc__getNodeInstanceID_y(parentNode); - long dimx = __hpvm__getNumNodeInstances_x(thisNode); + long dimx = __visc__getNumNodeInstances_x(thisNode); float c[TILE_N]; for (int i = 0; i < TILE_N; i++) @@ -119,7 +119,7 @@ void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB, // shB[ly][lx] = B[n+(i+ly)*ldb]; shB[ly * dimx + lx] = B[n + (i + ly) * ldb]; - __hpvm__barrier(); + __visc__barrier(); for (int j = 0; j < TILE_TB_HEIGHT; j++) { a = A[m + (i + j) * lda]; for (int kk = 0; kk < TILE_N; 
kk++) { @@ -127,7 +127,7 @@ void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB, c[kk] += a * shB[j * dimx + kk]; } } - __hpvm__barrier(); + __visc__barrier(); } int t = ldc * gy * TILE_N + m; @@ -140,31 +140,31 @@ void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB, void SgemmTB(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb, float *C, size_t bytesC, int ldc, int k, float alpha, float beta, long block_x, long block_y) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(3, A, B, C, 1, C); - void *AllocationNode = __hpvm__createNodeND(0, Allocation); - void *SgemmLeafNode = __hpvm__createNodeND(2, SgemmLeaf, block_x, block_y); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void *AllocationNode = __visc__createNodeND(0, Allocation); + void *SgemmLeafNode = __visc__createNodeND(2, SgemmLeaf, block_x, block_y); // Bind edges - __hpvm__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A - __hpvm__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA - __hpvm__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda - __hpvm__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B - __hpvm__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB - __hpvm__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb - __hpvm__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C - __hpvm__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC - __hpvm__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc - __hpvm__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k - __hpvm__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha - __hpvm__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta - - __hpvm__bindIn(AllocationNode, 12, 0, 0); // Bind block_x - __hpvm__bindIn(AllocationNode, 13, 1, 0); // Bind block_y + __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A + __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA + __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda + __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B + __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB + 
__visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb + __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C + __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC + __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc + __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k + __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha + __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta + + __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x + __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y // Create Edges between AllocationNode and BFSLeafNodeNode - __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B - __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 1, 13, + __visc__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B + __visc__edge(AllocationNode, SgemmLeafNode, 1, 1, 13, 0); // Edge bytes_local_B } @@ -175,25 +175,25 @@ void SgemmRoot(float *A, size_t bytesA, int lda, // 0-2 int k, float alpha, float beta, // 9-11 long block_x, long block_y, long grid_x, long grid_y // 12-15 ) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(3, A, B, C, 1, C); - void *SgemmTBNode = __hpvm__createNodeND(2, SgemmTB, grid_x, grid_y); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void *SgemmTBNode = __visc__createNodeND(2, SgemmTB, grid_x, grid_y); // Bind edges - __hpvm__bindIn(SgemmTBNode, 0, 0, 0); // Bind A - __hpvm__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA - __hpvm__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda - __hpvm__bindIn(SgemmTBNode, 3, 3, 0); // Bind B - __hpvm__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB - __hpvm__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb - __hpvm__bindIn(SgemmTBNode, 6, 6, 0); // Bind C - __hpvm__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC - __hpvm__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc - __hpvm__bindIn(SgemmTBNode, 9, 9, 0); // Bind k - __hpvm__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha - __hpvm__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta - 
__hpvm__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x - __hpvm__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y + __visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A + __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA + __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda + __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B + __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB + __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb + __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C + __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC + __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc + __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k + __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha + __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta + __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x + __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y } void SgemmWrapper(float *A, size_t bytesA, int lda, // 0-2 @@ -202,27 +202,27 @@ void SgemmWrapper(float *A, size_t bytesA, int lda, // 0-2 int k, float alpha, float beta, // 9-11 long block_x, long block_y, long grid_x, long grid_y // 12-15 ) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(3, A, B, C, 1, C); - void *SgemmRootNode = __hpvm__createNodeND(0, SgemmRoot); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void *SgemmRootNode = __visc__createNodeND(0, SgemmRoot); // Bind edges - __hpvm__bindIn(SgemmRootNode, 0, 0, 0); // Bind A - __hpvm__bindIn(SgemmRootNode, 1, 1, 0); // Bind bytesA - __hpvm__bindIn(SgemmRootNode, 2, 2, 0); // Bind lda - __hpvm__bindIn(SgemmRootNode, 3, 3, 0); // Bind B - __hpvm__bindIn(SgemmRootNode, 4, 4, 0); // Bind bytesB - __hpvm__bindIn(SgemmRootNode, 5, 5, 0); // Bind ldb - __hpvm__bindIn(SgemmRootNode, 6, 6, 0); // Bind C - __hpvm__bindIn(SgemmRootNode, 7, 7, 0); // Bind bytesC - __hpvm__bindIn(SgemmRootNode, 8, 8, 0); // Bind ldc - __hpvm__bindIn(SgemmRootNode, 9, 9, 0); // Bind k - __hpvm__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha - 
__hpvm__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta - __hpvm__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x - __hpvm__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y - __hpvm__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x - __hpvm__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y + __visc__bindIn(SgemmRootNode, 0, 0, 0); // Bind A + __visc__bindIn(SgemmRootNode, 1, 1, 0); // Bind bytesA + __visc__bindIn(SgemmRootNode, 2, 2, 0); // Bind lda + __visc__bindIn(SgemmRootNode, 3, 3, 0); // Bind B + __visc__bindIn(SgemmRootNode, 4, 4, 0); // Bind bytesB + __visc__bindIn(SgemmRootNode, 5, 5, 0); // Bind ldb + __visc__bindIn(SgemmRootNode, 6, 6, 0); // Bind C + __visc__bindIn(SgemmRootNode, 7, 7, 0); // Bind bytesC + __visc__bindIn(SgemmRootNode, 8, 8, 0); // Bind ldc + __visc__bindIn(SgemmRootNode, 9, 9, 0); // Bind k + __visc__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha + __visc__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta + __visc__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x + __visc__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y + __visc__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x + __visc__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y } // Creates root node for sgemm @@ -262,10 +262,10 @@ __attribute__((noinline)) void basicSgemm(struct pb_TimerSet *timers, packData(args, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, block_x, block_y, grid_x, grid_y); - pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION); - void *sgemmDFG = __hpvm__launch(0, SgemmWrapper, (void *)args); + pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION); + void *sgemmDFG = __visc__launch(0, SgemmWrapper, (void *)args); - __hpvm__wait(sgemmDFG); + __visc__wait(sgemmDFG); pb_SwitchToTimer(timers, pb_TimerID_COMPUTE); } @@ -296,7 +296,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); pb_SwitchToTimer(&timers, 
pb_TimerID_COMPUTE); // copy A to device memory @@ -309,9 +309,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_hpvm_track_mem(&matA.front(), A_sz); - llvm_hpvm_track_mem(&matBT.front(), B_sz); - llvm_hpvm_track_mem(&matC.front(), C_sz); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -324,16 +324,16 @@ int main(int argc, char *argv[]) { C_sz, matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(&matC.front(), C_sz); + llvm_visc_request_mem(&matC.front(), C_sz); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - llvm_hpvm_untrack_mem(&matA.front()); - llvm_hpvm_untrack_mem(&matBT.front()); - llvm_hpvm_untrack_mem(&matC.front()); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); if (params->outFile) { /* Write C to file */ diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/Makefile similarity index 83% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/Makefile index 2234bf54e1..f74ee8921a 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc similarity index 90% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc index be39d713d5..0dfcdfb835 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc @@ -10,7 +10,6 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -20,6 +19,7 @@ #include <string.h> #include <sys/time.h> #include <vector> +#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -40,7 +40,7 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __hpvm__attributes(3, A, B, C, 1, C); + __visc__attributes(3, A, B, C, 1, C); float c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0.0f; int m = 4 * get_global_id(0); @@ -90,10 +90,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, unsigned db[2] = {TILE_SZ / 4, TILE_SZ}; unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; - unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, 
alpha, beta, 0); - __hpvm__wait(sgemmDFG); + __visc__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -107,7 +107,7 @@ int main(int argc, char *argv[]) { std::vector<float> matA, matBT; pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); /* Read command line. Expect 3 inputs: A, B and B^T in column-major layout*/ @@ -138,9 +138,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_hpvm_track_mem(&matA.front(), A_sz); - llvm_hpvm_track_mem(&matBT.front(), B_sz); - llvm_hpvm_track_mem(&matC.front(), C_sz); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -158,22 +158,22 @@ int main(int argc, char *argv[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); /* Write C to file */ - llvm_hpvm_request_mem(&matC.front(), C_sz); + llvm_visc_request_mem(&matC.front(), C_sz); pb_SwitchToTimer(&timers, pb_TimerID_IO); writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(&matA.front()); - llvm_hpvm_untrack_mem(&matBT.front()); - llvm_hpvm_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 << std::endl; pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); pb_FreeParameters(params); return 0; diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile similarity index 83% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile index 2234bf54e1..f74ee8921a 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc similarity index 90% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc index be39d713d5..0dfcdfb835 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc @@ -10,7 +10,6 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -20,6 +19,7 @@ #include <string.h> #include <sys/time.h> #include <vector> +#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int 
&nr_col, @@ -40,7 +40,7 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __hpvm__attributes(3, A, B, C, 1, C); + __visc__attributes(3, A, B, C, 1, C); float c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0.0f; int m = 4 * get_global_id(0); @@ -90,10 +90,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, unsigned db[2] = {TILE_SZ / 4, TILE_SZ}; unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; - unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __hpvm__wait(sgemmDFG); + __visc__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -107,7 +107,7 @@ int main(int argc, char *argv[]) { std::vector<float> matA, matBT; pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); /* Read command line. 
Expect 3 inputs: A, B and B^T in column-major layout*/ @@ -138,9 +138,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_hpvm_track_mem(&matA.front(), A_sz); - llvm_hpvm_track_mem(&matBT.front(), B_sz); - llvm_hpvm_track_mem(&matC.front(), C_sz); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -158,22 +158,22 @@ int main(int argc, char *argv[]) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); /* Write C to file */ - llvm_hpvm_request_mem(&matC.front(), C_sz); + llvm_visc_request_mem(&matC.front(), C_sz); pb_SwitchToTimer(&timers, pb_TimerID_IO); writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); } - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(&matA.front()); - llvm_hpvm_untrack_mem(&matBT.front()); - llvm_hpvm_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); std::cout << "GFLOPs = " << 2. 
* matArow * matBcol * matAcol / GPUtime / 1e9 << std::endl; pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); pb_FreeParameters(params); return 0; diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll new file mode 100644 index 0000000000..ea1e7b3b7c --- /dev/null +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll @@ -0,0 +1,894 @@ +; ModuleID = 'build/visc_tc_vec_default/main.ll' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"class.std::ios_base::Init" = type { i8 } +%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" } +%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* } +%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" } +%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 } +%"struct.std::ios_base::_Words" = type { i8*, i64 } +%"class.std::locale" = type { %"class.std::locale::_Impl"* } +%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** } +%"class.std::locale::facet" = type { i32 (...)**, i32 } +%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" } +%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 } +%struct.__locale_struct 
= type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] } +%struct.__locale_data = type opaque +%"class.std::num_put" = type { %"class.std::locale::facet" } +%"class.std::num_get" = type { %"class.std::locale::facet" } +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%rtype = type {} +%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }> +%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] } +%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } +%struct.pb_Timer = type { i32, i64, i64 } +%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* } +%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" } +%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* } +%struct.pb_Parameters = type { i8*, i8** } + +@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 +@__dso_handle = external global i8 +@_ZSt4cerr = external global %"class.std::basic_ostream" +@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1 +@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1 +@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. 
m should be multiple of \00", align 1 +@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1 +@stderr = external global %struct._IO_FILE* +@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1 +@_ZSt4cout = external global %"class.std::basic_ostream" +@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1 +@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] +@viscTimerSet_GenVISC = common global i8* null +@0 = internal constant [14 x i8] c"GenVISC_Timer\00" + +declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0 + +declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0 + +; Function Attrs: nounwind +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode() + %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) + %1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %2 = mul i32 %0, %1 + %3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %4 = add i32 %2, %3 + %mul = shl nsw i32 %4, 2 + %5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) + %6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %7 = mul i32 %5, %6 + %8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %9 = add i32 %7, %8 + %cmp147 = icmp sgt i32 %k, 0 + %add3144 = or i32 %mul, 1 + %add8145 = or i32 %mul, 
2 + %add13146 = or i32 %mul, 3 + + %mul.tmp1 = insertelement <4 x i32> < i32 0, i32 0, i32 0, i32 0 >, i32 %mul, i32 0 + %mul.tmp2 = insertelement <4 x i32> %mul.tmp1, i32 %add3144, i32 1 + %mul.tmp3 = insertelement <4 x i32> %mul.tmp2, i32 %add8145, i32 2 + %mul.vector = insertelement <4 x i32> %mul.tmp2, i32 %add13146, i32 3 + + %lda.tmp = insertelement <1 x i32> < i32 0 >, i32 %lda, i32 0 + %lda.vector = shufflevector <1 x i32> %lda.tmp, <1 x i32> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 > + + br i1 %cmp147, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + +; %c0.0152 = phi float [ %add23, %for.body ], [ 0.000000e+00, %entry ] +; %c1.0151 = phi float [ %add25, %for.body ], [ 0.000000e+00, %entry ] +; %c2.0150 = phi float [ %add27, %for.body ], [ 0.000000e+00, %entry ] +; %c3.0149 = phi float [ %add29, %for.body ], [ 0.000000e+00, %entry ] + %c.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ] + + %10 = trunc i64 %indvars.iv to i32 + %mul2 = mul nsw i32 %10, %lda + +; %add = add nsw i32 %mul2, %mul +; %idxprom = sext i32 %add to i64 +; %arrayidx = getelementptr inbounds float* %A, i64 %idxprom +; %11 = load float* %arrayidx, align 4, !tbaa !0 +; %add5 = add nsw i32 %mul2, %add3144 +; %idxprom6 = sext i32 %add5 to i64 +; %arrayidx7 = getelementptr inbounds float* %A, i64 %idxprom6 +; %12 = load float* %arrayidx7, align 4, !tbaa !0 +; %add10 = add nsw i32 %mul2, %add8145 +; %idxprom11 = sext i32 %add10 to i64 +; %arrayidx12 = getelementptr inbounds float* %A, i64 %idxprom11 +; %13 = load float* %arrayidx12, align 4, !tbaa !0 +; %add15 = add nsw i32 %mul2, %add13146 +; %idxprom16 = sext i32 %add15 to i64 +; %arrayidx17 = getelementptr inbounds float* %A, i64 %idxprom16 +; %14 = load float* %arrayidx17, align 4, !tbaa !0 + %add = add nsw i32 %mul2, %mul + %idxprom = sext i32 
%add to i64 + %arrayidx = getelementptr inbounds float* %A, i64 %idxprom + %arrayidx.cast = bitcast float* %arrayidx to <4 x float>* + %11 = load <4 x float>* %arrayidx.cast, align 4 + + %mul18 = mul nsw i32 %10, %ldb + %add19 = add nsw i32 %mul18, %9 + %idxprom20 = sext i32 %add19 to i64 + %arrayidx21 = getelementptr inbounds float* %B, i64 %idxprom20 +; %15 = load float* %arrayidx21, align 4, !tbaa !0 + %12 = load float* %arrayidx21, align 4, !tbaa !0 + + %b.tmp = insertelement <1 x float> < float 0.000000e+00 >, float %12, i32 0 + %b.vector = shufflevector <1 x float> %b.tmp, <1 x float> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 > + +; %mul22 = fmul fast float %11, %15 +; %add23 = fadd fast float %c0.0152, %mul22 +; %mul24 = fmul fast float %12, %15 +; %add25 = fadd fast float %c1.0151, %mul24 +; %mul26 = fmul fast float %13, %15 +; %add27 = fadd fast float %c2.0150, %mul26 +; %mul28 = fmul fast float %14, %15 +; %add29 = fadd fast float %c3.0149, %mul28 + %mul22 = fmul fast <4 x float> %11, %b.vector + %add23 = fadd fast <4 x float> %c.vector, %mul22 + + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %k + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry +; %c0.0.lcssa = phi float [ %add23, %for.body ], [ 0.000000e+00, %entry ] +; %c1.0.lcssa = phi float [ %add25, %for.body ], [ 0.000000e+00, %entry ] +; %c2.0.lcssa = phi float [ %add27, %for.body ], [ 0.000000e+00, %entry ] +; %c3.0.lcssa = phi float [ %add29, %for.body ], [ 0.000000e+00, %entry ] + %c.end.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ] + + %c0.0.lcssa = extractelement <4 x float> %c.end.vector, i32 0 + %c1.0.lcssa = extractelement <4 x float> %c.end.vector, i32 1 + %c2.0.lcssa = extractelement <4 x float> %c.end.vector, i32 2 + %c3.0.lcssa = extractelement <4 x float> 
%c.end.vector, i32 3 + + %mul30 = mul nsw i32 %9, %ldc + %add31 = add nsw i32 %mul30, %mul + %idxprom32 = sext i32 %add31 to i64 + %arrayidx33 = getelementptr inbounds float* %C, i64 %idxprom32 + +; %16 = load float* %arrayidx33, align 4, !tbaa !0 +; %mul34 = fmul fast float %16, %beta + %13 = load float* %arrayidx33, align 4, !tbaa !0 + %mul34 = fmul fast float %13, %beta + + %mul35 = fmul fast float %c0.0.lcssa, %alpha + %add36 = fadd fast float %mul35, %mul34 + store float %add36, float* %arrayidx33, align 4, !tbaa !0 + %add43 = add nsw i32 %add3144, %mul30 + %idxprom44 = sext i32 %add43 to i64 + %arrayidx45 = getelementptr inbounds float* %C, i64 %idxprom44 + +; %17 = load float* %arrayidx45, align 4, !tbaa !0 +; %mul46 = fmul fast float %17, %beta + %14 = load float* %arrayidx45, align 4, !tbaa !0 + %mul46 = fmul fast float %14, %beta + + %mul47 = fmul fast float %c1.0.lcssa, %alpha + %add48 = fadd fast float %mul47, %mul46 + store float %add48, float* %arrayidx45, align 4, !tbaa !0 + %add56 = add nsw i32 %add8145, %mul30 + %idxprom57 = sext i32 %add56 to i64 + %arrayidx58 = getelementptr inbounds float* %C, i64 %idxprom57 + +; %18 = load float* %arrayidx58, align 4, !tbaa !0 +; %mul59 = fmul fast float %18, %beta + %15 = load float* %arrayidx58, align 4, !tbaa !0 + %mul59 = fmul fast float %15, %beta + + %mul60 = fmul fast float %c2.0.lcssa, %alpha + %add61 = fadd fast float %mul60, %mul59 + store float %add61, float* %arrayidx58, align 4, !tbaa !0 + %add69 = add nsw i32 %add13146, %mul30 + %idxprom70 = sext i32 %add69 to i64 + %arrayidx71 = getelementptr inbounds float* %C, i64 %idxprom70 + +; %19 = load float* %arrayidx71, align 4, !tbaa !0 +; %mul72 = fmul fast float %19, %beta + %16 = load float* %arrayidx71, align 4, !tbaa !0 + %mul72 = fmul fast float %16, %beta + + %mul73 = fmul fast float %c3.0.lcssa, %alpha + %add74 = fadd fast float %mul73, %mul72 + store float %add74, float* %arrayidx71, align 4, !tbaa !0 + ret %rtype undef +} + +; Function Attrs: 
noinline nounwind uwtable +define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 { +entry: + switch i8 %transa, label %if.then [ + i8 78, label %if.end + i8 110, label %if.end + ] + +if.then: ; preds = %entry + %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1 + %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3 + %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 + %0 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64* %0, align 8 + %add.ptr.i.sum = add i64 %vbase.offset.i, 240 + %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum + %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"** + %2 = load %"class.std::ctype"** %1, align 8, !tbaa !4 + %tobool.i97 = icmp eq %"class.std::ctype"* %2, null + br i1 %tobool.i97, label %if.then.i98, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + +if.then.i98: ; preds = %if.then + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit: ; preds = %if.then + %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6 + %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !1 + %tobool.i = icmp eq i8 %3, 0 + br i1 %tobool.i, label %if.end.i, label %if.then.i + +if.then.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10 + %4 = load i8* %arrayidx.i, align 1, !tbaa !1 + br label %_ZNKSt5ctypeIcE5widenEc.exit + +if.end.i: ; preds = 
%_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1 + %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i71 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !3 + %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i71, i64 6 + %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8 + %call.i72 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit + +_ZNKSt5ctypeIcE5widenEc.exit: ; preds = %if.end.i, %if.then.i + %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i72, %if.end.i ] + %call1.i47 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1 + %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i47) #1 + br label %return + +if.end: ; preds = %entry, %entry + switch i8 %transb, label %if.then9 [ + i8 84, label %if.end12 + i8 116, label %if.end12 + ] + +if.then9: ; preds = %if.end + %call1.i49 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1 + %vtable.i51 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3 + %vbase.offset.ptr.i52 = getelementptr i8* %vtable.i51, i64 -24 + %7 = bitcast i8* %vbase.offset.ptr.i52 to i64* + %vbase.offset.i53 = load i64* %7, align 8 + %add.ptr.i54.sum = add i64 %vbase.offset.i53, 240 + %_M_ctype.i73 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i54.sum + %8 = bitcast i8* %_M_ctype.i73 to %"class.std::ctype"** + %9 = load %"class.std::ctype"** %8, align 8, !tbaa !4 + %tobool.i100 = icmp eq %"class.std::ctype"* %9, null + br i1 %tobool.i100, label %if.then.i101, 
label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + +if.then.i101: ; preds = %if.then9 + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then9 + %_M_widen_ok.i75 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6 + %10 = load i8* %_M_widen_ok.i75, align 1, !tbaa !1 + %tobool.i76 = icmp eq i8 %10, 0 + br i1 %tobool.i76, label %if.end.i82, label %if.then.i78 + +if.then.i78: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + %arrayidx.i77 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10 + %11 = load i8* %arrayidx.i77, align 1, !tbaa !1 + br label %_ZNKSt5ctypeIcE5widenEc.exit84 + +if.end.i82: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1 + %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i79 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !3 + %vfn.i80 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i79, i64 6 + %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i80, align 8 + %call.i81 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit84 + +_ZNKSt5ctypeIcE5widenEc.exit84: ; preds = %if.end.i82, %if.then.i78 + %retval.0.i83 = phi i8 [ %11, %if.then.i78 ], [ %call.i81, %if.end.i82 ] + %call1.i56 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i83) #1 + %call.i57 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i56) #1 + br label %return + +if.end12: ; preds = %if.end, %if.end + %rem44 = and i32 %m, 15 + %tobool = icmp eq i32 %rem44, 0 + br i1 %tobool, label %lor.lhs.false, label %if.then15 + +lor.lhs.false: ; preds = %if.end12 + %rem1345 = and i32 %n, 15 + %tobool14 = icmp eq i32 %rem1345, 0 + br i1 %tobool14, label 
%if.end21, label %if.then15 + +if.then15: ; preds = %lor.lhs.false, %if.end12 + %call1.i59 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1 + %call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 16) #1 + %call1.i61 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1 + %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 16) #1 + %14 = bitcast %"class.std::basic_ostream"* %call19 to i8** + %vtable.i63 = load i8** %14, align 8, !tbaa !3 + %vbase.offset.ptr.i64 = getelementptr i8* %vtable.i63, i64 -24 + %15 = bitcast i8* %vbase.offset.ptr.i64 to i64* + %vbase.offset.i65 = load i64* %15, align 8 + %16 = bitcast %"class.std::basic_ostream"* %call19 to i8* + %add.ptr.i66.sum = add i64 %vbase.offset.i65, 240 + %_M_ctype.i85 = getelementptr inbounds i8* %16, i64 %add.ptr.i66.sum + %17 = bitcast i8* %_M_ctype.i85 to %"class.std::ctype"** + %18 = load %"class.std::ctype"** %17, align 8, !tbaa !4 + %tobool.i104 = icmp eq %"class.std::ctype"* %18, null + br i1 %tobool.i104, label %if.then.i105, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 + +if.then.i105: ; preds = %if.then15 + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107: ; preds = %if.then15 + %_M_widen_ok.i87 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6 + %19 = load i8* %_M_widen_ok.i87, align 1, !tbaa !1 + %tobool.i88 = icmp eq i8 %19, 0 + br i1 %tobool.i88, label %if.end.i94, label %if.then.i90 + +if.then.i90: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 + 
%arrayidx.i89 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10 + %20 = load i8* %arrayidx.i89, align 1, !tbaa !1 + br label %_ZNKSt5ctypeIcE5widenEc.exit96 + +if.end.i94: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %18) #1 + %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i91 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !3 + %vfn.i92 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i91, i64 6 + %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i92, align 8 + %call.i93 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit96 + +_ZNKSt5ctypeIcE5widenEc.exit96: ; preds = %if.end.i94, %if.then.i90 + %retval.0.i95 = phi i8 [ %20, %if.then.i90 ], [ %call.i93, %if.end.i94 ] + %call1.i68 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i95) #1 + %call.i69 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i68) #1 + br label %if.end21 + +if.end21: ; preds = %_ZNKSt5ctypeIcE5widenEc.exit96, %lor.lhs.false + %div = sdiv i32 %m, 16 + %mul = and i32 %div, 1073741823 + %div22 = sdiv i32 %n, 16 + %mul24 = and i32 %div22, 268435455 + %conv33 = fpext float %alpha to double + %conv34 = fpext float %beta to double + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19) + %in.addr = alloca %struct.arg + %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 + store float* %A, float** %in.addr.A + %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 + store i64 %bytesA, i64* %in.addr.bytes_A + %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2 + store i32 %lda, i32* %in.addr.lda + %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 + store float* %B, float** %in.addr.B + %in.addr.bytes_B 
= getelementptr %struct.arg* %in.addr, i32 0, i32 4 + store i64 %bytesB, i64* %in.addr.bytes_B + %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5 + store i32 %ldb, i32* %in.addr.ldb + %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6 + store float* %C, float** %in.addr.C + %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7 + store i64 %bytesC, i64* %in.addr.bytes_C + %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8 + store i32 %ldc, i32* %in.addr.ldc + %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9 + store i32 %k, i32* %in.addr.k + %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10 + %in.addr.alpha.cast = fptrunc double %conv33 to float + store float %in.addr.alpha.cast, float* %in.addr.alpha + %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11 + %in.addr.beta.cast = fptrunc double %conv34 to float + store float %in.addr.beta.cast, float* %in.addr.beta + %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12 + store i32 4, i32* %in.addr.dimX0 + %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13 + store i32 16, i32* %in.addr.dimY0 + %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14 + store i32 %mul, i32* %in.addr.dimX1 + %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15 + store i32 %mul24, i32* %in.addr.dimY1 + %args = bitcast %struct.arg* %in.addr to i8* + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args) + call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2) + br label %return + +return: ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit84, %_ZNKSt5ctypeIcE5widenEc.exit + ret void +} + +declare 
%"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0 + +; Function Attrs: nounwind uwtable +define i32 @main(i32 %argc, i8** %argv) #2 { +entry: + %argc.addr = alloca i32, align 4 + %timers = alloca %struct.pb_TimerSet, align 8 + %matArow = alloca i32, align 4 + %matAcol = alloca i32, align 4 + %matBrow = alloca i32, align 4 + %matBcol = alloca i32, align 4 + %matA = alloca %"class.std::vector", align 8 + %matBT = alloca %"class.std::vector", align 8 + %matC = alloca %"class.std::vector", align 8 + store i32 %argc, i32* %argc.addr, align 4, !tbaa !5 + %0 = bitcast %struct.pb_TimerSet* %timers to i8* + call void @llvm.lifetime.start(i64 800, i8* %0) #1 + %1 = bitcast %"class.std::vector"* %matA to i8* + call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1 + %2 = bitcast %"class.std::vector"* %matBT to i8* + call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1 + call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 + %3 = call i8* @llvm_visc_initializeTimerSet() + store i8* %3, i8** @viscTimerSet_GenVISC + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + call void @llvm.visc.init() + %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1 + %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1 + %4 = load i8*** %inpFiles, align 8, !tbaa !4 + %5 = load i8** %4, align 8, !tbaa !4 + %cmp = icmp eq i8* %5, null + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %arrayidx2 = getelementptr inbounds i8** %4, i64 1 + %6 = load i8** %arrayidx2, align 8, !tbaa !4 + %cmp3 = icmp eq i8* %6, null + br i1 %cmp3, label %if.then, label %lor.lhs.false4 + +lor.lhs.false4: ; preds = %lor.lhs.false + %arrayidx6 = getelementptr inbounds i8** %4, i64 2 + %7 = load i8** %arrayidx6, align 8, !tbaa !4 + %cmp7 = icmp eq i8* %7, null + br i1 %cmp7, label %if.then, label %lor.lhs.false8 + 
+lor.lhs.false8: ; preds = %lor.lhs.false4 + %arrayidx10 = getelementptr inbounds i8** %4, i64 3 + %8 = load i8** %arrayidx10, align 8, !tbaa !4 + %cmp11 = icmp eq i8* %8, null + br i1 %cmp11, label %if.end, label %if.then + +if.then: ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry + %9 = load %struct._IO_FILE** @stderr, align 8, !tbaa !4 + %10 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %9) + call void @exit(i32 -1) #7 + unreachable + +if.end: ; preds = %lor.lhs.false8 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1 + %11 = load i8*** %inpFiles, align 8, !tbaa !4 + %12 = load i8** %11, align 8, !tbaa !4 + %call15 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %12, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1 + %13 = load i8*** %inpFiles, align 8, !tbaa !4 + %arrayidx17 = getelementptr inbounds i8** %13, i64 2 + %14 = load i8** %arrayidx17, align 8, !tbaa !4 + %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %14, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + %15 = load i32* %matArow, align 4, !tbaa !5 + %16 = load i32* %matAcol, align 4, !tbaa !5 + %mul = mul nsw i32 %16, %15 + %conv = sext i32 %mul to i64 + %mul19 = shl nsw i64 %conv, 2 + %17 = load i32* %matBrow, align 4, !tbaa !5 + %18 = load i32* %matBcol, align 4, !tbaa !5 + %mul20 = mul nsw i32 %18, %17 + %conv21 = sext i32 %mul20 to i64 + %mul22 = shl nsw i64 %conv21, 2 + %mul23 = mul nsw i32 %18, %15 + %conv24 = sext i32 %mul23 to i64 + %mul25 = shl nsw i64 %conv24, 2 + %19 = bitcast %"class.std::vector"* %matC to i8* + call void @llvm.memset.p0i8.i64(i8* %19, i8 0, i64 24, i32 8, i1 false) #1 + %cmp.i.i.i.i = icmp eq i32 %mul23, 0 + br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i 
+ +cond.true.i.i.i.i: ; preds = %if.end + %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !6 + +if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i + call void @_ZSt17__throw_bad_allocv() #7 + unreachable + +_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i + %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1 + %20 = bitcast i8* %call2.i.i.i.i.i to float* + br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + +_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i: ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end + %cond.i.i.i.i = phi float* [ %20, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ] + %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0 + store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !4 + %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1 + store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4 + %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24 + %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2 + store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !4 + br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i + +for.body.lr.ph.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7 + %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i + %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24 + %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i + br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, 
label %vector.body.i.i.i.i.i.i.i.i + +vector.body.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i + %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ] + %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i + %21 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>* + store <4 x float> zeroinitializer, <4 x float>* %21, align 4 + %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4 + %22 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i + %23 = bitcast float* %22 to <4 x float>* + store <4 x float> zeroinitializer, <4 x float>* %23, align 4 + %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8 + %24 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i + br i1 %24, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i + +middle.block.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i + %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24 + br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader + +for.body.i.i.i.i.i.i.i.i.preheader: ; preds = %middle.block.i.i.i.i.i.i.i.i + %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8* + %25 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2 + call void @llvm.memset.p0i8.i64(i8* 
%resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %25, i32 4, i1 false) + br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + +_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit: ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4 + %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0 + %26 = load float** %_M_start.i.i, align 8, !tbaa !4 + %27 = bitcast float* %26 to i8* + call void @llvm_visc_track_mem(i8* %27, i64 %mul19) #1 + %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0 + %28 = load float** %_M_start.i.i82, align 8, !tbaa !4 + %29 = bitcast float* %28 to i8* + call void @llvm_visc_track_mem(i8* %29, i64 %mul22) #1 + %30 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %31 = bitcast float* %30 to i8* + call void @llvm_visc_track_mem(i8* %31, i64 %mul25) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + %32 = load float** %_M_finish.i.i.i, align 8, !tbaa !4 + %33 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %cmp3399 = icmp eq float* %32, %33 + br i1 %cmp3399, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + %sub.ptr.lhs.cast.i = ptrtoint float* %32 to i64 + %sub.ptr.rhs.cast.i = ptrtoint float* %33 to i64 + %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i + %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %add.ptr.i = getelementptr inbounds float* %33, i64 %i.0100 + store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !0 + %inc = add i64 %i.0100, 1 + %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i + br i1 %cmp33, label %for.body, label %for.end + +for.end: ; preds = %for.body, 
%_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + %34 = load i32* %matArow, align 4, !tbaa !5 + %35 = load i32* %matBcol, align 4, !tbaa !5 + %36 = load i32* %matAcol, align 4, !tbaa !5 + %37 = load float** %_M_start.i.i, align 8, !tbaa !4 + %38 = load float** %_M_start.i.i82, align 8, !tbaa !4 + %39 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %34, i32 %35, i32 %36, float 1.000000e+00, float* %37, i64 %mul19, i32 %34, float* %38, i64 %mul22, i32 %35, float 0.000000e+00, float* %39, i64 %mul25, i32 %34) + %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0 + %40 = load i8** %outFile, align 8, !tbaa !4 + %tobool = icmp eq i8* %40, null + br i1 %tobool, label %if.end42, label %if.then38 + +if.then38: ; preds = %for.end + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1 + %41 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %42 = bitcast float* %41 to i8* + call void @llvm_visc_request_mem(i8* %42, i64 %mul25) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1 + %43 = load i8** %outFile, align 8, !tbaa !4 + %44 = load i32* %matArow, align 4, !tbaa !5 + %45 = load i32* %matBcol, align 4, !tbaa !5 + %call41 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %43, i32 %44, i32 %45, %"class.std::vector"* %matC) #1 + br label %if.end42 + +if.end42: ; preds = %if.then38, %for.end + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1 + %46 = load float** %_M_start.i.i, align 8, !tbaa !4 + %47 = bitcast float* %46 to i8* + call void @llvm_visc_untrack_mem(i8* %47) #1 + %48 = load float** %_M_start.i.i82, align 8, !tbaa !4 + %49 = bitcast float* %48 to i8* + call void @llvm_visc_untrack_mem(i8* %49) #1 + %50 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %51 = bitcast float* %50 to i8* + call void 
@llvm_visc_untrack_mem(i8* %51) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2 + %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1 + %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1 + %52 = load i32* %matArow, align 4, !tbaa !5 + %conv50 = sitofp i32 %52 to double + %mul51 = fmul fast double %conv50, 2.000000e+00 + %53 = load i32* %matBcol, align 4, !tbaa !5 + %conv52 = sitofp i32 %53 to double + %mul53 = fmul fast double %mul51, %conv52 + %54 = load i32* %matAcol, align 4, !tbaa !5 + %conv54 = sitofp i32 %54 to double + %mul55 = fmul fast double %mul53, %conv54 + %div = fdiv fast double %mul55, %call48 + %div56 = fmul double %div, 1.000000e-09 + %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1 + %55 = bitcast %"class.std::basic_ostream"* %call.i to i8** + %vtable.i = load i8** %55, align 8, !tbaa !3 + %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 + %56 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64* %56, align 8 + %57 = bitcast %"class.std::basic_ostream"* %call.i to i8* + %add.ptr.sum.i = add i64 %vbase.offset.i, 240 + %_M_ctype.i.i = getelementptr inbounds i8* %57, i64 %add.ptr.sum.i + %58 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"** + %59 = load %"class.std::ctype"** %58, align 8, !tbaa !4 + %tobool.i.i.i = icmp eq %"class.std::ctype"* %59, null + br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + +if.then.i.i.i: ; preds = %if.end42 + call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end42 + 
%_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 6 + %60 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !1 + %tobool.i3.i.i = icmp eq i8 %60, 0 + br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i + +if.then.i4.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 7, i64 10 + %61 = load i8* %arrayidx.i.i.i, align 1, !tbaa !1 + br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + +if.end.i.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %59) #1 + %62 = bitcast %"class.std::ctype"* %59 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %62, align 8, !tbaa !3 + %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6 + %63 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8 + %call.i.i.i = call signext i8 %63(%"class.std::ctype"* %59, i8 signext 10) #1 + br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + +_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i + %retval.0.i.i.i = phi i8 [ %61, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ] + %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1 + %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1 + call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 + %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0 + call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr) + call void @llvm.visc.cleanup() + call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1 + %64 = load float** %_M_start.i.i.i81, align 8, !tbaa !4 + %tobool.i.i.i.i78 = icmp eq float* %64, null + br i1 
%tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79 + +if.then.i.i.i.i79: ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + %65 = bitcast float* %64 to i8* + call void @_ZdlPv(i8* %65) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit80 + +_ZNSt6vectorIfSaIfEED1Ev.exit80: ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + %66 = load float** %_M_start.i.i82, align 8, !tbaa !4 + %tobool.i.i.i.i74 = icmp eq float* %66, null + br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75 + +if.then.i.i.i.i75: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit80 + %67 = bitcast float* %66 to i8* + call void @_ZdlPv(i8* %67) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit76 + +_ZNSt6vectorIfSaIfEED1Ev.exit76: ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80 + %68 = load float** %_M_start.i.i, align 8, !tbaa !4 + %tobool.i.i.i.i = icmp eq float* %68, null + br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i + +if.then.i.i.i.i: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76 + %69 = bitcast float* %68 to i8* + call void @_ZdlPv(i8* %69) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit + +_ZNSt6vectorIfSaIfEED1Ev.exit: ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit76 + call void @llvm.lifetime.end(i64 800, i8* %0) #1 + ret i32 0 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 + +declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0 + +declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0 + +; Function Attrs: noreturn nounwind +declare void @exit(i32) #4 + +declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 + +declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0 + +declare void @llvm_visc_track_mem(i8*, i64) #0 + +declare void @llvm_visc_request_mem(i8*, i64) #0 + +declare zeroext 
i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0 + +declare void @llvm_visc_untrack_mem(i8*) #0 + +declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0 + +declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 + +declare void @pb_FreeParameters(%struct.pb_Parameters*) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #1 + +declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0 + +; Function Attrs: noreturn +declare void @_ZSt17__throw_bad_allocv() #5 + +declare noalias i8* @_Znwm(i64) #0 + +; Function Attrs: nounwind +declare void @_ZdlPv(i8*) #6 + +declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0 + +declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0 + +; Function Attrs: noreturn +declare void @_ZSt16__throw_bad_castv() #5 + +declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0 + +declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0 + +; Function Attrs: nounwind +define internal void @_GLOBAL__I_a() #1 section ".text.startup" { +entry: + tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1 + %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1 + ret void +} + +; Function Attrs: nounwind +declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 + +; Function Attrs: nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 + +declare i8* @llvm_visc_initializeTimerSet() + +declare void @llvm_visc_switchToTimer(i8**, i32) + +declare void 
@llvm_visc_printTimerSet(i8**, i8*) + +; Function Attrs: nounwind +declare i8* @llvm.visc.getNode() #1 + +; Function Attrs: nounwind +declare i8* @llvm.visc.getParentNode(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10) + call void @llvm.visc.bind.input(i8* 
%_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.bind.input(i8*, i32, i32) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), i32 %dimX1, i32 %dimY2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11) 
+ call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.launch(i8*, i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.wait(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.init() #1 + +; Function Attrs: nounwind +declare void @llvm.visc.cleanup() #1 + +attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #7 = { noreturn nounwind } + +!0 = metadata 
!{metadata !"float", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"vtable pointer", metadata !2} +!4 = metadata !{metadata !"any pointer", metadata !1} +!5 = metadata !{metadata !"int", metadata !1} +!6 = metadata !{metadata !"branch_weights", i32 4, i32 64} diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/Makefile similarity index 83% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/Makefile index 2234bf54e1..f74ee8921a 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc similarity index 90% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc index 286297d6fe..76d0cefc81 100644 --- 
a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc @@ -10,7 +10,6 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -20,6 +19,7 @@ #include <string.h> #include <sys/time.h> #include <vector> +#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -41,8 +41,8 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __hpvm__hint(hpvm::GPU_TARGET); - __hpvm__attributes(3, A, B, C, 1, C); + __visc__hint(visc::GPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); float c = 0.0f; int m = get_global_id(0); @@ -99,10 +99,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m, unsigned db[2] = {TILE_SZ / VEC_SZ, TILE_SZ}; unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]}; - unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __hpvm__wait(sgemmDFG); + __visc__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory @@ -145,9 +145,9 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - llvm_hpvm_track_mem(&matA.front(), A_sz); - llvm_hpvm_track_mem(&matBT.front(), B_sz); - llvm_hpvm_track_mem(&matC.front(), C_sz); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + 
llvm_visc_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -162,16 +162,16 @@ int main(int argc, char *argv[]) { matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(&matC.front(), C_sz); + llvm_visc_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(&matA.front()); - llvm_hpvm_untrack_mem(&matBT.front()); - llvm_hpvm_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); if (params->outFile) { diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll new file mode 100644 index 0000000000..b6e9e3818e --- /dev/null +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll @@ -0,0 +1,869 @@ +; ModuleID = 'build/visc_vec_default/main.ll' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"class.std::ios_base::Init" = type { i8 } +%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" } +%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* } +%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" } +%"struct.std::ios_base::_Callback_list" = type { 
%"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 } +%"struct.std::ios_base::_Words" = type { i8*, i64 } +%"class.std::locale" = type { %"class.std::locale::_Impl"* } +%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** } +%"class.std::locale::facet" = type { i32 (...)**, i32 } +%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" } +%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 } +%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] } +%struct.__locale_data = type opaque +%"class.std::num_put" = type { %"class.std::locale::facet" } +%"class.std::num_get" = type { %"class.std::locale::facet" } +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%rtype = type {} +%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }> +%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] } +%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } +%struct.pb_Timer = type { i32, i64, i64 } +%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* } +%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" } +%"struct.std::_Vector_base<float, std::allocator<float> 
>::_Vector_impl" = type { float*, float*, float* } +%struct.pb_Parameters = type { i8*, i8** } + +@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 +@__dso_handle = external global i8 +@_ZSt4cerr = external global %"class.std::basic_ostream" +@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1 +@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1 +@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. m should be multiple of \00", align 1 +@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1 +@stderr = external global %struct._IO_FILE* +@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1 +@_ZSt4cout = external global %"class.std::basic_ostream" +@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1 +@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] +@viscTimerSet_GenVISC = common global i8* null +@0 = internal constant [14 x i8] c"GenVISC_Timer\00" + +declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0 + +declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0 + +; Function Attrs: nounwind +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1 + +; Function Attrs: nounwind readnone +declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 { + %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode() + %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %a0 = call 
i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) + %1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %2 = mul i32 %a0, %1 + %3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %4 = add i32 %2, %3 + %5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) + %a6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %a7 = mul i32 %5, %a6 + %a8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + %a9 = add i32 %a7, %a8 + + %a10 = shl i32 %4, 3 + + + ;a10 = %3, a9 = %5 + ;%1 = tail call i64 @_Z13get_global_idj(i32 0) #1 + ;%2 = shl i64 %1, 3 + ;%3 = trunc i64 %2 to i32 + ;%4 = tail call i64 @_Z13get_global_idj(i32 1) #1 + ;%5 = trunc i64 %4 to i32 + + + + + %6 = icmp sgt i32 %k, 0 + br i1 %6, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %.lr.ph, %0 + %cp.021 = phi <8 x float> [ %20, %.lr.ph ], [ zeroinitializer, %0 ] + %i.020 = phi i32 [ %21, %.lr.ph ], [ 0, %0 ] + %7 = mul nsw i32 %i.020, %lda + %8 = add nsw i32 %7, %a10 + %9 = sext i32 %8 to i64 + %10 = getelementptr inbounds float* %A, i64 %9 + %v10 = bitcast float* %10 to <8 x float>* + %11 = load <8 x float>* %v10 + %12 = mul nsw i32 %i.020, %ldb + %13 = add nsw i32 %12, %a9 + %14 = sext i32 %13 to i64 + %15 = getelementptr inbounds float* %B, i64 %14 + %16 = load float* %15, align 4, !tbaa !9 + %17 = insertelement <8 x float> undef, float %16, i32 0 + %18 = shufflevector <8 x float> %17, <8 x float> undef, <8 x i32> zeroinitializer + %19 = fmul <8 x float> %11, %18 + %20 = fadd <8 x float> %cp.021, %19 + %21 = add nsw i32 %i.020, 1 + %22 = icmp slt i32 %21, %k + br i1 %22, label %.lr.ph, label %._crit_edge + +._crit_edge: ; preds = %.lr.ph, %0 + %cp.0.lcssa = phi <8 x float> [ zeroinitializer, %0 ], [ %20, %.lr.ph ] + %23 = mul nsw i32 %a9, %ldc + %24 = add nsw i32 %23, %a10 + %25 = sext i32 %24 to i64 + %26 = getelementptr 
inbounds float* %C, i64 %25 + %v26 = bitcast float* %26 to <8 x float>* + %27 = load <8 x float>* %v26 + %28 = insertelement <8 x float> undef, float %beta, i32 0 + %29 = shufflevector <8 x float> %28, <8 x float> undef, <8 x i32> zeroinitializer + %30 = insertelement <8 x float> undef, float %alpha, i32 0 + %31 = shufflevector <8 x float> %30, <8 x float> undef, <8 x i32> zeroinitializer + %32 = fmul <8 x float> %31, %cp.0.lcssa + + ;%33 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %27, <8 x float> %29, <8 x float> %32) + + %mul = fmul <8 x float> %27, %29 + %33 = fadd <8 x float> %mul, %32 + store <8 x float> %33, <8 x float>* %v26 + +;entry: + ;%_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode() + ;%_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + ;%0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) + ;%1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + ;%2 = mul i32 %0, %1 + ;%3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + ;%4 = add i32 %2, %3 + ;%5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) + ;%6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + ;%7 = mul i32 %5, %6 + ;%8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + ;%9 = add i32 %7, %8 + ;%cmp32 = icmp sgt i32 %k, 0 + ;br i1 %cmp32, label %for.body, label %for.end + +;for.body: ; preds = %for.body, %entry + ;%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + ;%c.034 = phi float [ %add7, %for.body ], [ 0.000000e+00, %entry ] + ;%10 = trunc i64 %indvars.iv to i32 + ;%mul = mul nsw i32 %10, %lda + ;%add = add nsw i32 %mul, %4 + ;%idxprom = sext i32 %add to i64 + ;%arrayidx = getelementptr inbounds float* %A, i64 %idxprom + ;%11 = load float* %arrayidx, align 4, !tbaa !3 + ;%mul2 = mul nsw i32 %10, %ldb + 
;%add3 = add nsw i32 %mul2, %9 + ;%idxprom4 = sext i32 %add3 to i64 + ;%arrayidx5 = getelementptr inbounds float* %B, i64 %idxprom4 + ;%12 = load float* %arrayidx5, align 4, !tbaa !3 + ;%mul6 = fmul fast float %11, %12 + ;%add7 = fadd fast float %c.034, %mul6 + ;%indvars.iv.next = add i64 %indvars.iv, 1 + ;%lftr.wideiv = trunc i64 %indvars.iv.next to i32 + ;%exitcond = icmp eq i32 %lftr.wideiv, %k + ;br i1 %exitcond, label %for.end, label %for.body + +;for.end: ; preds = %for.body, %entry + ;%c.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add7, %for.body ] + ;%mul8 = mul nsw i32 %9, %ldc + ;%add9 = add nsw i32 %mul8, %4 + ;%idxprom10 = sext i32 %add9 to i64 + ;%arrayidx11 = getelementptr inbounds float* %C, i64 %idxprom10 + ;%13 = load float* %arrayidx11, align 4, !tbaa !3 + ;%mul12 = fmul fast float %13, %beta + ;%mul13 = fmul fast float %c.0.lcssa, %alpha + ;%add14 = fadd fast float %mul13, %mul12 + ;store float %add14, float* %arrayidx11, align 4, !tbaa !3 + ret %rtype undef + + +} + +; Function Attrs: noinline nounwind uwtable +define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 { +entry: + switch i8 %transa, label %if.then [ + i8 78, label %if.end + i8 110, label %if.end + ] + +if.then: ; preds = %entry + %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1 + %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6 + %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 + %0 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64* %0, align 8 + %add.ptr.i.sum = add i64 %vbase.offset.i, 240 + %_M_ctype.i = getelementptr 
inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum + %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"** + %2 = load %"class.std::ctype"** %1, align 8, !tbaa !7 + %tobool.i97 = icmp eq %"class.std::ctype"* %2, null + br i1 %tobool.i97, label %if.then.i98, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + +if.then.i98: ; preds = %if.then + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit: ; preds = %if.then + %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6 + %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !4 + %tobool.i = icmp eq i8 %3, 0 + br i1 %tobool.i, label %if.end.i, label %if.then.i + +if.then.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10 + %4 = load i8* %arrayidx.i, align 1, !tbaa !4 + br label %_ZNKSt5ctypeIcE5widenEc.exit + +if.end.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1 + %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i71 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !6 + %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i71, i64 6 + %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8 + %call.i72 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit + +_ZNKSt5ctypeIcE5widenEc.exit: ; preds = %if.end.i, %if.then.i + %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i72, %if.end.i ] + %call1.i47 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1 + %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i47) #1 + br label %return + +if.end: ; preds = %entry, %entry + switch i8 %transb, label 
%if.then9 [ + i8 84, label %if.end12 + i8 116, label %if.end12 + ] + +if.then9: ; preds = %if.end + %call1.i49 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1 + %vtable.i51 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6 + %vbase.offset.ptr.i52 = getelementptr i8* %vtable.i51, i64 -24 + %7 = bitcast i8* %vbase.offset.ptr.i52 to i64* + %vbase.offset.i53 = load i64* %7, align 8 + %add.ptr.i54.sum = add i64 %vbase.offset.i53, 240 + %_M_ctype.i73 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i54.sum + %8 = bitcast i8* %_M_ctype.i73 to %"class.std::ctype"** + %9 = load %"class.std::ctype"** %8, align 8, !tbaa !7 + %tobool.i100 = icmp eq %"class.std::ctype"* %9, null + br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + +if.then.i101: ; preds = %if.then9 + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then9 + %_M_widen_ok.i75 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6 + %10 = load i8* %_M_widen_ok.i75, align 1, !tbaa !4 + %tobool.i76 = icmp eq i8 %10, 0 + br i1 %tobool.i76, label %if.end.i82, label %if.then.i78 + +if.then.i78: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + %arrayidx.i77 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10 + %11 = load i8* %arrayidx.i77, align 1, !tbaa !4 + br label %_ZNKSt5ctypeIcE5widenEc.exit84 + +if.end.i82: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1 + %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i79 = load i8 (%"class.std::ctype"*, i8)*** %12, align 
8, !tbaa !6 + %vfn.i80 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i79, i64 6 + %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i80, align 8 + %call.i81 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit84 + +_ZNKSt5ctypeIcE5widenEc.exit84: ; preds = %if.end.i82, %if.then.i78 + %retval.0.i83 = phi i8 [ %11, %if.then.i78 ], [ %call.i81, %if.end.i82 ] + %call1.i56 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i83) #1 + %call.i57 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i56) #1 + br label %return + +if.end12: ; preds = %if.end, %if.end + %rem44 = and i32 %m, 15 + %tobool = icmp eq i32 %rem44, 0 + br i1 %tobool, label %lor.lhs.false, label %if.then15 + +lor.lhs.false: ; preds = %if.end12 + %rem1345 = and i32 %n, 15 + %tobool14 = icmp eq i32 %rem1345, 0 + br i1 %tobool14, label %if.end21, label %if.then15 + +if.then15: ; preds = %lor.lhs.false, %if.end12 + %call1.i59 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1 + %call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 16) #1 + %call1.i61 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1 + %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 16) #1 + %14 = bitcast %"class.std::basic_ostream"* %call19 to i8** + %vtable.i63 = load i8** %14, align 8, !tbaa !6 + %vbase.offset.ptr.i64 = getelementptr i8* %vtable.i63, i64 -24 + %15 = bitcast i8* %vbase.offset.ptr.i64 to 
i64* + %vbase.offset.i65 = load i64* %15, align 8 + %16 = bitcast %"class.std::basic_ostream"* %call19 to i8* + %add.ptr.i66.sum = add i64 %vbase.offset.i65, 240 + %_M_ctype.i85 = getelementptr inbounds i8* %16, i64 %add.ptr.i66.sum + %17 = bitcast i8* %_M_ctype.i85 to %"class.std::ctype"** + %18 = load %"class.std::ctype"** %17, align 8, !tbaa !7 + %tobool.i104 = icmp eq %"class.std::ctype"* %18, null + br i1 %tobool.i104, label %if.then.i105, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 + +if.then.i105: ; preds = %if.then15 + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107: ; preds = %if.then15 + %_M_widen_ok.i87 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6 + %19 = load i8* %_M_widen_ok.i87, align 1, !tbaa !4 + %tobool.i88 = icmp eq i8 %19, 0 + br i1 %tobool.i88, label %if.end.i94, label %if.then.i90 + +if.then.i90: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 + %arrayidx.i89 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10 + %20 = load i8* %arrayidx.i89, align 1, !tbaa !4 + br label %_ZNKSt5ctypeIcE5widenEc.exit96 + +if.end.i94: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107 + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %18) #1 + %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i91 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !6 + %vfn.i92 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i91, i64 6 + %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i92, align 8 + %call.i93 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit96 + +_ZNKSt5ctypeIcE5widenEc.exit96: ; preds = %if.end.i94, %if.then.i90 + %retval.0.i95 = phi i8 [ %20, %if.then.i90 ], [ %call.i93, %if.end.i94 ] + %call1.i68 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, 
i8 signext %retval.0.i95) #1 + %call.i69 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i68) #1 + br label %if.end21 + +if.end21: ; preds = %_ZNKSt5ctypeIcE5widenEc.exit96, %lor.lhs.false + %div = sdiv i32 %m, 16 + %mul = and i32 %div, 2147483647 + %div22 = sdiv i32 %n, 16 + %mul24 = and i32 %div22, 268435455 + %conv33 = fpext float %alpha to double + %conv34 = fpext float %beta to double + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19) + %in.addr = alloca %struct.arg + %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 + store float* %A, float** %in.addr.A + %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 + store i64 %bytesA, i64* %in.addr.bytes_A + %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2 + store i32 %lda, i32* %in.addr.lda + %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 + store float* %B, float** %in.addr.B + %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4 + store i64 %bytesB, i64* %in.addr.bytes_B + %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5 + store i32 %ldb, i32* %in.addr.ldb + %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6 + store float* %C, float** %in.addr.C + %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7 + store i64 %bytesC, i64* %in.addr.bytes_C + %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8 + store i32 %ldc, i32* %in.addr.ldc + %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9 + store i32 %k, i32* %in.addr.k + %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10 + %in.addr.alpha.cast = fptrunc double %conv33 to float + store float %in.addr.alpha.cast, float* %in.addr.alpha + %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11 + %in.addr.beta.cast = fptrunc double %conv34 to float + store float %in.addr.beta.cast, float* %in.addr.beta + %in.addr.dimX0 = getelementptr 
%struct.arg* %in.addr, i32 0, i32 12 + store i32 2, i32* %in.addr.dimX0 + %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13 + store i32 16, i32* %in.addr.dimY0 + %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14 + store i32 %mul, i32* %in.addr.dimX1 + %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15 + store i32 %mul24, i32* %in.addr.dimY1 + %args = bitcast %struct.arg* %in.addr to i8* + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args) + call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2) + br label %return + +return: ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit84, %_ZNKSt5ctypeIcE5widenEc.exit + ret void +} + +declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0 + +; Function Attrs: nounwind uwtable +define i32 @main(i32 %argc, i8** %argv) #2 { +entry: + %argc.addr = alloca i32, align 4 + %timers = alloca %struct.pb_TimerSet, align 8 + %matArow = alloca i32, align 4 + %matAcol = alloca i32, align 4 + %matBrow = alloca i32, align 4 + %matBcol = alloca i32, align 4 + %matA = alloca %"class.std::vector", align 8 + %matBT = alloca %"class.std::vector", align 8 + %matC = alloca %"class.std::vector", align 8 + store i32 %argc, i32* %argc.addr, align 4, !tbaa !8 + %0 = bitcast %struct.pb_TimerSet* %timers to i8* + call void @llvm.lifetime.start(i64 800, i8* %0) #1 + %1 = bitcast %"class.std::vector"* %matA to i8* + call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1 + %2 = bitcast %"class.std::vector"* %matBT to i8* + call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1 + %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, 
i8** %argv) #1 + %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1 + %3 = load i8*** %inpFiles, align 8, !tbaa !7 + %4 = load i8** %3, align 8, !tbaa !7 + %cmp = icmp eq i8* %4, null + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %arrayidx2 = getelementptr inbounds i8** %3, i64 1 + %5 = load i8** %arrayidx2, align 8, !tbaa !7 + %cmp3 = icmp eq i8* %5, null + br i1 %cmp3, label %if.then, label %lor.lhs.false4 + +lor.lhs.false4: ; preds = %lor.lhs.false + %arrayidx6 = getelementptr inbounds i8** %3, i64 2 + %6 = load i8** %arrayidx6, align 8, !tbaa !7 + %cmp7 = icmp eq i8* %6, null + br i1 %cmp7, label %if.then, label %lor.lhs.false8 + +lor.lhs.false8: ; preds = %lor.lhs.false4 + %arrayidx10 = getelementptr inbounds i8** %3, i64 3 + %7 = load i8** %arrayidx10, align 8, !tbaa !7 + %cmp11 = icmp eq i8* %7, null + br i1 %cmp11, label %if.end, label %if.then + +if.then: ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry + %8 = load %struct._IO_FILE** @stderr, align 8, !tbaa !7 + %9 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %8) + call void @exit(i32 -1) #7 + unreachable + +if.end: ; preds = %lor.lhs.false8 + %call15 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %4, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1 + %10 = load i8*** %inpFiles, align 8, !tbaa !7 + %arrayidx17 = getelementptr inbounds i8** %10, i64 2 + %11 = load i8** %arrayidx17, align 8, !tbaa !7 + %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %11, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1 + call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 + %12 = call i8* @llvm_visc_initializeTimerSet() + store i8* %12, i8** @viscTimerSet_GenVISC + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + call void @llvm.visc.init() + 
call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + %13 = load i32* %matArow, align 4, !tbaa !8 + %14 = load i32* %matAcol, align 4, !tbaa !8 + %mul = mul nsw i32 %14, %13 + %conv = sext i32 %mul to i64 + %mul19 = shl nsw i64 %conv, 2 + %15 = load i32* %matBrow, align 4, !tbaa !8 + %16 = load i32* %matBcol, align 4, !tbaa !8 + %mul20 = mul nsw i32 %16, %15 + %conv21 = sext i32 %mul20 to i64 + %mul22 = shl nsw i64 %conv21, 2 + %mul23 = mul nsw i32 %16, %13 + %conv24 = sext i32 %mul23 to i64 + %mul25 = shl nsw i64 %conv24, 2 + %17 = bitcast %"class.std::vector"* %matC to i8* + call void @llvm.memset.p0i8.i64(i8* %17, i8 0, i64 24, i32 8, i1 false) #1 + %cmp.i.i.i.i = icmp eq i32 %mul23, 0 + br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i + +cond.true.i.i.i.i: ; preds = %if.end + %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !9 + +if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i + call void @_ZSt17__throw_bad_allocv() #7 + unreachable + +_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i + %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1 + %18 = bitcast i8* %call2.i.i.i.i.i to float* + br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + +_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i: ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end + %cond.i.i.i.i = phi float* [ %18, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ] + %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0 + store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !7 + %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1 + store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7 + 
%add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24 + %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2 + store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !7 + br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i + +for.body.lr.ph.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7 + %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i + %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24 + %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i + br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i + +vector.body.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i + %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ] + %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i + %19 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>* + store <4 x float> zeroinitializer, <4 x float>* %19, align 4 + %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4 + %20 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i + %21 = bitcast float* %20 to <4 x float>* + store <4 x float> zeroinitializer, <4 x float>* %21, align 4 + %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8 + %22 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i + br i1 %22, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i + +middle.block.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i + %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i 
], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24 + br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader + +for.body.i.i.i.i.i.i.i.i.preheader: ; preds = %middle.block.i.i.i.i.i.i.i.i + %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8* + %23 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2 + call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %23, i32 4, i1 false) + br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + +_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit: ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7 + %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0 + %24 = load float** %_M_start.i.i, align 8, !tbaa !7 + %25 = bitcast float* %24 to i8* + call void @llvm_visc_track_mem(i8* %25, i64 %mul19) #1 + %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0 + %26 = load float** %_M_start.i.i82, align 8, !tbaa !7 + %27 = bitcast float* %26 to i8* + call void @llvm_visc_track_mem(i8* %27, i64 %mul22) #1 + %28 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + %29 = bitcast float* %28 to i8* + call void @llvm_visc_track_mem(i8* %29, i64 %mul25) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + %30 = load float** %_M_finish.i.i.i, align 8, !tbaa !7 + %31 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + 
%cmp3399 = icmp eq float* %30, %31 + br i1 %cmp3399, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + %sub.ptr.lhs.cast.i = ptrtoint float* %30 to i64 + %sub.ptr.rhs.cast.i = ptrtoint float* %31 to i64 + %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i + %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %add.ptr.i = getelementptr inbounds float* %31, i64 %i.0100 + store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !3 + %inc = add i64 %i.0100, 1 + %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i + br i1 %cmp33, label %for.body, label %for.end + +for.end: ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + %32 = load i32* %matArow, align 4, !tbaa !8 + %33 = load i32* %matBcol, align 4, !tbaa !8 + %34 = load i32* %matAcol, align 4, !tbaa !8 + %35 = load float** %_M_start.i.i, align 8, !tbaa !7 + %36 = load float** %_M_start.i.i82, align 8, !tbaa !7 + %37 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %32, i32 %33, i32 %34, float 1.000000e+00, float* %35, i64 %mul19, i32 %32, float* %36, i64 %mul22, i32 %33, float 0.000000e+00, float* %37, i64 %mul25, i32 %32) + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1 + %38 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + %39 = bitcast float* %38 to i8* + call void @llvm_visc_request_mem(i8* %39, i64 %mul25) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1 + %40 = load float** %_M_start.i.i, align 8, !tbaa !7 + %41 = bitcast float* %40 to i8* + call void @llvm_visc_untrack_mem(i8* %41) #1 + %42 = load float** %_M_start.i.i82, align 8, !tbaa !7 + %43 = bitcast float* %42 to i8* + call void 
@llvm_visc_untrack_mem(i8* %43) #1 + %44 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + %45 = bitcast float* %44 to i8* + call void @llvm_visc_untrack_mem(i8* %45) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 + %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0 + call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr) + call void @llvm.visc.cleanup() + %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0 + %46 = load i8** %outFile, align 8, !tbaa !7 + %tobool = icmp eq i8* %46, null + br i1 %tobool, label %if.end45, label %if.then42 + +if.then42: ; preds = %for.end + %47 = load i32* %matArow, align 4, !tbaa !8 + %48 = load i32* %matBcol, align 4, !tbaa !8 + %call44 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %46, i32 %47, i32 %48, %"class.std::vector"* %matC) #1 + br label %if.end45 + +if.end45: ; preds = %if.then42, %for.end + %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2 + %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1 + %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1 + %49 = load i32* %matArow, align 4, !tbaa !8 + %conv50 = sitofp i32 %49 to double + %mul51 = fmul fast double %conv50, 2.000000e+00 + %50 = load i32* %matBcol, align 4, !tbaa !8 + %conv52 = sitofp i32 %50 to double + %mul53 = fmul fast double %mul51, %conv52 + %51 = load i32* %matAcol, align 4, !tbaa !8 + %conv54 = sitofp i32 %51 to double + %mul55 = fmul fast double %mul53, %conv54 + %div = fdiv fast double %mul55, %call48 + %div56 = fmul double %div, 1.000000e-09 + %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, 
double %div56) #1 + %52 = bitcast %"class.std::basic_ostream"* %call.i to i8** + %vtable.i = load i8** %52, align 8, !tbaa !6 + %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 + %53 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64* %53, align 8 + %54 = bitcast %"class.std::basic_ostream"* %call.i to i8* + %add.ptr.sum.i = add i64 %vbase.offset.i, 240 + %_M_ctype.i.i = getelementptr inbounds i8* %54, i64 %add.ptr.sum.i + %55 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"** + %56 = load %"class.std::ctype"** %55, align 8, !tbaa !7 + %tobool.i.i.i = icmp eq %"class.std::ctype"* %56, null + br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + +if.then.i.i.i: ; preds = %if.end45 + call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end45 + %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 6 + %57 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !4 + %tobool.i3.i.i = icmp eq i8 %57, 0 + br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i + +if.then.i4.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 7, i64 10 + %58 = load i8* %arrayidx.i.i.i, align 1, !tbaa !4 + br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + +if.end.i.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %56) #1 + %59 = bitcast %"class.std::ctype"* %56 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %59, align 8, !tbaa !6 + %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6 + %60 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8 + %call.i.i.i = call signext i8 %60(%"class.std::ctype"* %56, i8 signext 10) #1 + br label 
%_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + +_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i + %retval.0.i.i.i = phi i8 [ %58, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ] + %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1 + %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1 + call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1 + %61 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + %tobool.i.i.i.i78 = icmp eq float* %61, null + br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79 + +if.then.i.i.i.i79: ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + %62 = bitcast float* %61 to i8* + call void @_ZdlPv(i8* %62) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit80 + +_ZNSt6vectorIfSaIfEED1Ev.exit80: ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + %63 = load float** %_M_start.i.i82, align 8, !tbaa !7 + %tobool.i.i.i.i74 = icmp eq float* %63, null + br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75 + +if.then.i.i.i.i75: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit80 + %64 = bitcast float* %63 to i8* + call void @_ZdlPv(i8* %64) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit76 + +_ZNSt6vectorIfSaIfEED1Ev.exit76: ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80 + %65 = load float** %_M_start.i.i, align 8, !tbaa !7 + %tobool.i.i.i.i = icmp eq float* %65, null + br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i + +if.then.i.i.i.i: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76 + %66 = bitcast float* %65 to i8* + call void @_ZdlPv(i8* %66) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit + +_ZNSt6vectorIfSaIfEED1Ev.exit: ; preds = %if.then.i.i.i.i, 
%_ZNSt6vectorIfSaIfEED1Ev.exit76 + call void @llvm.lifetime.end(i64 800, i8* %0) #1 + ret i32 0 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 + +declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0 + +; Function Attrs: noreturn nounwind +declare void @exit(i32) #4 + +declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0 + +declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0 + +declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 + +declare void @llvm_visc_track_mem(i8*, i64) #0 + +declare void @llvm_visc_request_mem(i8*, i64) #0 + +declare void @llvm_visc_untrack_mem(i8*) #0 + +declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 + +declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0 + +declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0 + +declare void @pb_FreeParameters(%struct.pb_Parameters*) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #1 + +declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0 + +; Function Attrs: noreturn +declare void @_ZSt17__throw_bad_allocv() #5 + +declare noalias i8* @_Znwm(i64) #0 + +; Function Attrs: nounwind +declare void @_ZdlPv(i8*) #6 + +declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0 + +declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0 + +; Function Attrs: noreturn +declare void @_ZSt16__throw_bad_castv() #5 + +declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0 + +declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0 + +; Function Attrs: nounwind +define internal void @_GLOBAL__I_a() #1 section ".text.startup" { +entry: + tail call void 
@_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1 + %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1 + ret void +} + +; Function Attrs: nounwind +declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 + +; Function Attrs: nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 + +declare i8* @llvm_visc_initializeTimerSet() + +declare void @llvm_visc_switchToTimer(i8**, i32) + +declare void @llvm_visc_printTimerSet(i8**, i8*) + +; Function Attrs: nounwind +declare i8* @llvm.visc.getNode() #1 + +; Function Attrs: nounwind +declare i8* @llvm.visc.getParentNode(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, 
i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.bind.input(i8*, i32, i32) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), i32 %dimX1, i32 %dimY2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* 
%_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.launch(i8*, i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.wait(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.init() #1 + +; Function Attrs: nounwind +declare void @llvm.visc.cleanup() #1 + +attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #7 = { noreturn nounwind } + +!visc_hint_gpu = !{!0, !1} +!visc_hint_cpu = !{!2} + +!0 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff} +!1 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1} +!2 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2} +!3 = metadata !{metadata !"float", metadata !4} +!4 = metadata !{metadata !"omnipotent char", metadata !5} +!5 = metadata !{metadata !"Simple C/C++ TBAA"} +!6 = metadata !{metadata !"vtable pointer", metadata !5} +!7 = metadata !{metadata !"any pointer", metadata !4} +!8 = metadata !{metadata !"int", metadata !4} +!9 = metadata !{metadata !"branch_weights", i32 4, i32 64} diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile new file mode 100644 index 0000000000..f74ee8921a --- /dev/null +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile @@ -0,0 +1,8 @@ +# (c) 2010 The Board of Trustees of the University of Illinois. 
+ +LANGUAGE=visc +SRCDIR_OBJS=io.ll #compute_gold.o +VISC_OBJS=main.visc.ll +APP_CUDALDFLAGS=-lm -lstdc++ +APP_CFLAGS=-ffast-math -O3 +APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc similarity index 91% rename from hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc rename to hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc index 8fbc45e08a..a4c252d8f1 100644 --- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc @@ -10,7 +10,6 @@ * Main entry of dense matrix-matrix multiplication kernel */ -#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -20,6 +19,7 @@ #include <string.h> #include <sys/time.h> #include <vector> +#include <visc.h> // I/O routines extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, @@ -42,8 +42,8 @@ extern char *readFile(const char *); void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k, float alpha, float beta) { - __hpvm__hint(hpvm::SPIR_TARGET); - __hpvm__attributes(3, A, B, C, 1, C); + __visc__hint(visc::SPIR_TARGET); + __visc__attributes(3, A, B, C, 1, C); float c[TILE_N]; for (int i = 0; i < TILE_N; i++) @@ -135,10 +135,10 @@ __attribute__((noinline)) void 
basicSgemm(char transa, char transb, int m, unsigned db[2] = {TILE_N, TILE_TB_HEIGHT}; unsigned dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N}; - void *sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], + void *sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0], dg[1] / db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - __hpvm__wait(sgemmDFG); + __visc__wait(sgemmDFG); } int main(int argc, char *argv[]) { @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) { readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // copy A to device memory @@ -181,10 +181,10 @@ int main(int argc, char *argv[]) { // OpenCL memory allocation std::vector<float> matC(matArow * matBcol); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); - llvm_hpvm_track_mem(&matA.front(), A_sz); - llvm_hpvm_track_mem(&matBT.front(), B_sz); - llvm_hpvm_track_mem(&matC.front(), C_sz); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(&matA.front(), A_sz); + llvm_visc_track_mem(&matBT.front(), B_sz); + llvm_visc_track_mem(&matC.front(), C_sz); // Copy A and B^T into device memory pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -200,16 +200,16 @@ int main(int argc, char *argv[]) { matArow); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(&matC.front(), C_sz); + llvm_visc_request_mem(&matC.front(), C_sz); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(&matA.front()); - llvm_hpvm_untrack_mem(&matBT.front()); - llvm_hpvm_untrack_mem(&matC.front()); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); + llvm_visc_untrack_mem(&matA.front()); + llvm_visc_untrack_mem(&matBT.front()); + llvm_visc_untrack_mem(&matC.front()); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + 
__visc__cleanup(); if (params->outFile) { diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll new file mode 100644 index 0000000000..b997cf7ebc --- /dev/null +++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll @@ -0,0 +1,889 @@ +; ModuleID = 'build/visc_vec_opt_default/main.ll' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"class.std::ios_base::Init" = type { i8 } +%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" } +%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* } +%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" } +%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 } +%"struct.std::ios_base::_Words" = type { i8*, i64 } +%"class.std::locale" = type { %"class.std::locale::_Impl"* } +%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** } +%"class.std::locale::facet" = type { i32 (...)**, i32 } +%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" } +%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 } +%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] } +%struct.__locale_data = type opaque 
+%"class.std::num_put" = type { %"class.std::locale::facet" } +%"class.std::num_get" = type { %"class.std::locale::facet" } +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%rtype = type {} +%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }> +%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] } +%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } +%struct.pb_Timer = type { i32, i64, i64 } +%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* } +%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" } +%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* } +%struct.pb_Parameters = type { i8*, i8** } + +@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 +@__dso_handle = external global i8 +@_ZSt4cerr = external global %"class.std::basic_ostream" +@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1 +@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1 +@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. 
m should be multiple of \00", align 1 +@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1 +@stderr = external global %struct._IO_FILE* +@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1 +@_ZSt4cout = external global %"class.std::basic_ostream" +@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1 +@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] +@viscTimerSet_GenVISC = common global i8* null +@0 = internal constant [14 x i8] c"GenVISC_Timer\00" + +declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0 + +declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0 + +; Function Attrs: nounwind +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode() + %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + +; %call = call i32 @get_local_id(i32 1) #2 + %call = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + +; %call1 = call i32 @get_local_size(i32 0) #2 + %call1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + + %mul = mul i32 %call1, %call + +; %call2 = call i32 @get_local_id(i32 0) #2 + %call2 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node) + + %add = add i32 %mul, %call2 + +; %call3 = call i32 @get_group_id(i32 0) #2 + %call3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) + + %mul4 = shl i32 %call3, 6 + %add5 = add i32 %add, %mul4 + %cmp89 = icmp sgt i32 %k, 0 + + %call6 = 
call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) + %mul7 = shl i32 %call6, 3 + + br i1 %cmp89, label %for.body, label %for.end23 + +for.body: ; preds = %entry, %for.inc21 + %cp.091 = phi <8 x float> [ %add20, %for.inc21 ], [ zeroinitializer, %entry ] + %i.090 = phi i32 [ %add22, %for.inc21 ], [ 0, %entry ] +; %call6 = call i32 @get_group_id(i32 1) #2 +; %call6 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) +; %mul7 = shl i32 %call6, 3 + br label %for.body12 + +for.body12: ; preds = %for.body12, %for.body + %cp.188 = phi <8 x float> [ %cp.091, %for.body ], [ %add20, %for.body12 ] + %j.087 = phi i32 [ 0, %for.body ], [ %inc, %for.body12 ] + %add13 = add i32 %j.087, %i.090 + %mul14 = mul nsw i32 %add13, %lda + %add15 = add nsw i32 %mul14, %add5 + %arrayidx = getelementptr inbounds float* %A, i32 %add15 + %0 = load float* %arrayidx, align 4, !tbaa !3 + %splat.splatinsert = insertelement <8 x float> undef, float %0, i32 0 + %splat.splat = shufflevector <8 x float> %splat.splatinsert, <8 x float> undef, <8 x i32> zeroinitializer + %tmp83 = mul i32 %add13, %ldb + %add.ptr.sum = add i32 %tmp83, %mul7 + %add.ptr17 = getelementptr inbounds float* %B, i32 %add.ptr.sum + +; %call18 = call <8 x float> @_Z6vload8jPKU3AS1f(i32 0, float* %add.ptr17) #2 + %add.ptr17.cast = bitcast float* %add.ptr17 to <8 x float>* + %call18 = load <8 x float>* %add.ptr17.cast, align 8 + + %mul19 = fmul fast <8 x float> %call18, %splat.splat + %add20 = fadd fast <8 x float> %cp.188, %mul19 + %inc = add nsw i32 %j.087, 1 + %exitcond92 = icmp eq i32 %inc, 8 + br i1 %exitcond92, label %for.inc21, label %for.body12 + +for.inc21: ; preds = %for.body12 + %add22 = add nsw i32 %i.090, 8 + %cmp = icmp slt i32 %add22, %k + br i1 %cmp, label %for.body, label %for.end23 + +for.end23: ; preds = %for.inc21, %entry + %cp.0.lcssa = phi <8 x float> [ zeroinitializer, %entry ], [ %add20, %for.inc21 ] + %splat.splatinsert24 = insertelement <8 x 
float> undef, float %alpha, i32 0 + %splat.splat25 = shufflevector <8 x float> %splat.splatinsert24, <8 x float> undef, <8 x i32> zeroinitializer + %mul26 = fmul fast <8 x float> %splat.splat25, %cp.0.lcssa + %1 = extractelement <8 x float> %mul26, i32 0 + %2 = extractelement <8 x float> %mul26, i32 1 + %3 = extractelement <8 x float> %mul26, i32 2 + %4 = extractelement <8 x float> %mul26, i32 3 + %5 = extractelement <8 x float> %mul26, i32 4 + %6 = extractelement <8 x float> %mul26, i32 5 + %7 = extractelement <8 x float> %mul26, i32 6 + %8 = extractelement <8 x float> %mul26, i32 7 +; %call35 = call i32 @get_group_id(i32 1) #2 +; %call35 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode) + %mul37 = shl i32 %call6, 3 + %tmp85 = mul i32 %mul37, %ldc + %add44 = add i32 %tmp85, %add5 + %arrayidx45 = getelementptr inbounds float* %C, i32 %add44 + %9 = load float* %arrayidx45, align 4, !tbaa !3 + %mul46 = fmul fast float %9, %beta + %add48 = fadd fast float %mul46, %1 + store float %add48, float* %arrayidx45, align 4, !tbaa !3 + %tmp84.193 = or i32 %mul37, 1 + %tmp85.1 = mul i32 %tmp84.193, %ldc + %add44.1 = add i32 %tmp85.1, %add5 + %arrayidx45.1 = getelementptr inbounds float* %C, i32 %add44.1 + %10 = load float* %arrayidx45.1, align 4, !tbaa !3 + %mul46.1 = fmul fast float %10, %beta + %add48.1 = fadd fast float %mul46.1, %2 + store float %add48.1, float* %arrayidx45.1, align 4, !tbaa !3 + %tmp84.294 = or i32 %mul37, 2 + %tmp85.2 = mul i32 %tmp84.294, %ldc + %add44.2 = add i32 %tmp85.2, %add5 + %arrayidx45.2 = getelementptr inbounds float* %C, i32 %add44.2 + %11 = load float* %arrayidx45.2, align 4, !tbaa !3 + %mul46.2 = fmul fast float %11, %beta + %add48.2 = fadd fast float %mul46.2, %3 + store float %add48.2, float* %arrayidx45.2, align 4, !tbaa !3 + %tmp84.395 = or i32 %mul37, 3 + %tmp85.3 = mul i32 %tmp84.395, %ldc + %add44.3 = add i32 %tmp85.3, %add5 + %arrayidx45.3 = getelementptr inbounds float* %C, i32 %add44.3 + %12 = load 
float* %arrayidx45.3, align 4, !tbaa !3 + %mul46.3 = fmul fast float %12, %beta + %add48.3 = fadd fast float %mul46.3, %4 + store float %add48.3, float* %arrayidx45.3, align 4, !tbaa !3 + %tmp84.496 = or i32 %mul37, 4 + %tmp85.4 = mul i32 %tmp84.496, %ldc + %add44.4 = add i32 %tmp85.4, %add5 + %arrayidx45.4 = getelementptr inbounds float* %C, i32 %add44.4 + %13 = load float* %arrayidx45.4, align 4, !tbaa !3 + %mul46.4 = fmul fast float %13, %beta + %add48.4 = fadd fast float %mul46.4, %5 + store float %add48.4, float* %arrayidx45.4, align 4, !tbaa !3 + %tmp84.597 = or i32 %mul37, 5 + %tmp85.5 = mul i32 %tmp84.597, %ldc + %add44.5 = add i32 %tmp85.5, %add5 + %arrayidx45.5 = getelementptr inbounds float* %C, i32 %add44.5 + %14 = load float* %arrayidx45.5, align 4, !tbaa !3 + %mul46.5 = fmul fast float %14, %beta + %add48.5 = fadd fast float %mul46.5, %6 + store float %add48.5, float* %arrayidx45.5, align 4, !tbaa !3 + %tmp84.698 = or i32 %mul37, 6 + %tmp85.6 = mul i32 %tmp84.698, %ldc + %add44.6 = add i32 %tmp85.6, %add5 + %arrayidx45.6 = getelementptr inbounds float* %C, i32 %add44.6 + %15 = load float* %arrayidx45.6, align 4, !tbaa !3 + %mul46.6 = fmul fast float %15, %beta + %add48.6 = fadd fast float %mul46.6, %7 + store float %add48.6, float* %arrayidx45.6, align 4, !tbaa !3 + %tmp84.799 = or i32 %mul37, 7 + %tmp85.7 = mul i32 %tmp84.799, %ldc + %add44.7 = add i32 %tmp85.7, %add5 + %arrayidx45.7 = getelementptr inbounds float* %C, i32 %add44.7 + %16 = load float* %arrayidx45.7, align 4, !tbaa !3 + %mul46.7 = fmul fast float %16, %beta + %add48.7 = fadd fast float %mul46.7, %8 + store float %add48.7, float* %arrayidx45.7, align 4, !tbaa !3 + + + ret %rtype undef +} + +; Function Attrs: noinline nounwind uwtable +define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 { +entry: + 
switch i8 %transa, label %if.then [ + i8 78, label %if.end + i8 110, label %if.end + ] + +if.then: ; preds = %entry + %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1 + %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6 + %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 + %0 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64* %0, align 8 + %add.ptr.i.sum = add i64 %vbase.offset.i, 240 + %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum + %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"** + %2 = load %"class.std::ctype"** %1, align 8, !tbaa !7 + %tobool.i93 = icmp eq %"class.std::ctype"* %2, null + br i1 %tobool.i93, label %if.then.i94, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + +if.then.i94: ; preds = %if.then + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit: ; preds = %if.then + %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6 + %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !4 + %tobool.i = icmp eq i8 %3, 0 + br i1 %tobool.i, label %if.end.i, label %if.then.i + +if.then.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10 + %4 = load i8* %arrayidx.i, align 1, !tbaa !4 + br label %_ZNKSt5ctypeIcE5widenEc.exit + +if.end.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1 + %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i67 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !6 + %vfn.i = getelementptr inbounds i8 
(%"class.std::ctype"*, i8)** %vtable.i67, i64 6 + %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8 + %call.i68 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit + +_ZNKSt5ctypeIcE5widenEc.exit: ; preds = %if.end.i, %if.then.i + %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i68, %if.end.i ] + %call1.i43 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1 + %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i43) #1 + br label %return + +if.end: ; preds = %entry, %entry + switch i8 %transb, label %if.then9 [ + i8 84, label %if.end12 + i8 116, label %if.end12 + ] + +if.then9: ; preds = %if.end + %call1.i45 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1 + %vtable.i47 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6 + %vbase.offset.ptr.i48 = getelementptr i8* %vtable.i47, i64 -24 + %7 = bitcast i8* %vbase.offset.ptr.i48 to i64* + %vbase.offset.i49 = load i64* %7, align 8 + %add.ptr.i50.sum = add i64 %vbase.offset.i49, 240 + %_M_ctype.i69 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i50.sum + %8 = bitcast i8* %_M_ctype.i69 to %"class.std::ctype"** + %9 = load %"class.std::ctype"** %8, align 8, !tbaa !7 + %tobool.i96 = icmp eq %"class.std::ctype"* %9, null + br i1 %tobool.i96, label %if.then.i97, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99 + +if.then.i97: ; preds = %if.then9 + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99: ; preds = %if.then9 + %_M_widen_ok.i71 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6 + %10 = load 
i8* %_M_widen_ok.i71, align 1, !tbaa !4 + %tobool.i72 = icmp eq i8 %10, 0 + br i1 %tobool.i72, label %if.end.i78, label %if.then.i74 + +if.then.i74: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99 + %arrayidx.i73 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10 + %11 = load i8* %arrayidx.i73, align 1, !tbaa !4 + br label %_ZNKSt5ctypeIcE5widenEc.exit80 + +if.end.i78: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99 + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1 + %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i75 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !6 + %vfn.i76 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i75, i64 6 + %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i76, align 8 + %call.i77 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit80 + +_ZNKSt5ctypeIcE5widenEc.exit80: ; preds = %if.end.i78, %if.then.i74 + %retval.0.i79 = phi i8 [ %11, %if.then.i74 ], [ %call.i77, %if.end.i78 ] + %call1.i52 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i79) #1 + %call.i53 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i52) #1 + br label %return + +if.end12: ; preds = %if.end, %if.end + %rem40 = and i32 %m, 63 + %tobool = icmp eq i32 %rem40, 0 + br i1 %tobool, label %lor.lhs.false, label %if.then15 + +lor.lhs.false: ; preds = %if.end12 + %rem1341 = and i32 %n, 7 + %tobool14 = icmp eq i32 %rem1341, 0 + br i1 %tobool14, label %if.end21, label %if.then15 + +if.then15: ; preds = %lor.lhs.false, %if.end12 + %call1.i55 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1 + 
%call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 64) #1 + %call1.i57 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1 + %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 8) #1 + %14 = bitcast %"class.std::basic_ostream"* %call19 to i8** + %vtable.i59 = load i8** %14, align 8, !tbaa !6 + %vbase.offset.ptr.i60 = getelementptr i8* %vtable.i59, i64 -24 + %15 = bitcast i8* %vbase.offset.ptr.i60 to i64* + %vbase.offset.i61 = load i64* %15, align 8 + %16 = bitcast %"class.std::basic_ostream"* %call19 to i8* + %add.ptr.i62.sum = add i64 %vbase.offset.i61, 240 + %_M_ctype.i81 = getelementptr inbounds i8* %16, i64 %add.ptr.i62.sum + %17 = bitcast i8* %_M_ctype.i81 to %"class.std::ctype"** + %18 = load %"class.std::ctype"** %17, align 8, !tbaa !7 + %tobool.i100 = icmp eq %"class.std::ctype"* %18, null + br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + +if.then.i101: ; preds = %if.then15 + tail call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then15 + %_M_widen_ok.i83 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6 + %19 = load i8* %_M_widen_ok.i83, align 1, !tbaa !4 + %tobool.i84 = icmp eq i8 %19, 0 + br i1 %tobool.i84, label %if.end.i90, label %if.then.i86 + +if.then.i86: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + %arrayidx.i85 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10 + %20 = load i8* %arrayidx.i85, align 1, !tbaa !4 + br label %_ZNKSt5ctypeIcE5widenEc.exit92 + +if.end.i90: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103 + tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* 
%18) #1 + %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i87 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !6 + %vfn.i88 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i87, i64 6 + %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i88, align 8 + %call.i89 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1 + br label %_ZNKSt5ctypeIcE5widenEc.exit92 + +_ZNKSt5ctypeIcE5widenEc.exit92: ; preds = %if.end.i90, %if.then.i86 + %retval.0.i91 = phi i8 [ %20, %if.then.i86 ], [ %call.i89, %if.end.i90 ] + %call1.i64 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i91) #1 + %call.i65 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i64) #1 + br label %return + +if.end21: ; preds = %lor.lhs.false + %mul = shl nsw i32 %m, 3 + %div = sdiv i32 %mul, 64 + %div27 = lshr i32 %div, 3 + %div30 = lshr i32 %n, 3 + %conv31 = fpext float %alpha to double + %conv32 = fpext float %beta to double + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19) + %in.addr = alloca %struct.arg + %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 + store float* %A, float** %in.addr.A + %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 + store i64 %bytesA, i64* %in.addr.bytes_A + %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2 + store i32 %lda, i32* %in.addr.lda + %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 + store float* %B, float** %in.addr.B + %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4 + store i64 %bytesB, i64* %in.addr.bytes_B + %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5 + store i32 %ldb, i32* %in.addr.ldb + %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6 + store float* %C, float** %in.addr.C + %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7 + 
store i64 %bytesC, i64* %in.addr.bytes_C + %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8 + store i32 %ldc, i32* %in.addr.ldc + %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9 + store i32 %k, i32* %in.addr.k + %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10 + %in.addr.alpha.cast = fptrunc double %conv31 to float + store float %in.addr.alpha.cast, float* %in.addr.alpha + %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11 + %in.addr.beta.cast = fptrunc double %conv32 to float + store float %in.addr.beta.cast, float* %in.addr.beta + %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12 + store i32 8, i32* %in.addr.dimX0 + %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13 + store i32 8, i32* %in.addr.dimY0 + %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14 + store i32 %div27, i32* %in.addr.dimX1 + %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15 + store i32 %div30, i32* %in.addr.dimY1 + %args = bitcast %struct.arg* %in.addr to i8* + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args) + call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2) + br label %return + +return: ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit92, %_ZNKSt5ctypeIcE5widenEc.exit80, %_ZNKSt5ctypeIcE5widenEc.exit + ret void +} + +declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0 + +; Function Attrs: nounwind uwtable +define i32 @main(i32 %argc, i8** %argv) #2 { +entry: + %argc.addr = alloca i32, align 4 + %timers = alloca %struct.pb_TimerSet, align 8 + %matArow = alloca i32, align 4 + %matAcol = alloca i32, align 4 + %matBrow = alloca i32, 
align 4 + %matBcol = alloca i32, align 4 + %matA = alloca %"class.std::vector", align 8 + %matBT = alloca %"class.std::vector", align 8 + %matC = alloca %"class.std::vector", align 8 + store i32 %argc, i32* %argc.addr, align 4, !tbaa !8 + %0 = bitcast %struct.pb_TimerSet* %timers to i8* + call void @llvm.lifetime.start(i64 800, i8* %0) #1 + %1 = bitcast %"class.std::vector"* %matA to i8* + call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1 + %2 = bitcast %"class.std::vector"* %matBT to i8* + call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1 + %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1 + %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1 + %3 = load i8*** %inpFiles, align 8, !tbaa !7 + %4 = load i8** %3, align 8, !tbaa !7 + %cmp = icmp eq i8* %4, null + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %arrayidx2 = getelementptr inbounds i8** %3, i64 1 + %5 = load i8** %arrayidx2, align 8, !tbaa !7 + %cmp3 = icmp eq i8* %5, null + br i1 %cmp3, label %if.then, label %lor.lhs.false4 + +lor.lhs.false4: ; preds = %lor.lhs.false + %arrayidx6 = getelementptr inbounds i8** %3, i64 2 + %6 = load i8** %arrayidx6, align 8, !tbaa !7 + %cmp7 = icmp eq i8* %6, null + br i1 %cmp7, label %if.then, label %lor.lhs.false8 + +lor.lhs.false8: ; preds = %lor.lhs.false4 + %arrayidx10 = getelementptr inbounds i8** %3, i64 3 + %7 = load i8** %arrayidx10, align 8, !tbaa !7 + %cmp11 = icmp eq i8* %7, null + br i1 %cmp11, label %if.end, label %if.then + +if.then: ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry + %8 = load %struct._IO_FILE** @stderr, align 8, !tbaa !7 + %9 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %8) + call void @exit(i32 -1) #7 + unreachable + +if.end: ; preds = %lor.lhs.false8 + %call15 = call zeroext i1 
@_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %4, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1 + %10 = load i8*** %inpFiles, align 8, !tbaa !7 + %arrayidx17 = getelementptr inbounds i8** %10, i64 2 + %11 = load i8** %arrayidx17, align 8, !tbaa !7 + %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %11, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1 + call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 + %12 = call i8* @llvm_visc_initializeTimerSet() + store i8* %12, i8** @viscTimerSet_GenVISC + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + call void @llvm.visc.init() + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + %13 = load i32* %matArow, align 4, !tbaa !8 + %14 = load i32* %matAcol, align 4, !tbaa !8 + %mul = mul nsw i32 %14, %13 + %conv = sext i32 %mul to i64 + %mul19 = shl nsw i64 %conv, 2 + %15 = load i32* %matBrow, align 4, !tbaa !8 + %16 = load i32* %matBcol, align 4, !tbaa !8 + %mul20 = mul nsw i32 %16, %15 + %conv21 = sext i32 %mul20 to i64 + %mul22 = shl nsw i64 %conv21, 2 + %mul23 = mul nsw i32 %16, %13 + %conv24 = sext i32 %mul23 to i64 + %mul25 = shl nsw i64 %conv24, 2 + %17 = bitcast %"class.std::vector"* %matC to i8* + call void @llvm.memset.p0i8.i64(i8* %17, i8 0, i64 24, i32 8, i1 false) #1 + %cmp.i.i.i.i = icmp eq i32 %mul23, 0 + br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i + +cond.true.i.i.i.i: ; preds = %if.end + %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !9 + +if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i + call void @_ZSt17__throw_bad_allocv() #7 + unreachable + +_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i + %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1 + %18 = bitcast i8* 
%call2.i.i.i.i.i to float* + br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + +_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i: ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end + %cond.i.i.i.i = phi float* [ %18, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ] + %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0 + store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !7 + %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1 + store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7 + %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24 + %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2 + store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !7 + br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i + +for.body.lr.ph.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7 + %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i + %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24 + %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i + br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i + +vector.body.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i + %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ] + %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i + %19 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>* + store <4 x float> zeroinitializer, <4 x float>* %19, 
align 4 + %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4 + %20 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i + %21 = bitcast float* %20 to <4 x float>* + store <4 x float> zeroinitializer, <4 x float>* %21, align 4 + %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8 + %22 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i + br i1 %22, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i + +middle.block.i.i.i.i.i.i.i.i: ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i + %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ] + %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24 + br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader + +for.body.i.i.i.i.i.i.i.i.preheader: ; preds = %middle.block.i.i.i.i.i.i.i.i + %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8* + %23 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2 + call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %23, i32 4, i1 false) + br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + +_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit: ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i + store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7 + %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0 + %24 = load float** %_M_start.i.i, align 8, 
!tbaa !7 + %25 = bitcast float* %24 to i8* + call void @llvm_visc_track_mem(i8* %25, i64 %mul19) #1 + %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0 + %26 = load float** %_M_start.i.i82, align 8, !tbaa !7 + %27 = bitcast float* %26 to i8* + call void @llvm_visc_track_mem(i8* %27, i64 %mul22) #1 + %28 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + %29 = bitcast float* %28 to i8* + call void @llvm_visc_track_mem(i8* %29, i64 %mul25) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + %30 = load float** %_M_finish.i.i.i, align 8, !tbaa !7 + %31 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + %cmp3399 = icmp eq float* %30, %31 + br i1 %cmp3399, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + %sub.ptr.lhs.cast.i = ptrtoint float* %30 to i64 + %sub.ptr.rhs.cast.i = ptrtoint float* %31 to i64 + %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i + %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %add.ptr.i = getelementptr inbounds float* %31, i64 %i.0100 + store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !3 + %inc = add i64 %i.0100, 1 + %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i + br i1 %cmp33, label %for.body, label %for.end + +for.end: ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + %32 = load i32* %matArow, align 4, !tbaa !8 + %33 = load i32* %matBcol, align 4, !tbaa !8 + %34 = load i32* %matAcol, align 4, !tbaa !8 + %35 = load float** %_M_start.i.i, align 8, !tbaa !7 + %36 = load float** %_M_start.i.i82, align 8, !tbaa !7 + %37 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %32, i32 %33, i32 %34, 
float 1.000000e+00, float* %35, i64 %mul19, i32 %32, float* %36, i64 %mul22, i32 %33, float 0.000000e+00, float* %37, i64 %mul25, i32 %32) + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1 + %38 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + %39 = bitcast float* %38 to i8* + call void @llvm_visc_request_mem(i8* %39, i64 %mul25) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1 + %40 = load float** %_M_start.i.i, align 8, !tbaa !7 + %41 = bitcast float* %40 to i8* + call void @llvm_visc_untrack_mem(i8* %41) #1 + %42 = load float** %_M_start.i.i82, align 8, !tbaa !7 + %43 = bitcast float* %42 to i8* + call void @llvm_visc_untrack_mem(i8* %43) #1 + %44 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + %45 = bitcast float* %44 to i8* + call void @llvm_visc_untrack_mem(i8* %45) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 + %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0 + call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr) + call void @llvm.visc.cleanup() + %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0 + %46 = load i8** %outFile, align 8, !tbaa !7 + %tobool = icmp eq i8* %46, null + br i1 %tobool, label %if.end45, label %if.then42 + +if.then42: ; preds = %for.end + %47 = load i32* %matArow, align 4, !tbaa !8 + %48 = load i32* %matBcol, align 4, !tbaa !8 + %call44 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %46, i32 %47, i32 %48, %"class.std::vector"* %matC) #1 + br label %if.end45 + +if.end45: ; preds = %if.then42, %for.end + %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2 + %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1 + %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* 
@_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1 + %49 = load i32* %matArow, align 4, !tbaa !8 + %conv50 = sitofp i32 %49 to double + %mul51 = fmul fast double %conv50, 2.000000e+00 + %50 = load i32* %matBcol, align 4, !tbaa !8 + %conv52 = sitofp i32 %50 to double + %mul53 = fmul fast double %mul51, %conv52 + %51 = load i32* %matAcol, align 4, !tbaa !8 + %conv54 = sitofp i32 %51 to double + %mul55 = fmul fast double %mul53, %conv54 + %div = fdiv fast double %mul55, %call48 + %div56 = fmul double %div, 1.000000e-09 + %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1 + %52 = bitcast %"class.std::basic_ostream"* %call.i to i8** + %vtable.i = load i8** %52, align 8, !tbaa !6 + %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24 + %53 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64* %53, align 8 + %54 = bitcast %"class.std::basic_ostream"* %call.i to i8* + %add.ptr.sum.i = add i64 %vbase.offset.i, 240 + %_M_ctype.i.i = getelementptr inbounds i8* %54, i64 %add.ptr.sum.i + %55 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"** + %56 = load %"class.std::ctype"** %55, align 8, !tbaa !7 + %tobool.i.i.i = icmp eq %"class.std::ctype"* %56, null + br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + +if.then.i.i.i: ; preds = %if.end45 + call void @_ZSt16__throw_bad_castv() #7 + unreachable + +_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end45 + %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 6 + %57 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !4 + %tobool.i3.i.i = icmp eq i8 %57, 0 + br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i + +if.then.i4.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 7, i64 10 + %58 = load i8* 
%arrayidx.i.i.i, align 1, !tbaa !4 + br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + +if.end.i.i.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i + call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %56) #1 + %59 = bitcast %"class.std::ctype"* %56 to i8 (%"class.std::ctype"*, i8)*** + %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %59, align 8, !tbaa !6 + %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6 + %60 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8 + %call.i.i.i = call signext i8 %60(%"class.std::ctype"* %56, i8 signext 10) #1 + br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + +_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i + %retval.0.i.i.i = phi i8 [ %58, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ] + %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1 + %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1 + call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1 + %61 = load float** %_M_start.i.i.i81, align 8, !tbaa !7 + %tobool.i.i.i.i78 = icmp eq float* %61, null + br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79 + +if.then.i.i.i.i79: ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + %62 = bitcast float* %61 to i8* + call void @_ZdlPv(i8* %62) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit80 + +_ZNSt6vectorIfSaIfEED1Ev.exit80: ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit + %63 = load float** %_M_start.i.i82, align 8, !tbaa !7 + %tobool.i.i.i.i74 = icmp eq float* %63, null + br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75 + +if.then.i.i.i.i75: ; preds = 
%_ZNSt6vectorIfSaIfEED1Ev.exit80 + %64 = bitcast float* %63 to i8* + call void @_ZdlPv(i8* %64) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit76 + +_ZNSt6vectorIfSaIfEED1Ev.exit76: ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80 + %65 = load float** %_M_start.i.i, align 8, !tbaa !7 + %tobool.i.i.i.i = icmp eq float* %65, null + br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i + +if.then.i.i.i.i: ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76 + %66 = bitcast float* %65 to i8* + call void @_ZdlPv(i8* %66) #1 + br label %_ZNSt6vectorIfSaIfEED1Ev.exit + +_ZNSt6vectorIfSaIfEED1Ev.exit: ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit76 + call void @llvm.lifetime.end(i64 800, i8* %0) #1 + ret i32 0 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 + +declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0 + +; Function Attrs: noreturn nounwind +declare void @exit(i32) #4 + +declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0 + +declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0 + +declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 + +declare void @llvm_visc_track_mem(i8*, i64) #0 + +declare void @llvm_visc_request_mem(i8*, i64) #0 + +declare void @llvm_visc_untrack_mem(i8*) #0 + +declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 + +declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0 + +declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0 + +declare void @pb_FreeParameters(%struct.pb_Parameters*) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #1 + +declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0 + +; Function Attrs: noreturn +declare void @_ZSt17__throw_bad_allocv() #5 + +declare noalias i8* @_Znwm(i64) #0 + +; Function Attrs: 
nounwind +declare void @_ZdlPv(i8*) #6 + +declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0 + +declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0 + +; Function Attrs: noreturn +declare void @_ZSt16__throw_bad_castv() #5 + +declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0 + +declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0 + +; Function Attrs: nounwind +define internal void @_GLOBAL__I_a() #1 section ".text.startup" { +entry: + tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1 + %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1 + ret void +} + +; Function Attrs: nounwind +declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 + +; Function Attrs: nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 + +declare i8* @llvm_visc_initializeTimerSet() + +declare void @llvm_visc_switchToTimer(i8**, i32) + +declare void @llvm_visc_printTimerSet(i8**, i8*) + +; Function Attrs: nounwind +declare i8* @llvm.visc.getNode() #1 + +; Function Attrs: nounwind +declare i8* @llvm.visc.getParentNode(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 
%dimX, i32 %dimY) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.bind.input(i8*, i32, i32) #1 + +; Function Attrs: nounwind uwtable +define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 { +entry: + %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), 
i32 %dimX1, i32 %dimY2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12) + call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.launch(i8*, i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.wait(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.init() #1 + +; Function Attrs: nounwind +declare void @llvm.visc.cleanup() #1 + +attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { nounwind 
uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #7 = { noreturn nounwind } + +!visc_hint_gpu = !{!0, !1} +!visc_hint_cpu = !{!2} + +!0 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff} +!1 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1} +!2 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2} +!3 = metadata !{metadata !"float", metadata !4} +!4 = metadata !{metadata !"omnipotent char", metadata !5} +!5 = metadata !{metadata !"Simple C/C++ TBAA"} +!6 = metadata !{metadata !"vtable pointer", metadata !5} +!7 = metadata !{metadata !"any pointer", metadata !4} +!8 = metadata !{metadata !"int", 
metadata !4} +!9 = metadata !{metadata !"branch_weights", i32 4, i32 64} diff --git a/hpvm/test/parboil/benchmarks/spmv/Makefile b/hpvm/test/parboil/benchmarks/spmv/Makefile index aff3e54712..23e1d49900 100644 --- a/hpvm/test/parboil/benchmarks/spmv/Makefile +++ b/hpvm/test/parboil/benchmarks/spmv/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = spmv -# Default compile hpvm +# Default compile visc ifeq ($(VERSION),) - VERSION = hpvm + VERSION = visc endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c index 8bff8a1d0a..a19184a965 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c @@ -81,7 +81,7 @@ int main(int argc, char **argv) { input_vec(parameters->inpFiles[1], h_x_vector, dim); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // parameters declaration cl_int clStatus; @@ -127,7 +127,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ /*cl_program clProgram = * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ @@ -217,7 +217,7 @@ int main(int argc, char **argv) { // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is // %d=\n",grid,block); printf("!!! 
dim is %d\n",dim); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -240,7 +240,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int i; - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int j = 0; j < 20; j++) { for (i = 0; i < 50; i++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c index f704f96ed2..d4fc026b73 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c @@ -236,7 +236,7 @@ int main(int argc, char **argv) { // main execution int i; - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); /*for(int j=0; j<20; j++) {*/ for (i = 0; i < 50; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c index a6fe5012f9..42ffab597d 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c @@ -81,7 +81,7 @@ int main(int argc, char **argv) { input_vec(parameters->inpFiles[1], h_x_vector, dim); 
pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // parameters declaration cl_int clStatus; @@ -127,7 +127,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ /*cl_program clProgram = * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ @@ -217,7 +217,7 @@ int main(int argc, char **argv) { // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is // %d=\n",grid,block); printf("!!! dim is %d\n",dim); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -240,7 +240,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int i; - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int j = 0; j < 1; j++) { for (i = 0; i < 50; i++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c index bc3655c4ab..fbd272b32f 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c @@ -81,7 +81,7 @@ int main(int argc, char **argv) { 
input_vec(parameters->inpFiles[1], h_x_vector, dim); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // parameters declaration cl_int clStatus; @@ -127,7 +127,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/ /*cl_program clProgram = * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/ @@ -217,7 +217,7 @@ int main(int argc, char **argv) { // printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is // %d=\n",grid,block); printf("!!! dim is %d\n",dim); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -240,7 +240,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int i; - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int j = 0; j < 20; j++) { for (i = 0; i < 50; i++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c index 88fd0c878b..343814149a 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c @@ -224,7 +224,7 @@ int main(int 
argc, char **argv) { // main execution - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); int i; for (i = 0; i < 50; i++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid, diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c index ca538e3a95..4600a3e6b8 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c @@ -83,7 +83,7 @@ int main(int argc, char **argv) { printf("Col count = %d, dim = %d\n", col_count, dim); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; @@ -137,7 +137,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCLDeviceProp clDeviceProp; clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, @@ -215,7 +215,7 @@ int main(int argc, char **argv) { compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, clDeviceProp.minor, clDeviceProp.multiProcessorCount); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -237,7 +237,7 @@ int main(int argc, char **argv) { // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); int i; for (int j = 0; j < 5; j++) { for (i = 0; i < 50; i++) { @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); 
CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c index 21973c2fa7..d2375af91d 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c +++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c @@ -83,7 +83,7 @@ int main(int argc, char **argv) { printf("Col count = %d, dim = %d\n", col_count, dim); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; @@ -137,7 +137,7 @@ int main(int argc, char **argv) { cl_mem jds_ptr_int; cl_mem sh_zcnt_int; - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); OpenCLDeviceProp clDeviceProp; clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, @@ -215,7 +215,7 @@ int main(int argc, char **argv) { compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major, clDeviceProp.minor, clDeviceProp.multiProcessorCount); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector); CHECK_ERROR("clSetKernelArg") clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data); @@ -237,7 +237,7 @@ int main(int argc, char **argv) { // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); int i; for (int j = 0; j < 100; j++) { for (i = 0; i < 50; i++) { @@ -260,7 +260,7 @@ int main(int argc, char **argv) { dim * sizeof(float), h_Ax_vector, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - 
pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseKernel(clKernel); clStatus = clReleaseProgram(clProgram); diff --git a/hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile b/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile similarity index 88% rename from hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile rename to hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile index 06af6bebea..a289d68f34 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile +++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile @@ -1,9 +1,9 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=hpvm +LANGUAGE=visc TOOLS_SRC=common_src/convert-dataset SRCDIR_OBJS=gpu_info.ll file.ll -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O1 -I$(TOOLS_SRC) APP_CXXFLAGS=-ffast-math -O1 -I$(TOOLS_SRC) diff --git a/hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.cpp rename to hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp diff --git a/hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.h b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.h rename to hpvm/test/parboil/benchmarks/spmv/src/visc/file.h diff --git a/hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.cpp rename to hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp diff --git a/hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.h rename to 
hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h diff --git a/hpvm/test/parboil/benchmarks/spmv/src/hpvm/kernel.cl b/hpvm/test/parboil/benchmarks/spmv/src/visc/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/spmv/src/hpvm/kernel.cl rename to hpvm/test/parboil/benchmarks/spmv/src/visc/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp similarity index 68% rename from hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp rename to hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp index 4414744b49..f6ce5ccfb2 100644 --- a/hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp +++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp @@ -8,11 +8,11 @@ //#include <CL/cl.h> //#include <CL/cl_ext.h> -#include <hpvm.h> #include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <visc.h> #include "convert_dataset.h" #include "file.h" @@ -54,15 +54,15 @@ void spmv_jds(float *dst_vector, size_t bytes_dst_vector, float *d_data, size_t bytes_x_vec, int dim, int *jds_ptr_int, size_t bytes_jds_ptr_int, int *sh_zcnt_int, size_t bytes_sh_zcnt_int) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + __visc__hint(visc::DEVICE); + __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, 1, dst_vector); - void *thisNode = __hpvm__getNode(); - void *parentNode = __hpvm__getParentNode(thisNode); - int lx = __hpvm__getNodeInstanceID_x(thisNode); - int gx = __hpvm__getNodeInstanceID_x(parentNode); - int gridx = __hpvm__getNumNodeInstances_x(thisNode); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int gridx = __visc__getNumNodeInstances_x(thisNode); int ix = gx * gridx + lx; int warp_id = ix >> WARP_BITS; @@ 
-126,25 +126,25 @@ void spmvLvl1(float *dst_vector, size_t bytes_dst_vector, float *d_data, size_t bytes_x_vec, int dim, int *jds_ptr_int, size_t bytes_jds_ptr_int, int *sh_zcnt_int, size_t bytes_sh_zcnt_int, size_t dim_X1) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + __visc__hint(visc::DEVICE); + __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, 1, dst_vector); - void *spmv_node = __hpvm__createNodeND(1, spmv_jds, dim_X1); - __hpvm__bindIn(spmv_node, 0, 0, 0); - __hpvm__bindIn(spmv_node, 1, 1, 0); - __hpvm__bindIn(spmv_node, 2, 2, 0); - __hpvm__bindIn(spmv_node, 3, 3, 0); - __hpvm__bindIn(spmv_node, 4, 4, 0); - __hpvm__bindIn(spmv_node, 5, 5, 0); - __hpvm__bindIn(spmv_node, 6, 6, 0); - __hpvm__bindIn(spmv_node, 7, 7, 0); - __hpvm__bindIn(spmv_node, 8, 8, 0); - __hpvm__bindIn(spmv_node, 9, 9, 0); - __hpvm__bindIn(spmv_node, 10, 10, 0); - __hpvm__bindIn(spmv_node, 11, 11, 0); - __hpvm__bindIn(spmv_node, 12, 12, 0); - __hpvm__bindIn(spmv_node, 13, 13, 0); - __hpvm__bindIn(spmv_node, 14, 14, 0); + void *spmv_node = __visc__createNodeND(1, spmv_jds, dim_X1); + __visc__bindIn(spmv_node, 0, 0, 0); + __visc__bindIn(spmv_node, 1, 1, 0); + __visc__bindIn(spmv_node, 2, 2, 0); + __visc__bindIn(spmv_node, 3, 3, 0); + __visc__bindIn(spmv_node, 4, 4, 0); + __visc__bindIn(spmv_node, 5, 5, 0); + __visc__bindIn(spmv_node, 6, 6, 0); + __visc__bindIn(spmv_node, 7, 7, 0); + __visc__bindIn(spmv_node, 8, 8, 0); + __visc__bindIn(spmv_node, 9, 9, 0); + __visc__bindIn(spmv_node, 10, 10, 0); + __visc__bindIn(spmv_node, 11, 11, 0); + __visc__bindIn(spmv_node, 12, 12, 0); + __visc__bindIn(spmv_node, 13, 13, 0); + __visc__bindIn(spmv_node, 14, 14, 0); } void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data, @@ -153,26 +153,26 @@ void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data, size_t bytes_x_vec, int dim, int *jds_ptr_int, size_t 
bytes_jds_ptr_int, int *sh_zcnt_int, size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + __visc__hint(visc::CPU_TARGET); + __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, 1, dst_vector); - void *spmv_node = __hpvm__createNodeND(1, spmvLvl1, dim_X2); - __hpvm__bindIn(spmv_node, 0, 0, 0); - __hpvm__bindIn(spmv_node, 1, 1, 0); - __hpvm__bindIn(spmv_node, 2, 2, 0); - __hpvm__bindIn(spmv_node, 3, 3, 0); - __hpvm__bindIn(spmv_node, 4, 4, 0); - __hpvm__bindIn(spmv_node, 5, 5, 0); - __hpvm__bindIn(spmv_node, 6, 6, 0); - __hpvm__bindIn(spmv_node, 7, 7, 0); - __hpvm__bindIn(spmv_node, 8, 8, 0); - __hpvm__bindIn(spmv_node, 9, 9, 0); - __hpvm__bindIn(spmv_node, 10, 10, 0); - __hpvm__bindIn(spmv_node, 11, 11, 0); - __hpvm__bindIn(spmv_node, 12, 12, 0); - __hpvm__bindIn(spmv_node, 13, 13, 0); - __hpvm__bindIn(spmv_node, 14, 14, 0); - __hpvm__bindIn(spmv_node, 15, 15, 0); + void *spmv_node = __visc__createNodeND(1, spmvLvl1, dim_X2); + __visc__bindIn(spmv_node, 0, 0, 0); + __visc__bindIn(spmv_node, 1, 1, 0); + __visc__bindIn(spmv_node, 2, 2, 0); + __visc__bindIn(spmv_node, 3, 3, 0); + __visc__bindIn(spmv_node, 4, 4, 0); + __visc__bindIn(spmv_node, 5, 5, 0); + __visc__bindIn(spmv_node, 6, 6, 0); + __visc__bindIn(spmv_node, 7, 7, 0); + __visc__bindIn(spmv_node, 8, 8, 0); + __visc__bindIn(spmv_node, 9, 9, 0); + __visc__bindIn(spmv_node, 10, 10, 0); + __visc__bindIn(spmv_node, 11, 11, 0); + __visc__bindIn(spmv_node, 12, 12, 0); + __visc__bindIn(spmv_node, 13, 13, 0); + __visc__bindIn(spmv_node, 14, 14, 0); + __visc__bindIn(spmv_node, 15, 15, 0); } void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data, @@ -181,27 +181,27 @@ void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data, size_t bytes_x_vec, int dim, int *jds_ptr_int, size_t bytes_jds_ptr_int, int *sh_zcnt_int, size_t 
bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, + __visc__hint(visc::CPU_TARGET); + __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int, 1, dst_vector); - void *spmv_node = __hpvm__createNodeND(1, spmvLvl2, dim_X2); - __hpvm__bindIn(spmv_node, 0, 0, 0); - __hpvm__bindIn(spmv_node, 1, 1, 0); - __hpvm__bindIn(spmv_node, 2, 2, 0); - __hpvm__bindIn(spmv_node, 3, 3, 0); - __hpvm__bindIn(spmv_node, 4, 4, 0); - __hpvm__bindIn(spmv_node, 5, 5, 0); - __hpvm__bindIn(spmv_node, 6, 6, 0); - __hpvm__bindIn(spmv_node, 7, 7, 0); - __hpvm__bindIn(spmv_node, 8, 8, 0); - __hpvm__bindIn(spmv_node, 9, 9, 0); - __hpvm__bindIn(spmv_node, 10, 10, 0); - __hpvm__bindIn(spmv_node, 11, 11, 0); - __hpvm__bindIn(spmv_node, 12, 12, 0); - __hpvm__bindIn(spmv_node, 13, 13, 0); - __hpvm__bindIn(spmv_node, 14, 14, 0); - __hpvm__bindIn(spmv_node, 15, 15, 0); - __hpvm__bindIn(spmv_node, 16, 16, 0); + void *spmv_node = __visc__createNodeND(1, spmvLvl2, dim_X2); + __visc__bindIn(spmv_node, 0, 0, 0); + __visc__bindIn(spmv_node, 1, 1, 0); + __visc__bindIn(spmv_node, 2, 2, 0); + __visc__bindIn(spmv_node, 3, 3, 0); + __visc__bindIn(spmv_node, 4, 4, 0); + __visc__bindIn(spmv_node, 5, 5, 0); + __visc__bindIn(spmv_node, 6, 6, 0); + __visc__bindIn(spmv_node, 7, 7, 0); + __visc__bindIn(spmv_node, 8, 8, 0); + __visc__bindIn(spmv_node, 9, 9, 0); + __visc__bindIn(spmv_node, 10, 10, 0); + __visc__bindIn(spmv_node, 11, 11, 0); + __visc__bindIn(spmv_node, 12, 12, 0); + __visc__bindIn(spmv_node, 13, 13, 0); + __visc__bindIn(spmv_node, 14, 14, 0); + __visc__bindIn(spmv_node, 15, 15, 0); + __visc__bindIn(spmv_node, 16, 16, 0); } int main(int argc, char **argv) { @@ -261,7 +261,7 @@ int main(int argc, char **argv) { input_vec(parameters->inpFiles[1], h_x_vector, dim); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); pb_SwitchToTimer(&timers, 
pb_TimerID_COMPUTE); memset(h_Ax_vector, 0, dim * sizeof(float)); @@ -271,14 +271,14 @@ int main(int argc, char **argv) { compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); - llvm_hpvm_track_mem(h_Ax_vector, dim * sizeof(float)); - llvm_hpvm_track_mem(h_data, len * sizeof(float)); - llvm_hpvm_track_mem(h_indices, len * sizeof(int)); - llvm_hpvm_track_mem(h_perm, dim * sizeof(int)); - llvm_hpvm_track_mem(h_x_vector, dim * sizeof(float)); - llvm_hpvm_track_mem(h_ptr, depth * sizeof(int)); - llvm_hpvm_track_mem(h_nzcnt, nzcnt_len * sizeof(int)); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(h_Ax_vector, dim * sizeof(float)); + llvm_visc_track_mem(h_data, len * sizeof(float)); + llvm_visc_track_mem(h_indices, len * sizeof(int)); + llvm_visc_track_mem(h_perm, dim * sizeof(int)); + llvm_visc_track_mem(h_x_vector, dim * sizeof(float)); + llvm_visc_track_mem(h_ptr, depth * sizeof(int)); + llvm_visc_track_mem(h_nzcnt, nzcnt_len * sizeof(int)); // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -306,9 +306,9 @@ int main(int argc, char **argv) { block, (grid / block)}; *(RootIn *)root_in = root_in_local; - void *spmvDFG = __hpvm__launch(0, spmvLvl3, root_in); + void *spmvDFG = __visc__launch(0, spmvLvl3, root_in); - __hpvm__wait(spmvDFG); + __visc__wait(spmvDFG); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); /******************************* Issues ******************************* @@ -326,21 +326,21 @@ int main(int argc, char **argv) { // HtoD memory copy pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(h_Ax_vector, dim * sizeof(float)); + llvm_visc_request_mem(h_Ax_vector, dim * sizeof(float)); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(h_Ax_vector); - llvm_hpvm_untrack_mem(h_data); - llvm_hpvm_untrack_mem(h_indices); - llvm_hpvm_untrack_mem(h_perm); - 
llvm_hpvm_untrack_mem(h_x_vector); - llvm_hpvm_untrack_mem(h_ptr); - llvm_hpvm_untrack_mem(h_nzcnt); + llvm_visc_untrack_mem(h_Ax_vector); + llvm_visc_untrack_mem(h_data); + llvm_visc_untrack_mem(h_indices); + llvm_visc_untrack_mem(h_perm); + llvm_visc_untrack_mem(h_x_vector); + llvm_visc_untrack_mem(h_ptr); + llvm_visc_untrack_mem(h_nzcnt); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc new file mode 100644 index 0000000000000000000000000000000000000000..b804d14d16cff805c0c1850d1f5079ab6e973ecf GIT binary patch literal 2464 zcmZuxe^3+o75{D$vH>>9MuH}iFdG9jwK^^V5d<W^&`|M+=&6WPH~f-zL<497oM$&b zND$)@m9uf~I+L{YhCi+yYfoxtv=X8mMYu!2pXXQ*j?=dG@E6somVP(!tlo6zyKmq7 z?t7p2dGGCab^6{r3Jid1769=M0>B4Y0B_W;JgObw6mfBN&N0M=rN;22$(+L#w78I^ zI~K3&!qwtzo}?&FZAdZ3`4q~Fr8Fj0G^C&dMo~p~pj*>Fi$I)>5&-n@nKZlP**v@u zKhLW2H2XbSG}^54)EhleW|3%9t;a&3y|O;Ld?qLZeeyd&{*X4gFDM@j@+T(cqapdd zplmwGAGFIygL2TwL#=*K1K`=3nDAzv2f}929-9%h1w0KtPnHU`8a>upwyg=Z)w1DR z3;8{Kh&g0VzSS$62=Z?!WcLs`Xz_VkR48rnzz`b`wLzW?H>Z_{wrEf~c{-Rph!B4Z z$!8R@p`h&l#db$3@_TF~+EUAIq&yI(1q>-<H_R{=wgMEvo-j&+1As#W08*%{ml$J- zs#ebNC0#Bkrd&avctLTyH{R7%j2o381BhpM8uJGLYZLR@6l<l`ZQ4PVvslDcWa5^> zOta8VFIb*_QT1yq-6*ZbD+;$)v~64~t?;P_3D<3zmW%-jpElE;eLgfd;LPeqZQD@T zyZ0=fH+BB+HVW#1u!TbR1yHL7Wt0)_eH?9}nN{Z#GzCy*MK-Mgw4Op+aMTJ-5^V^e zjRe{P6==SVF!T1?nYZ80L4-G;<_ISHYpA)mW9Kvsf!EyTz6a%tKoN{jJ(MF{2;Kn6 zzai@=VZD!G+hAgx842A8bU&=+4GG=$!>9oIKN&%T-zlV556}h;1Iw6XRSp0=I7UDX z9E;2rfL>oJ!ki`mXvKOKC!zuU2~NZffYTa)!aa4eAD4=Xecsr#b^RRuVkH=_0r!R@ z-p#Mhj^5QTS(5iTeiw9_c@mYYRON=NA+n(2P<blgTGn(89*~e69*~0y+4Q9Bpgs8! 
zl00a(n+5-v6-;V`KRZPknc>G6Ru9=_-64lUKBE!1CGEdb>C;`Ng{@^|mos-HjO%_X z-W7T*crYQj4}{-pgkMhxo&fF?Etn*Pdd}Bn_^@vEXY9OF`W!l>Iu}xX8d^*v%iDA+ zS0AoF)LE-?<LzB!eB7&#UvlM!e)Ui|`vq%t1b@ZH?7Z=?m@-rf?~}r*+4y@t!9O*^ z3#4GyB;3RyB$MaZd7aK&FFW^aXmzJE@2o!OR5-6V7B-RT$s*XD3?41JI4P?UBl&Im zY**N;ga4Bzw9#LW%>dy8T=0YzWMI1FzL5Njpsa?BiMTnFhS>Q7`g|`tf2hg3&w{?k zLz|5riv?}5u&qrVI7^6KPelKL3yGJF?<ZY+l5{u$T<*XQA;f5S^Uu}BEb?XJ5xBqN z1F-CZtIANpxl(YwM_W99m)HA9sjpx~%?qNO@yAL%3z#7u1`m7!XWcR!;>W<-&`z(7 z6(dJQbg2r7Q^Fkp3UN~*QY|rkj<gb{B9@ZN0bY9pVxM%=I%k=?m-l1pkwY;RyIB(= zc5k!L6O2)Xo5=oz0V+W4Agl7s7%Q&d5<Ye^w9IMZOvP4CY$TWYHI|U?A!ij&B?<FO zrT$P;p9~j!%iMMecd2_f6=M!8Q9126cWK*R%4KgS7H`%{x_Ku;3EZWuDihnBrxkdA z<o5pHwpJ55)#vL_AP3q{vICXdXfAP<8ffZMam3z-u1yk^#Dh!r@WugFBW+VajN2O0 zZI9~{aB84&Pdnnaw}V6u3G=eH<I_}j1q@!w5o5}@=Ql*XyG{h4n~EiKKsSYj0l-+P ztR&8JKi>Pzf0#_mVWxOUh1Qc2k%UdIr>JbEr(N*w>E+xo80Tk&@Y&(W)W;Z(b!=WM zih-MOYV#R|1*63^okaMlK~VYExs}+#x?<;ird4fRSG7G>y`yBB%-UPRy*svMRPv>Q zEcxZ*o}BanOa9UD_#fvpw^BrFNv5+kalxf$XYBmrJIjS^gNRmy-~IMQ-PXf~^feDw zI8u;)3sx|SU967lXK48ULXaJ)8YD|JT3{U8De9XxoN$1R9BRNK9kgTD6LP#aRT?oO zQSaDamo18Lc>18=NJnPw-W<Vq=J1`QG6&;4bfmnK^(v|LM5K_$zi;V^0$x3z%E!*! 
z7(XrEG+jQRA+91~Lq?(h3=t{ecA7Kd|A!=p`W2IFBhhv#hc!FW_8G-fW9RlH9^Uuj z<VZ1W)RyXJhlwJ(I@MY7Z${Db+e*GV8qHokRr4Vct#^@yXf2Q%8d<TW&&ITwE8pj= zn>4Jsf_<_XgsE?OeiP+(^SoAkeW%D_PCXt)`%_A^>a$mI*5fB>=@_q)jKvFq8~eqc zjHt3`udtZj;CdpQ*BiVm^Is2Q&FII7Lkv@_+MBTcyRojK#qUg`U(h!<Oco6P`J+NH zm7sVz`f!&n<E1e}dKqBO4}TcVXxsCX7!tG987@tX<?X42`oHROWA2U}s9P@_#S-zN zHKot_^&76eOO7{;rEP)ot52M;SS7!t++=^2`Q=B>^cO-=`oiBwy7zRei>i%_Fu#62 zDN(m$ASHX4(qEb$J0L*P0pP>$vfQ0!?7`mg?Ur_dTWXW`boE(Q?PY?Cd{$lSggdvj XpiXK~&^_31@sx7zHXZ?kUS0nM9J*WM literal 0 HcmV?d00001 diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll new file mode 100644 index 0000000000..5604d70e8a --- /dev/null +++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll @@ -0,0 +1,138 @@ +; ModuleID = 'build/visc_default/main.visc.ll.kernels.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +%rtype = type {} + +; Function Attrs: optsize zeroext +define void @spmv_jds(float* %dst_vector, i64 %bytes_dst_vector, float* %d_data, i64 %bytes_d_data, i32* %d_index, i64 %bytes_d_index, i32* %d_perm, i64 %bytes_d_perm, float* %x_vec, i64 %bytes_x_vec, i32 %dim, i32* %jds_ptr_int, i64 %bytes_jds_ptr_int, i32* %sh_zcnt_int, i64 %bytes_sh_zcnt_int) #0 { +entry: + ;%0 = call i64 @_Z12get_group_idj(i32 0) + ;%1 = trunc i64 %0 to i32 + ;%2 = call i64 @_Z14get_local_sizej(i32 0) + ;%3 = trunc i64 %2 to i32 + ;%4 = mul i32 %1, %3 + ;%5 = call i64 @_Z12get_local_idj(i32 0) + ;%6 = trunc i64 %5 to i32 + ;%7 = add i32 %4, %6 + %0 = add i32 0, 0 + %1 = add i32 0, 0 + %2 = add i32 0, 0 + %3 = add i32 0, 0 + %4 = add i32 0, 0 + %5 = add i32 0, 0 + %6 = call i64 @_Z13get_global_idj(i32 0) + %7 = trunc i64 %6 to i32 + %cmp = icmp slt i32 
%7, %dim + br i1 %cmp, label %if.then, label %if.end38 + +if.then: ; preds = %entry + %shr = ashr i32 %7, 5 + %idxprom = sext i32 %shr to i64 + %arrayidx = getelementptr inbounds i32* %sh_zcnt_int, i64 %idxprom + %8 = load i32* %arrayidx, align 4, !tbaa !4 + %9 = load i32* %jds_ptr_int, align 4, !tbaa !4 + %add = add nsw i32 %9, %7 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds float* %d_data, i64 %idxprom3 + %10 = load float* %arrayidx4, align 4, !tbaa !8 + %arrayidx6 = getelementptr inbounds i32* %d_index, i64 %idxprom3 + %11 = load i32* %arrayidx6, align 4, !tbaa !4 + %idxprom7 = sext i32 %11 to i64 + %arrayidx8 = getelementptr inbounds float* %x_vec, i64 %idxprom7 + %12 = load float* %arrayidx8, align 4, !tbaa !8 + %cmp9 = icmp sgt i32 %8, 1 + br i1 %cmp9, label %if.then10, label %if.end + +if.then10: ; preds = %if.then + %arrayidx11 = getelementptr inbounds i32* %jds_ptr_int, i64 1 + %.pn77 = load i32* %arrayidx11, align 4 + %idxprom13.pn.in78 = add nsw i32 %.pn77, %7 + %idxprom13.pn79 = sext i32 %idxprom13.pn.in78 to i64 + %i.0.in80 = getelementptr inbounds i32* %d_index, i64 %idxprom13.pn79 + %i.081 = load i32* %i.0.in80, align 4 + %cmp1582 = icmp sgt i32 %8, 2 + %arrayidx1783 = getelementptr inbounds float* %d_data, i64 %idxprom13.pn79 + %13 = load float* %arrayidx1783, align 4, !tbaa !8 + br i1 %cmp1582, label %for.body, label %for.end + +for.body: ; preds = %for.body, %if.then10 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 2, %if.then10 ] + %14 = phi float [ %16, %for.body ], [ %13, %if.then10 ] + %i.088 = phi i32 [ %i.0, %for.body ], [ %i.081, %if.then10 ] + %sum.086 = phi float [ %add25, %for.body ], [ 0.000000e+00, %if.then10 ] + %t.085 = phi float [ %15, %for.body ], [ %12, %if.then10 ] + %d.084 = phi float [ %14, %for.body ], [ %10, %if.then10 ] + %arrayidx19 = getelementptr inbounds i32* %jds_ptr_int, i64 %indvars.iv + %idxprom23 = sext i32 %i.088 to i64 + %arrayidx24 = getelementptr inbounds float* %x_vec, 
i64 %idxprom23 + %15 = load float* %arrayidx24, align 4, !tbaa !8 + %mul = fmul fast float %d.084, %t.085 + %add25 = fadd fast float %sum.086, %mul + %indvars.iv.next = add i64 %indvars.iv, 1 + %.pn = load i32* %arrayidx19, align 4 + %idxprom13.pn.in = add nsw i32 %.pn, %7 + %idxprom13.pn = sext i32 %idxprom13.pn.in to i64 + %i.0.in = getelementptr inbounds i32* %d_index, i64 %idxprom13.pn + %i.0 = load i32* %i.0.in, align 4 + %arrayidx17 = getelementptr inbounds float* %d_data, i64 %idxprom13.pn + %16 = load float* %arrayidx17, align 4, !tbaa !8 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %8 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %if.then10 + %.lcssa = phi float [ %13, %if.then10 ], [ %16, %for.body ] + %i.0.lcssa = phi i32 [ %i.081, %if.then10 ], [ %i.0, %for.body ] + %sum.0.lcssa = phi float [ 0.000000e+00, %if.then10 ], [ %add25, %for.body ] + %t.0.lcssa = phi float [ %12, %if.then10 ], [ %15, %for.body ] + %d.0.lcssa = phi float [ %10, %if.then10 ], [ %14, %for.body ] + %idxprom28 = sext i32 %i.0.lcssa to i64 + %arrayidx29 = getelementptr inbounds float* %x_vec, i64 %idxprom28 + %17 = load float* %arrayidx29, align 4, !tbaa !8 + %mul30 = fmul fast float %d.0.lcssa, %t.0.lcssa + %add31 = fadd fast float %sum.0.lcssa, %mul30 + br label %if.end + +if.end: ; preds = %for.end, %if.then + %d.1 = phi float [ %.lcssa, %for.end ], [ %10, %if.then ] + %t.1 = phi float [ %17, %for.end ], [ %12, %if.then ] + %sum.1 = phi float [ %add31, %for.end ], [ 0.000000e+00, %if.then ] + %mul32 = fmul fast float %d.1, %t.1 + %add33 = fadd fast float %sum.1, %mul32 + %idxprom34 = sext i32 %7 to i64 + %arrayidx35 = getelementptr inbounds i32* %d_perm, i64 %idxprom34 + %18 = load i32* %arrayidx35, align 4, !tbaa !4 + %idxprom36 = sext i32 %18 to i64 + %arrayidx37 = getelementptr inbounds float* %dst_vector, i64 %idxprom36 + store float %add33, float* %arrayidx37, align 4, !tbaa !8 + br label 
%if.end38 + +if.end38: ; preds = %if.end, %entry + ret void +} + +declare i64 @_Z13get_global_idj(i32) + +declare i64 @_Z12get_group_idj(i32) + +declare i64 @_Z14get_local_sizej(i32) + +declare i64 @_Z12get_local_idj(i32) + +attributes #0 = { optsize zeroext "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } + +!visc_hint_gpu = !{} +!visc_hint_cpu = !{!0, !1} +!opencl.kernels = !{!2} + +!0 = metadata !{%rtype (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64, i32)* undef} +!1 = metadata !{%rtype (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64, i32, i32)* undef} +!2 = metadata !{void (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64)* @spmv_jds, metadata !3} +!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"i64", metadata !"float*", metadata !"i64", metadata !"i32*", metadata !"i64", metadata !"i32*", metadata !"i64", metadata !"float*", metadata !"i64", metadata !"i32", metadata !"i32*", metadata !"i64", metadata !"i32*", metadata !"i64"} +!4 = metadata !{metadata !5, metadata !5, i64 0} +!5 = metadata !{metadata !"int", metadata !6} +!6 = metadata !{metadata !"omnipotent char", metadata !7} +!7 = metadata !{metadata !"Simple C/C++ TBAA"} +!8 = metadata !{metadata !9, metadata !9, i64 0} +!9 = metadata !{metadata !"float", metadata !6} diff --git a/hpvm/test/parboil/benchmarks/stencil/Makefile b/hpvm/test/parboil/benchmarks/stencil/Makefile index e761d7b4f5..a44dd0dbf0 100644 --- a/hpvm/test/parboil/benchmarks/stencil/Makefile +++ b/hpvm/test/parboil/benchmarks/stencil/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = stencil -# Default compile hpvm +# Default compile visc ifeq ($(VERSION),) - VERSION = hpvm + VERSION = visc endif # 
Default use small test case diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c index 1157b61988..ec47c22227 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) { CHECK_ERROR("clSetKernelArg") // main execution - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); int t; for (t = 0; t < iteration; t++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c index 70a86245b7..61382182d1 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -140,7 +140,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -170,7 +170,7 @@ int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 
1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -182,7 +182,7 @@ int main(int argc, char **argv) { // main execution /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int i = 0; i < 10; i++) { int t; for (t = 0; t < iteration; t++) { @@ -219,7 +219,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c index 3a5dfa3b3a..217352e036 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -140,7 +140,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -170,7 +170,7 @@ int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = 
clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -182,7 +182,7 @@ int main(int argc, char **argv) { // main execution /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int i = 0; i < 1; i++) { int t; for (t = 0; t < iteration; t++) { @@ -219,7 +219,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c index 264cec20a9..28c0e5fd7b 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -140,7 +140,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -170,7 +170,7 @@ int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - 
pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -195,7 +195,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -213,7 +213,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c index 7b5db72237..f767f6a9d2 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_platform_id clPlatform; clStatus = clGetPlatformIDs(1, &clPlatform, NULL); @@ -140,7 +140,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -170,7 +170,7 @@ 
int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -195,7 +195,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -213,7 +213,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c index 51c263f0ef..10626bed59 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -145,7 +145,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // memory allocation d_A0 = 
clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -175,7 +175,7 @@ int main(int argc, char **argv) { // printf("block x is %d and y is %d z \n",block[0],block[1]); // printf("grid x is %d and y is %d\n",grid[0],grid[1]); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -200,7 +200,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -218,7 +218,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c index a2a98e9233..1d03111f20 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); cl_int clStatus; cl_uint numPlatforms; @@ -184,7 +184,7 @@ int main(int argc, char **argv) { // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int t; - 
pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); /*for(int i=0; i<1; i++) {*/ for (t = 0; t < iteration; t++) { clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid, @@ -216,7 +216,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll index a288b7649a..9ea545c184 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl' +; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c index 9fc78af4b9..cf86734a86 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, 
visc_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -145,7 +145,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -177,7 +177,7 @@ int main(int argc, char **argv) { printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -190,7 +190,7 @@ int main(int argc, char **argv) { // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int t; - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int i = 0; i < 2; i++) { for (t = 0; t < iteration; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -204,7 +204,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - /*pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);*/ + /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/ clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ @@ -226,7 +226,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git 
a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll index a288b7649a..9ea545c184 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll @@ -1,4 +1,4 @@ -; ModuleID = '/home/psrivas2.hpvm.llvm/test/HPVM/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl' +; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c index a1e1c4e74e..3b009e370e 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); cl_int clStatus; cl_uint numPlatforms; @@ -145,7 +145,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); memcpy(h_Anext, h_A0, sizeof(float) * size); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); // memory allocation d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float), @@ -177,7 +177,7 @@ int main(int argc, char **argv) { printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]); - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, 
visc_TimerID_SETUP); clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0); clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1); clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); @@ -190,7 +190,7 @@ int main(int argc, char **argv) { // main execution pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int t; - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); for (int i = 0; i < 1; i++) { for (t = 0; t < iteration; t++) { /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/ @@ -204,7 +204,7 @@ int main(int argc, char **argv) { d_A0 = d_Anext; d_Anext = d_temp; - /*pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);*/ + /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/ clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0); clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext); /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/ @@ -226,7 +226,7 @@ int main(int argc, char **argv) { size * sizeof(float), h_Anext, 0, NULL, NULL); CHECK_ERROR("clEnqueueReadBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); clStatus = clReleaseMemObject(d_A0); clStatus = clReleaseMemObject(d_Anext); clStatus = clReleaseKernel(clKernel); diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile b/hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile similarity index 80% rename from hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile rename to hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile index 35b36dcf3c..cf61fb3a6c 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. 
-LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=file.ll -HPVM_OBJS=stencil.hpvm.ll +VISC_OBJS=stencil.visc.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm/common.h b/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h similarity index 100% rename from hpvm/test/parboil/benchmarks/stencil/src/hpvm/common.h rename to hpvm/test/parboil/benchmarks/stencil/src/visc/common.h diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.cc rename to hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.h b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h similarity index 100% rename from hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.h rename to hpvm/test/parboil/benchmarks/stencil/src/visc/file.h diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm/kernel.cl b/hpvm/test/parboil/benchmarks/stencil/src/visc/kernel.cl similarity index 100% rename from hpvm/test/parboil/benchmarks/stencil/src/hpvm/kernel.cl rename to hpvm/test/parboil/benchmarks/stencil/src/visc/kernel.cl diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp similarity index 66% rename from hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp rename to hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp index e5810fc810..5672a3ee49 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp @@ -9,11 +9,11 @@ #include "common.h" #include "file.h" -#include <hpvm.h> #include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <visc.h> static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { int s = 0; @@ 
-42,23 +42,23 @@ typedef struct __attribute__((__packed__)) { void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, size_t bytes_Anext, int nx, int ny, int nz) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(2, A0, Anext, 1, Anext); + __visc__hint(visc::DEVICE); + __visc__attributes(2, A0, Anext, 1, Anext); - void *thisNode = __hpvm__getNode(); - void *parentNode = __hpvm__getParentNode(thisNode); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); - int lx = __hpvm__getNodeInstanceID_x(thisNode); - int ly = __hpvm__getNodeInstanceID_y(thisNode); - int lz = __hpvm__getNodeInstanceID_z(thisNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int ly = __visc__getNodeInstanceID_y(thisNode); + int lz = __visc__getNodeInstanceID_z(thisNode); - int gx = __hpvm__getNodeInstanceID_x(parentNode); - int gy = __hpvm__getNodeInstanceID_y(parentNode); - int gz = __hpvm__getNodeInstanceID_z(parentNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int gy = __visc__getNodeInstanceID_y(parentNode); + int gz = __visc__getNodeInstanceID_z(parentNode); - int gridx = __hpvm__getNumNodeInstances_x(thisNode); - int gridy = __hpvm__getNumNodeInstances_y(thisNode); - int gridz = __hpvm__getNumNodeInstances_z(thisNode); + int gridx = __visc__getNumNodeInstances_x(thisNode); + int gridy = __visc__getNumNodeInstances_y(thisNode); + int gridz = __visc__getNumNodeInstances_z(thisNode); int i = gx * gridx + lx + 1; int j = gy * gridy + ly + 1; @@ -78,65 +78,65 @@ void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, void stencilLvl1(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1, size_t dim_Y1, size_t dim_Z1) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(2, A0, Anext, 1, Anext); + __visc__hint(visc::DEVICE); + __visc__attributes(2, A0, Anext, 1, Anext); void *stencil_node = - __hpvm__createNodeND(3, 
naive_kernel, dim_X1, dim_Y1, dim_Z1); - __hpvm__bindIn(stencil_node, 0, 0, 0); - __hpvm__bindIn(stencil_node, 1, 1, 0); - __hpvm__bindIn(stencil_node, 2, 2, 0); - __hpvm__bindIn(stencil_node, 3, 3, 0); - __hpvm__bindIn(stencil_node, 4, 4, 0); - __hpvm__bindIn(stencil_node, 5, 5, 0); - __hpvm__bindIn(stencil_node, 6, 6, 0); - __hpvm__bindIn(stencil_node, 7, 7, 0); - __hpvm__bindIn(stencil_node, 8, 8, 0); + __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1); + __visc__bindIn(stencil_node, 0, 0, 0); + __visc__bindIn(stencil_node, 1, 1, 0); + __visc__bindIn(stencil_node, 2, 2, 0); + __visc__bindIn(stencil_node, 3, 3, 0); + __visc__bindIn(stencil_node, 4, 4, 0); + __visc__bindIn(stencil_node, 5, 5, 0); + __visc__bindIn(stencil_node, 6, 6, 0); + __visc__bindIn(stencil_node, 7, 7, 0); + __visc__bindIn(stencil_node, 8, 8, 0); } void stencilLvl2(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1, size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2, size_t dim_Z2) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, A0, Anext, 1, Anext); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, A0, Anext, 1, Anext); void *stencil_node = - __hpvm__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2); - __hpvm__bindIn(stencil_node, 0, 0, 0); - __hpvm__bindIn(stencil_node, 1, 1, 0); - __hpvm__bindIn(stencil_node, 2, 2, 0); - __hpvm__bindIn(stencil_node, 3, 3, 0); - __hpvm__bindIn(stencil_node, 4, 4, 0); - __hpvm__bindIn(stencil_node, 5, 5, 0); - __hpvm__bindIn(stencil_node, 6, 6, 0); - __hpvm__bindIn(stencil_node, 7, 7, 0); - __hpvm__bindIn(stencil_node, 8, 8, 0); - __hpvm__bindIn(stencil_node, 9, 9, 0); - __hpvm__bindIn(stencil_node, 10, 10, 0); - __hpvm__bindIn(stencil_node, 11, 11, 0); + __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2); + __visc__bindIn(stencil_node, 0, 0, 0); + __visc__bindIn(stencil_node, 1, 1, 0); + __visc__bindIn(stencil_node, 2, 2, 0); + 
__visc__bindIn(stencil_node, 3, 3, 0); + __visc__bindIn(stencil_node, 4, 4, 0); + __visc__bindIn(stencil_node, 5, 5, 0); + __visc__bindIn(stencil_node, 6, 6, 0); + __visc__bindIn(stencil_node, 7, 7, 0); + __visc__bindIn(stencil_node, 8, 8, 0); + __visc__bindIn(stencil_node, 9, 9, 0); + __visc__bindIn(stencil_node, 10, 10, 0); + __visc__bindIn(stencil_node, 11, 11, 0); } void stencilLvl3(float c0, float c1, float *A0, size_t bytes_A0, float *Anext, size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1, size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2, size_t dim_Z2) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, A0, Anext, 1, Anext); - void *stencil_node = __hpvm__createNodeND(0, stencilLvl2); - __hpvm__bindIn(stencil_node, 0, 0, 0); - __hpvm__bindIn(stencil_node, 1, 1, 0); - __hpvm__bindIn(stencil_node, 2, 2, 0); - __hpvm__bindIn(stencil_node, 3, 3, 0); - __hpvm__bindIn(stencil_node, 4, 4, 0); - __hpvm__bindIn(stencil_node, 5, 5, 0); - __hpvm__bindIn(stencil_node, 6, 6, 0); - __hpvm__bindIn(stencil_node, 7, 7, 0); - __hpvm__bindIn(stencil_node, 8, 8, 0); - __hpvm__bindIn(stencil_node, 9, 9, 0); - __hpvm__bindIn(stencil_node, 10, 10, 0); - __hpvm__bindIn(stencil_node, 11, 11, 0); - __hpvm__bindIn(stencil_node, 12, 12, 0); - __hpvm__bindIn(stencil_node, 13, 13, 0); - __hpvm__bindIn(stencil_node, 14, 14, 0); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, A0, Anext, 1, Anext); + void *stencil_node = __visc__createNodeND(0, stencilLvl2); + __visc__bindIn(stencil_node, 0, 0, 0); + __visc__bindIn(stencil_node, 1, 1, 0); + __visc__bindIn(stencil_node, 2, 2, 0); + __visc__bindIn(stencil_node, 3, 3, 0); + __visc__bindIn(stencil_node, 4, 4, 0); + __visc__bindIn(stencil_node, 5, 5, 0); + __visc__bindIn(stencil_node, 6, 6, 0); + __visc__bindIn(stencil_node, 7, 7, 0); + __visc__bindIn(stencil_node, 8, 8, 0); + __visc__bindIn(stencil_node, 9, 9, 0); + __visc__bindIn(stencil_node, 10, 10, 0); + __visc__bindIn(stencil_node, 11, 11, 0); + 
__visc__bindIn(stencil_node, 12, 12, 0); + __visc__bindIn(stencil_node, 13, 13, 0); + __visc__bindIn(stencil_node, 14, 14, 0); } int main(int argc, char **argv) { @@ -195,11 +195,11 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); - llvm_hpvm_track_mem(h_A0, sizeof(float) * size); - llvm_hpvm_track_mem(h_Anext, sizeof(float) * size); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(h_A0, sizeof(float) * size); + llvm_visc_track_mem(h_Anext, sizeof(float) * size); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -241,9 +241,9 @@ int main(int argc, char **argv) { grid[1] / block[1], grid[2] / block[2]}; *(RootIn *)root_in = root_in_local; - void *stencilDFG = __hpvm__launch(0, stencilLvl3, root_in); + void *stencilDFG = __visc__launch(0, stencilLvl3, root_in); - __hpvm__wait(stencilDFG); + __visc__wait(stencilDFG); // printf("iteration %d\n",t); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); float *h_temp = h_A0; @@ -255,19 +255,19 @@ int main(int argc, char **argv) { h_A0 = h_Anext; h_Anext = h_temp; pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(h_Anext, bytes); + llvm_visc_request_mem(h_Anext, bytes); printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]); printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(h_A0); - llvm_hpvm_untrack_mem(h_Anext); + llvm_visc_untrack_mem(h_A0); + llvm_visc_untrack_mem(h_Anext); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/common.h b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h similarity index 
100% rename from hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/common.h rename to hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c similarity index 90% rename from hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c rename to hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c index 35c5ed960c..bb6e45c932 100644 --- a/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c @@ -9,11 +9,11 @@ #include "common.h" #include "file.h" -#include <hpvm.h> #include <parboil.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <visc.h> static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { int s = 0; @@ -31,7 +31,7 @@ static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { void naive_kernel(float c0, float c1, float *A0, float *Anext, int nx, int ny, int nz) { - __hpvm__attributes(2, A0, Anext, 1, Anext); + __visc__attributes(2, A0, Anext, 1, Anext); int i = get_global_id(0) + 1; int j = get_global_id(1) + 1; int k = get_global_id(2) + 1; @@ -106,11 +106,11 @@ int main(int argc, char **argv) { fclose(fp); pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK); - llvm_hpvm_track_mem(h_A0, sizeof(float) * size); - llvm_hpvm_track_mem(h_Anext, sizeof(float) * size); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK); + llvm_visc_track_mem(h_A0, sizeof(float) * size); + llvm_visc_track_mem(h_Anext, sizeof(float) * size); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); @@ -133,11 +133,11 @@ int main(int argc, char **argv) { printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125, 1, 1)]); for (t = 0; t < iteration; t++) { pb_SwitchToTimer(&timers, pb_TimerID_NONE); - unsigned stencilDFG = __hpvm__node( + unsigned stencilDFG = __visc__node( 
naive_kernel, 2, 3, block[0], block[1], block[2], grid[0] / block[0], grid[1] / block[1], grid[2] / block[2], 9, (float)c0, (float)c1, h_A0, bytes, h_Anext, bytes, nx, ny, nz, 0); - __hpvm__wait(stencilDFG); + __visc__wait(stencilDFG); // printf("iteration %d\n",t); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); float *h_temp = h_A0; @@ -149,19 +149,19 @@ int main(int argc, char **argv) { h_A0 = h_Anext; h_Anext = h_temp; pb_SwitchToTimer(&timers, pb_TimerID_COPY); - llvm_hpvm_request_mem(h_Anext, bytes); + llvm_visc_request_mem(h_Anext, bytes); printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]); printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]); - pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK); + pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK); - llvm_hpvm_untrack_mem(h_A0); - llvm_hpvm_untrack_mem(h_Anext); + llvm_visc_untrack_mem(h_A0); + llvm_visc_untrack_mem(h_Anext); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); if (parameters->outFile) { /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/ diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll new file mode 100644 index 0000000000..7dc32f3760 --- /dev/null +++ b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll @@ -0,0 +1,673 @@ +; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s +; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc +; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s +; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll parboil.ll -S -o %t.linked.ll +; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin +; ModuleID = 'build/visc_vec_default/stencil.ll' +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%rtype = type {} +%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] } +%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } +%struct.pb_Timer = type { i32, i64, i64 } +%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* } +%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } +%struct.pb_Parameters = type { i8*, i8** } +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.arg = type <{ float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, %rtype }> + +@.str3 = private unnamed_addr constant [3 x i8] c"rb\00", align 1 +@.str4 = private unnamed_addr constant [37 x i8] c"grid(%d, %d, %d), block(%d, %d, %d)\0A\00", align 1 +@.str5 = private unnamed_addr constant [17 x i8] c"A[126,1,1] = %f\0A\00", align 1 +@.str6 = private unnamed_addr constant [17 x i8] c"A[125,1,1] = %f\0A\00", align 1 +@str = private unnamed_addr constant [46 x i8] c"OpenCL accelerated 7 points stencil codes****\00" +@str7 = private unnamed_addr constant [45 x i8] c"Author: Li-Wen Chang <lchang20@illinois.edu>\00" +@str8 = private unnamed_addr constant [106 x i8] c"Usage: probe nx ny nz t\0Anx: the grid size x\0Any: the grid size y\0Anz: the grid size z\0At: the iteration time\00" +@viscTimerSet_GenVISC = common global i8* null +@0 = internal constant [14 x i8] c"GenVISC_Timer\00" + +; Function Attrs: nounwind uwtable +define %rtype @naive_kernel(float %c0, float %c1, 
float* in %A0, i64 %bytes_A0, float* in out %Anext, i64 %bytes_Anext, i32 %nx, i32 %ny, i32 %nz) #0 { +entry: + %naive_kernel.node = call i8* @llvm.visc.getNode() + %naive_kernel.parentNode = call i8* @llvm.visc.getParentNode(i8* %naive_kernel.node) + %a0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %naive_kernel.parentNode) + %a1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %naive_kernel.node) + %a2 = mul i32 %a0, %a1 + %a3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %naive_kernel.node) + %a4 = add i32 %a2, %a3 + ;%add = add nsw i32 %4, 1 + %a5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %naive_kernel.parentNode) + %a6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %naive_kernel.node) + %a7 = mul i32 %a5, %a6 + %a8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %naive_kernel.node) + %a9 = add i32 %a7, %a8 + ;%add3 = add nsw i32 %9, 1 + %a10 = call i32 @llvm.visc.getNodeInstanceID.z(i8* %naive_kernel.parentNode) + %a11 = call i32 @llvm.visc.getNumNodeInstances.z(i8* %naive_kernel.node) + %a12 = mul i32 %a10, %a11 + %a13 = call i32 @llvm.visc.getNodeInstanceID.z(i8* %naive_kernel.node) + %a14 = add i32 %a12, %a13 + ;%sub = add nsw i32 %nx, -1 + ;%cmp = icmp slt i32 %add, %sub + ;br i1 %cmp, label %if.then, label %if.end + + + ;%call = tail call i32 @get_global_id(i32 0) #2 + ;%mul = shl i32 %call, 2 + %mul = shl i32 %a4, 2 + %add258 = or i32 %mul, 1 + ;%call1 = tail call i32 @get_global_id(i32 1) #2 + ;%add2 = add i32 %call1, 1 + %add2 = add i32 %a9, 1 + ;%call3 = tail call i32 @get_global_id(i32 2) #2 + ;%add4 = add i32 %call3, 1 + %add4 = add i32 %a14, 1 + %sub = add i32 %add258, 3 + %sub6 = add i32 %nx, -1 + %cmp = icmp slt i32 %sub, %sub6 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %mul7 = mul nsw i32 %add4, %ny + %add8 = add nsw i32 %mul7, %add2 + %mul9 = mul nsw i32 %add8, %nx + %add11 = add i32 %sub, %mul9 + %add.ptr = getelementptr inbounds float* %A0, i32 %add11 + ;%call12 = tail call <4 x float> 
@_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr) #2 + %vadd.ptr = bitcast float* %add.ptr to <4 x float>* + %call12 = load <4 x float>* %vadd.ptr + + %add13 = add i32 %a14, 2 + %mul14 = mul nsw i32 %add13, %ny + %add15 = add nsw i32 %mul14, %add2 + %mul16 = mul nsw i32 %add15, %nx + %add18 = add i32 %sub, %mul16 + %add.ptr19 = getelementptr inbounds float* %A0, i32 %add18 + ;%call20 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr19) #2 + %vadd.ptr19 = bitcast float* %add.ptr19 to <4 x float>* + %call20 = load <4 x float>* %vadd.ptr19 + + %mul22 = mul nsw i32 %a14, %ny + %add23 = add nsw i32 %mul22, %add2 + %mul24 = mul nsw i32 %add23, %nx + %add26 = add i32 %sub, %mul24 + %add.ptr27 = getelementptr inbounds float* %A0, i32 %add26 + ;%call28 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr27) #2 + %vadd.ptr27 = bitcast float* %add.ptr27 to <4 x float>* + %call28 = load <4 x float>* %vadd.ptr27 + + %add29 = add i32 %a9, 2 + %add31 = add nsw i32 %add29, %mul7 + %mul32 = mul nsw i32 %add31, %nx + %add34 = add i32 %sub, %mul32 + %add.ptr35 = getelementptr inbounds float* %A0, i32 %add34 + ;%call36 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr35) #2 + %vadd.ptr35 = bitcast float* %add.ptr35 to <4 x float>* + %call36 = load <4 x float>* %vadd.ptr35 + + %add39 = add nsw i32 %mul7, %a9 + %mul40 = mul nsw i32 %add39, %nx + %add42 = add i32 %sub, %mul40 + %add.ptr43 = getelementptr inbounds float* %A0, i32 %add42 + ;%call44 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr43) #2 + %vadd.ptr43 = bitcast float* %add.ptr43 to <4 x float>* + %call44 = load <4 x float>* %vadd.ptr43 + + %add49 = add i32 %add258, 4 + %add50 = add i32 %add49, %mul9 + %arrayidx = getelementptr inbounds float* %A0, i32 %add50 + %0 = load float* %arrayidx, align 4, !tbaa !2 + %add55261 = or i32 %mul, 3 + %add56 = add i32 %add55261, %mul9 + %arrayidx57 = getelementptr inbounds float* %A0, i32 %add56 + %1 = load float* %arrayidx57, 
align 4, !tbaa !2 + %2 = shufflevector <4 x float> %call12, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3> + %vext = shufflevector <3 x float> %2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + %vecinit58 = insertelement <4 x float> %vext, float %0, i32 3 + %vecinit60 = insertelement <4 x float> undef, float %1, i32 0 + %vecinit62 = shufflevector <4 x float> %vecinit60, <4 x float> %call12, <4 x i32> <i32 0, i32 4, i32 5, i32 6> + %splat.splatinsert = insertelement <4 x float> undef, float %c1, i32 0 + %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %add63 = fadd fast <4 x float> %call20, %call28 + %add64 = fadd fast <4 x float> %add63, %call36 + %add65 = fadd fast <4 x float> %add64, %call44 + %add66 = fadd fast <4 x float> %add65, %vecinit58 + %add67 = fadd fast <4 x float> %add66, %vecinit62 + %mul68 = fmul fast <4 x float> %splat.splat, %add67 + %splat.splatinsert69 = insertelement <4 x float> undef, float %c0, i32 0 + %splat.splat70 = shufflevector <4 x float> %splat.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer + %mul71 = fmul fast <4 x float> %splat.splat70, %call12 + %sub72 = fsub fast <4 x float> %mul68, %mul71 + %add.ptr78 = getelementptr inbounds float* %Anext, i32 %add11 + ;tail call void @_Z7vstore4Dv4_fjPU3AS1f(<4 x float> %sub72, i32 0, float* %add.ptr78) #2 + %vadd.ptr78 = bitcast float* %add.ptr78 to <4 x float>* + store <4 x float> %sub72, <4 x float>* %vadd.ptr78 + + br label %if.end146 + +if.else: ; preds = %entry + %cmp80 = icmp slt i32 %add258, %sub6 + br i1 %cmp80, label %for.body.lr.ph, label %if.end146 + +for.body.lr.ph: ; preds = %if.else + %add84 = add i32 %a14, 2 + %mul85 = mul nsw i32 %add84, %ny + %add86 = add nsw i32 %mul85, %add2 + %mul87 = mul nsw i32 %add86, %nx + %add88 = add i32 %mul87, 3 + %mul92 = mul nsw i32 %a14, %ny + %add93 = add nsw i32 %mul92, %add2 + %mul94 = mul nsw i32 %add93, %nx + %add95 = add i32 %mul94, 3 + %add99 = add 
i32 %a9, 2 + %mul100 = mul nsw i32 %add4, %ny + %add101 = add nsw i32 %add99, %mul100 + %mul102 = mul nsw i32 %add101, %nx + %add103 = add i32 %mul102, 3 + %add109 = add nsw i32 %mul100, %a9 + %mul110 = mul nsw i32 %add109, %nx + %add111 = add i32 %mul110, 3 + %add117 = add nsw i32 %mul100, %add2 + %mul118 = mul nsw i32 %add117, %nx + %add119 = add i32 %mul118, 3 + %add127 = add i32 %mul118, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %vid.0260 = phi i32 [ %add258, %for.body.lr.ph ], [ %add115, %for.body ] + %add89 = add i32 %add88, %vid.0260 + %arrayidx90 = getelementptr inbounds float* %A0, i32 %add89 + %3 = load float* %arrayidx90, align 4, !tbaa !2 + %add96 = add i32 %add95, %vid.0260 + %arrayidx97 = getelementptr inbounds float* %A0, i32 %add96 + %4 = load float* %arrayidx97, align 4, !tbaa !2 + %add98 = fadd fast float %3, %4 + %add104 = add i32 %add103, %vid.0260 + %arrayidx105 = getelementptr inbounds float* %A0, i32 %add104 + %5 = load float* %arrayidx105, align 4, !tbaa !2 + %add106 = fadd fast float %add98, %5 + %add112 = add i32 %add111, %vid.0260 + %arrayidx113 = getelementptr inbounds float* %A0, i32 %add112 + %6 = load float* %arrayidx113, align 4, !tbaa !2 + %add114 = fadd fast float %add106, %6 + %add115 = add nsw i32 %vid.0260, 1 + %add120 = add i32 %add119, %add115 + %arrayidx121 = getelementptr inbounds float* %A0, i32 %add120 + %7 = load float* %arrayidx121, align 4, !tbaa !2 + %add122 = fadd fast float %add114, %7 + %add128 = add i32 %add127, %vid.0260 + %arrayidx129 = getelementptr inbounds float* %A0, i32 %add128 + %8 = load float* %arrayidx129, align 4, !tbaa !2 + %add130 = fadd fast float %add122, %8 + %mul131 = fmul fast float %add130, %c1 + %add136 = add i32 %add119, %vid.0260 + %arrayidx137 = getelementptr inbounds float* %A0, i32 %add136 + %9 = load float* %arrayidx137, align 4, !tbaa !2 + %mul138 = fmul fast float %9, %c0 + %sub139 = fsub fast float %mul131, %mul138 + %arrayidx145 = getelementptr 
inbounds float* %Anext, i32 %add136 + store float %sub139, float* %arrayidx145, align 4, !tbaa !2 + %exitcond = icmp eq i32 %add115, %sub6 + br i1 %exitcond, label %if.end146, label %for.body + +if.end146: ; preds = %for.body, %if.else, %if.then + ;ret void + + + + + +;if.then: ; preds = %entry + ;%add5 = add nsw i32 %14, 1 + ;%add6 = add nsw i32 %14, 2 + ;%mul = mul nsw i32 %add6, %ny + ;%add7 = add nsw i32 %mul, %add3 + ;%mul8 = mul nsw i32 %add7, %nx + ;%add9 = add i32 %4, 4 + ;%add10 = add i32 %add9, %mul8 + ;%idxprom = sext i32 %add10 to i64 + ;%arrayidx = getelementptr inbounds float* %A0, i64 %idxprom + ;%15 = load float* %arrayidx, align 4, !tbaa !2 + ;%mul12 = mul nsw i32 %14, %ny + ;%add13 = add nsw i32 %mul12, %add3 + ;%mul14 = mul nsw i32 %add13, %nx + ;%add16 = add i32 %add9, %mul14 + ;%idxprom17 = sext i32 %add16 to i64 + ;%arrayidx18 = getelementptr inbounds float* %A0, i64 %idxprom17 + ;%16 = load float* %arrayidx18, align 4, !tbaa !2 + ;%add19 = fadd fast float %15, %16 + ;%add20 = add nsw i32 %9, 2 + ;%mul21 = mul nsw i32 %add5, %ny + ;%add22 = add nsw i32 %add20, %mul21 + ;%mul23 = mul nsw i32 %add22, %nx + ;%add25 = add i32 %add9, %mul23 + ;%idxprom26 = sext i32 %add25 to i64 + ;%arrayidx27 = getelementptr inbounds float* %A0, i64 %idxprom26 + ;%17 = load float* %arrayidx27, align 4, !tbaa !2 + ;%add28 = fadd fast float %add19, %17 + ;%add31 = add nsw i32 %mul21, %9 + ;%mul32 = mul nsw i32 %add31, %nx + ;%add34 = add i32 %add9, %mul32 + ;%idxprom35 = sext i32 %add34 to i64 + ;%arrayidx36 = getelementptr inbounds float* %A0, i64 %idxprom35 + ;%18 = load float* %arrayidx36, align 4, !tbaa !2 + ;%add37 = fadd fast float %add28, %18 + ;%add40 = add nsw i32 %mul21, %add3 + ;%mul41 = mul nsw i32 %add40, %nx + ;%add42 = add i32 %4, 5 + ;%add43 = add i32 %add42, %mul41 + ;%idxprom44 = sext i32 %add43 to i64 + ;%arrayidx45 = getelementptr inbounds float* %A0, i64 %idxprom44 + ;%19 = load float* %arrayidx45, align 4, !tbaa !2 + ;%add46 = fadd fast float 
%add37, %19 + ;%add51 = add i32 %4, 3 + ;%add52 = add i32 %add51, %mul41 + ;%idxprom53 = sext i32 %add52 to i64 + ;%arrayidx54 = getelementptr inbounds float* %A0, i64 %idxprom53 + ;%20 = load float* %arrayidx54, align 4, !tbaa !2 + ;%add55 = fadd fast float %add46, %20 + ;%mul56 = fmul fast float %add55, %c1 + ;%add61 = add i32 %add9, %mul41 + ;%idxprom62 = sext i32 %add61 to i64 + ;%arrayidx63 = getelementptr inbounds float* %A0, i64 %idxprom62 + ;%21 = load float* %arrayidx63, align 4, !tbaa !2 + ;%mul64 = fmul fast float %21, %c0 + ;%sub65 = fsub fast float %mul56, %mul64 + ;%arrayidx72 = getelementptr inbounds float* %Anext, i64 %idxprom62 + ;store float %sub65, float* %arrayidx72, align 4, !tbaa !2 + ;br label %if.end + +;if.end: ; preds = %if.then, %entry + ret %rtype undef +} + +; Function Attrs: nounwind uwtable +define i32 @main(i32 %argc, i8** %argv) #0 { +entry: + %argc.addr = alloca i32, align 4 + %timers = alloca %struct.pb_TimerSet, align 8 + store i32 %argc, i32* %argc.addr, align 4, !tbaa !5 + %0 = bitcast %struct.pb_TimerSet* %timers to i8* + call void @llvm.lifetime.start(i64 800, i8* %0) #1 + %puts = call i32 @puts(i8* getelementptr inbounds ([46 x i8]* @str, i64 0, i64 0)) + %puts186 = call i32 @puts(i8* getelementptr inbounds ([45 x i8]* @str7, i64 0, i64 0)) + %call2 = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1 + %1 = load i32* %argc.addr, align 4, !tbaa !5 + %cmp = icmp slt i32 %1, 5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %puts187 = call i32 @puts(i8* getelementptr inbounds ([106 x i8]* @str8, i64 0, i64 0)) + br label %cleanup + +if.end: ; preds = %entry + %arrayidx = getelementptr inbounds i8** %argv, i64 1 + %2 = load i8** %arrayidx, align 8, !tbaa !6 + %call.i = call i64 @strtol(i8* nocapture %2, i8** null, i32 10) #1 + %conv.i = trunc i64 %call.i to i32 + %cmp5 = icmp slt i32 %conv.i, 1 + br i1 %cmp5, label %cleanup, label %if.end7 + +if.end7: ; preds = %if.end + 
%arrayidx8 = getelementptr inbounds i8** %argv, i64 2 + %3 = load i8** %arrayidx8, align 8, !tbaa !6 + %call.i188 = call i64 @strtol(i8* nocapture %3, i8** null, i32 10) #1 + %conv.i189 = trunc i64 %call.i188 to i32 + %cmp10 = icmp slt i32 %conv.i189, 1 + br i1 %cmp10, label %cleanup, label %if.end12 + +if.end12: ; preds = %if.end7 + %arrayidx13 = getelementptr inbounds i8** %argv, i64 3 + %4 = load i8** %arrayidx13, align 8, !tbaa !6 + %call.i190 = call i64 @strtol(i8* nocapture %4, i8** null, i32 10) #1 + %conv.i191 = trunc i64 %call.i190 to i32 + %cmp15 = icmp slt i32 %conv.i191, 1 + br i1 %cmp15, label %cleanup, label %if.end17 + +if.end17: ; preds = %if.end12 + %arrayidx18 = getelementptr inbounds i8** %argv, i64 4 + %5 = load i8** %arrayidx18, align 8, !tbaa !6 + %call.i192 = call i64 @strtol(i8* nocapture %5, i8** null, i32 10) #1 + %conv.i193 = trunc i64 %call.i192 to i32 + %cmp20 = icmp slt i32 %conv.i193, 1 + br i1 %cmp20, label %cleanup, label %for.cond1.preheader.lr.ph.i + +for.cond1.preheader.lr.ph.i: ; preds = %if.end17 + %mul = shl i64 %call.i, 32 + %mul23 = mul i64 %mul, %call.i188 + %sext = mul i64 %mul23, %call.i190 + %add = ashr exact i64 %sext, 30 + %mul24 = add i64 %add, 12 + %call25 = call noalias i8* @malloc(i64 %mul24) #1 + %6 = bitcast i8* %call25 to float* + %call27 = call noalias i8* @malloc(i64 %mul24) #1 + %7 = bitcast i8* %call27 to float* + %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call2, i64 0, i32 1 + %8 = load i8*** %inpFiles, align 8, !tbaa !6 + %9 = load i8** %8, align 8, !tbaa !6 + %call29 = call %struct._IO_FILE* @fopen(i8* %9, i8* getelementptr inbounds ([3 x i8]* @.str3, i64 0, i64 0)) #1 + %add.ptr = getelementptr inbounds i8* %call25, i64 12 + %10 = bitcast i8* %add.ptr to float* + %cmp24.i = icmp sgt i32 %conv.i189, 0 + %cmp51.i = icmp sgt i32 %conv.i, 0 + %or.cond = and i1 %cmp24.i, %cmp51.i + br i1 %or.cond, label %for.cond4.preheader.lr.ph.us.i.preheader.split.us, label %read_data.exit + 
+for.cond4.preheader.lr.ph.us.i.preheader.split.us: ; preds = %for.cond1.preheader.lr.ph.i + %11 = mul i32 %conv.i, %conv.i189 + br label %for.body6.lr.ph.us.us.i.preheader.us + +for.body6.lr.ph.us.us.i.us: ; preds = %for.body6.lr.ph.us.us.i.preheader.us, %for.inc8.us.us.i.us + %j.06.us.us.i.us = phi i32 [ %inc9.us.us.i.us, %for.inc8.us.us.i.us ], [ 0, %for.body6.lr.ph.us.us.i.preheader.us ] + %s.15.us.us.i.us = phi i32 [ %14, %for.inc8.us.us.i.us ], [ %s.09.us.i.us, %for.body6.lr.ph.us.us.i.preheader.us ] + %12 = sext i32 %s.15.us.us.i.us to i64 + br label %for.body6.us.us.i.us + +for.body6.us.us.i.us: ; preds = %for.body6.us.us.i.us, %for.body6.lr.ph.us.us.i.us + %indvars.iv.i.us = phi i64 [ %indvars.iv.next.i.us, %for.body6.us.us.i.us ], [ %12, %for.body6.lr.ph.us.us.i.us ] + %k.03.us.us.i.us = phi i32 [ %inc7.us.us.i.us, %for.body6.us.us.i.us ], [ 0, %for.body6.lr.ph.us.us.i.us ] + %add.ptr.us.us.i.us = getelementptr inbounds float* %10, i64 %indvars.iv.i.us + %13 = bitcast float* %add.ptr.us.us.i.us to i8* + %call.us.us.i.us = call i64 @fread(i8* %13, i64 4, i64 1, %struct._IO_FILE* %call29) #1 + %indvars.iv.next.i.us = add i64 %indvars.iv.i.us, 1 + %inc7.us.us.i.us = add nsw i32 %k.03.us.us.i.us, 1 + %exitcond.i.us = icmp eq i32 %inc7.us.us.i.us, %conv.i + br i1 %exitcond.i.us, label %for.inc8.us.us.i.us, label %for.body6.us.us.i.us + +for.inc8.us.us.i.us: ; preds = %for.body6.us.us.i.us + %14 = add i32 %s.15.us.us.i.us, %conv.i + %inc9.us.us.i.us = add nsw i32 %j.06.us.us.i.us, 1 + %exitcond33.i.us = icmp eq i32 %inc9.us.us.i.us, %conv.i189 + br i1 %exitcond33.i.us, label %for.inc11.us.i.us, label %for.body6.lr.ph.us.us.i.us + +for.inc11.us.i.us: ; preds = %for.inc8.us.us.i.us + %15 = add i32 %11, %s.09.us.i.us + %inc12.us.i.us = add nsw i32 %i.010.us.i.us, 1 + %exitcond34.i.us = icmp eq i32 %inc12.us.i.us, %conv.i191 + br i1 %exitcond34.i.us, label %read_data.exit, label %for.body6.lr.ph.us.us.i.preheader.us + +for.body6.lr.ph.us.us.i.preheader.us: ; preds 
= %for.inc11.us.i.us, %for.cond4.preheader.lr.ph.us.i.preheader.split.us + %i.010.us.i.us = phi i32 [ %inc12.us.i.us, %for.inc11.us.i.us ], [ 0, %for.cond4.preheader.lr.ph.us.i.preheader.split.us ] + %s.09.us.i.us = phi i32 [ %15, %for.inc11.us.i.us ], [ 0, %for.cond4.preheader.lr.ph.us.i.preheader.split.us ] + br label %for.body6.lr.ph.us.us.i.us + +read_data.exit: ; preds = %for.inc11.us.i.us, %for.cond1.preheader.lr.ph.i + %call31 = call i32 @fclose(%struct._IO_FILE* %call29) #1 + call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 + %16 = call i8* @llvm_visc_initializeTimerSet() + store i8* %16, i8** @viscTimerSet_GenVISC + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + call void @llvm.visc.init() + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 15) #1 + call void @llvm_visc_track_mem(i8* %call25, i64 %mul24) #1 + call void @llvm_visc_track_mem(i8* %call27, i64 %mul24) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %call27, i8* %call25, i64 %mul24, i32 4, i1 false) + %sub40 = add nsw i32 %conv.i, 253 + %div = sdiv i32 %sub40, 256 + %mul42 = shl nsw i32 %div, 6 + %sub44 = add nsw i32 %conv.i189, -2 + %sub46 = add nsw i32 %conv.i191, -2 + %call53 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([37 x i8]* @.str4, i64 0, i64 0), i32 %mul42, i32 %sub44, i32 %sub46, i32 64, i32 1, i32 1) #1 + %add56 = add nsw i32 %conv.i189, 1 + %mul57 = mul nsw i32 %add56, %conv.i + %add59 = add nsw i32 %mul57, 129 + %idxprom = sext i32 %add59 to i64 + %arrayidx60 = getelementptr inbounds float* %6, i64 %idxprom + %17 = load float* %arrayidx60, align 4, !tbaa !2 + %conv61 = fpext float %17 to double + %call62 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str5, i64 0, i64 0), double %conv61) #1 + %add67 = add nsw i32 %mul57, 128 + %idxprom68 = sext i32 %add67 to i64 + %arrayidx69 = getelementptr inbounds float* %6, i64 
%idxprom68 + %18 = load float* %arrayidx69, align 4, !tbaa !2 + %conv70 = fpext float %18 to double + %call71 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str6, i64 0, i64 0), double %conv70) #1 + %cmp72194 = icmp sgt i32 %conv.i193, 0 + br i1 %cmp72194, label %for.body, label %for.end + +for.body: ; preds = %for.body, %read_data.exit + %h_A0.0197 = phi float* [ %h_Anext.0196, %for.body ], [ %6, %read_data.exit ] + %h_Anext.0196 = phi float* [ %h_A0.0197, %for.body ], [ %7, %read_data.exit ] + %t.0195 = phi i32 [ %inc, %for.body ], [ 0, %read_data.exit ] + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19) + %in.addr = alloca %struct.arg + %in.addr.c0 = getelementptr %struct.arg* %in.addr, i32 0, i32 0 + %in.addr.c0.cast = fptrunc double 0x3FC5555560000000 to float + store float %in.addr.c0.cast, float* %in.addr.c0 + %in.addr.c1 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 + %in.addr.c1.cast = fptrunc double 0x3F9C71C720000000 to float + store float %in.addr.c1.cast, float* %in.addr.c1 + %in.addr.A0 = getelementptr %struct.arg* %in.addr, i32 0, i32 2 + store float* %h_A0.0197, float** %in.addr.A0 + %in.addr.bytes_A0 = getelementptr %struct.arg* %in.addr, i32 0, i32 3 + store i64 %mul24, i64* %in.addr.bytes_A0 + %in.addr.Anext = getelementptr %struct.arg* %in.addr, i32 0, i32 4 + store float* %h_Anext.0196, float** %in.addr.Anext + %in.addr.bytes_Anext = getelementptr %struct.arg* %in.addr, i32 0, i32 5 + store i64 %mul24, i64* %in.addr.bytes_Anext + %in.addr.nx = getelementptr %struct.arg* %in.addr, i32 0, i32 6 + store i32 %conv.i, i32* %in.addr.nx + %in.addr.ny = getelementptr %struct.arg* %in.addr, i32 0, i32 7 + store i32 %conv.i189, i32* %in.addr.ny + %in.addr.nz = getelementptr %struct.arg* %in.addr, i32 0, i32 8 + store i32 %conv.i191, i32* %in.addr.nz + %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 9 + store i32 64, 
i32* %in.addr.dimX0 + %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 10 + store i32 1, i32* %in.addr.dimY0 + %in.addr.dimZ1 = getelementptr %struct.arg* %in.addr, i32 0, i32 11 + store i32 1, i32* %in.addr.dimZ1 + %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 12 + store i32 %div, i32* %in.addr.dimX1 + %in.addr.dimY2 = getelementptr %struct.arg* %in.addr, i32 0, i32 13 + store i32 %sub44, i32* %in.addr.dimY2 + %in.addr.dimZ2 = getelementptr %struct.arg* %in.addr, i32 0, i32 14 + store i32 %sub46, i32* %in.addr.dimZ2 + %args = bitcast %struct.arg* %in.addr to i8* + call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0) + %graphnaive_kernelInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level2 to i8*), i8* %args) + call void @llvm.visc.wait(i8* %graphnaive_kernelInternal_level2) + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1 + %inc = add nsw i32 %t.0195, 1 + %exitcond = icmp eq i32 %inc, %conv.i193 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %read_data.exit + %h_A0.0.lcssa = phi float* [ %6, %read_data.exit ], [ %h_Anext.0196, %for.body ] + %h_Anext.0.lcssa = phi float* [ %7, %read_data.exit ], [ %h_A0.0197, %for.body ] + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1 + %19 = bitcast float* %h_A0.0.lcssa to i8* + call void @llvm_visc_request_mem(i8* %19, i64 %mul24) #1 + %arrayidx97 = getelementptr inbounds float* %h_A0.0.lcssa, i64 %idxprom + %20 = load float* %arrayidx97, align 4, !tbaa !2 + %conv98 = fpext float %20 to double + %call99 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str5, i64 0, i64 0), double %conv98) #1 + %arrayidx106 = getelementptr inbounds float* %h_A0.0.lcssa, i64 %idxprom68 + %21 = load float* %arrayidx106, align 4, !tbaa !2 + %conv107 = fpext float %21 to 
double + %call108 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str6, i64 0, i64 0), double %conv107) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1 + %22 = bitcast float* %h_Anext.0.lcssa to i8* + call void @llvm_visc_untrack_mem(i8* %22) #1 + call void @llvm_visc_untrack_mem(i8* %19) #1 + call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 + call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 + %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0 + call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr) + call void @llvm.visc.cleanup() + %outFile = getelementptr inbounds %struct.pb_Parameters* %call2, i64 0, i32 0 + %23 = load i8** %outFile, align 8, !tbaa !6 + %tobool = icmp eq i8* %23, null + br i1 %tobool, label %if.end113, label %if.then110 + +if.then110: ; preds = %for.end + %add.ptr112 = getelementptr inbounds float* %h_A0.0.lcssa, i64 3 + call void @outputData(i8* %23, float* %add.ptr112, i32 %conv.i, i32 %conv.i189, i32 %conv.i191) #1 + br label %if.end113 + +if.end113: ; preds = %if.then110, %for.end + call void @free(i8* %22) #1 + call void @free(i8* %19) #1 + call void @pb_FreeParameters(%struct.pb_Parameters* %call2) #1 + br label %cleanup + +cleanup: ; preds = %if.end113, %if.end17, %if.end12, %if.end7, %if.end, %if.then + %retval.0 = phi i32 [ -1, %if.then ], [ 0, %if.end113 ], [ -1, %if.end ], [ -1, %if.end7 ], [ -1, %if.end12 ], [ -1, %if.end17 ] + call void @llvm.lifetime.end(i64 800, i8* %0) #1 + ret i32 %retval.0 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 + +; Function Attrs: nounwind +declare i32 @printf(i8* nocapture, ...) 
#2 + +declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #3 + +; Function Attrs: nounwind +declare noalias i8* @malloc(i64) #2 + +; Function Attrs: nounwind +declare noalias %struct._IO_FILE* @fopen(i8* nocapture, i8* nocapture) #2 + +; Function Attrs: nounwind +declare i32 @fclose(%struct._IO_FILE* nocapture) #2 + +declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #3 + +declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #3 + +declare void @llvm_visc_track_mem(i8*, i64) #3 + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #1 + +declare void @llvm_visc_request_mem(i8*, i64) #3 + +declare void @llvm_visc_untrack_mem(i8*) #3 + +declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #3 + +declare void @outputData(i8*, float*, i32, i32, i32) #3 + +; Function Attrs: nounwind +declare void @free(i8* nocapture) #2 + +declare void @pb_FreeParameters(%struct.pb_Parameters*) #3 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #1 + +; Function Attrs: nounwind +declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #2 + +; Function Attrs: nounwind +declare i64 @strtol(i8*, i8** nocapture, i32) #2 + +; Function Attrs: nounwind +declare i32 @puts(i8* nocapture) #1 + +declare i8* @llvm_visc_initializeTimerSet() + +declare void @llvm_visc_switchToTimer(i8**, i32) + +declare void @llvm_visc_printTimerSet(i8**, i8*) + +; Function Attrs: nounwind +declare void @llvm.visc.init() #1 + +; Function Attrs: nounwind +declare i8* @llvm.visc.getNode() #1 + +; Function Attrs: nounwind +declare i8* @llvm.visc.getParentNode(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1 
+ +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.z(i8*) #1 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumNodeInstances.z(i8*) #1 + +; Function Attrs: nounwind uwtable +define %rtype @naive_kernelInternal_level1(float %c0, float %c1, float* in %A0, i64 %bytes_A0, float* in out %Anext, i64 %bytes_Anext, i32 %nx, i32 %ny, i32 %nz, i32 %dimX, i32 %dimY, i32 %dimZ) #0 { +entry: + %naive_kernel.node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (float, float, float*, i64, float*, i64, i32, i32, i32)* @naive_kernel to i8*), i32 %dimX, i32 %dimY, i32 %dimZ) + call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 8, i32 8) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.bind.input(i8*, i32, i32) #1 + +; Function Attrs: nounwind uwtable +define %rtype @naive_kernelInternal_level2(float %c0, float %c1, float* in %A0, i64 %bytes_A0, float* in out %Anext, i64 %bytes_Anext, i32 %nx, i32 %ny, i32 %nz, i32 %dimX, i32 %dimY, i32 %dimZ, i32 %dimX1, i32 %dimY2, i32 %dimZ3) #0 { +entry: + %naive_kernelInternal_level1.node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level1 to i8*), i32 %dimX1, i32 %dimY2, i32 %dimZ3) + call void 
@llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 1, i32 1) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 2, i32 2) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 3, i32 3) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 4, i32 4) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 5, i32 5) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 6, i32 6) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 7, i32 7) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 8, i32 8) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 9, i32 9) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 10, i32 10) + call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 11, i32 11) + ret %rtype undef +} + +; Function Attrs: nounwind +declare i8* @llvm.visc.launch(i8*, i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.wait(i8*) #1 + +; Function Attrs: nounwind +declare void @llvm.visc.cleanup() #1 + +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } + 
+!visc_hint_gpu = !{} +!visc_hint_cpu = !{!0, !1} + +!0 = metadata !{%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level1} +!1 = metadata !{%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level2} +!2 = metadata !{metadata !"float", metadata !3} +!3 = metadata !{metadata !"omnipotent char", metadata !4} +!4 = metadata !{metadata !"Simple C/C++ TBAA"} +!5 = metadata !{metadata !"int", metadata !3} +!6 = metadata !{metadata !"any pointer", metadata !3} diff --git a/hpvm/test/parboil/benchmarks/tpacf/Makefile b/hpvm/test/parboil/benchmarks/tpacf/Makefile index e76139ba38..6140acd5ac 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/Makefile +++ b/hpvm/test/parboil/benchmarks/tpacf/Makefile @@ -1,9 +1,9 @@ PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil APP = tpacf -# Default compile hpvm +# Default compile visc ifeq ($(VERSION),) - VERSION = hpvm + VERSION = visc endif # Default use small test case diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc index d89d556a10..d945bccf4e 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc @@ -199,7 +199,7 @@ int main(int argc, char **argv) { 3 * f_mem_size, h_x_data, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel); diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc index ef2a21daed..791b5fbdd6 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc @@ -203,7 +203,7 @@ int main(int argc, char 
**argv) { 3 * f_mem_size, h_x_data, 0, NULL, NULL); CHECK_ERROR("clEnqueueWriteBuffer") - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel); diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/Makefile b/hpvm/test/parboil/benchmarks/tpacf/src/visc/Makefile similarity index 82% rename from hpvm/test/parboil/benchmarks/tpacf/src/hpvm/Makefile rename to hpvm/test/parboil/benchmarks/tpacf/src/visc/Makefile index 040e2c7994..ba6459d78a 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/Makefile +++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=args.ll model.ll -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.cc rename to hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h similarity index 100% rename from hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.h rename to hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc similarity index 76% rename from hpvm/test/parboil/benchmarks/tpacf/src/hpvm/main.cc rename to hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc index 49208f579c..3239be6c92 100644 --- a/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/main.cc +++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc @@ -14,7 +14,7 @@ #include "args.h" #include "model.h" -#include <hpvm.h> +#include <visc.h> extern 
unsigned int NUM_SETS; extern unsigned int NUM_ELEMENTS; @@ -62,13 +62,13 @@ void packData(RootIn *args, hist_t *histograms, size_t bytes_histograms, void Allocation(long block) { // Memory shared between threadblocks - // void* data_s = __hpvm__malloc(sizeof(struct cartesian)*BLOCK_SIZE); + // void* data_s = __visc__malloc(sizeof(struct cartesian)*BLOCK_SIZE); void *warp_hists = - __hpvm__malloc(sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS); + __visc__malloc(sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS); - //__hpvm__return(data_s, sizeof(struct cartesian)*BLOCK_SIZE, + //__visc__return(data_s, sizeof(struct cartesian)*BLOCK_SIZE, // warp_hists, sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS); - __hpvm__return(2, warp_hists, + __visc__return(2, warp_hists, sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS); } @@ -80,14 +80,14 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms, // struct cartesian* data_s, size_t bytes_data_s, unsigned int *warp_hists, size_t bytes_warp_hists) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(2, all_x_data, binb, 1, histograms); + __visc__hint(visc::DEVICE); + __visc__attributes(2, all_x_data, binb, 1, histograms); - void *thisNode = __hpvm__getNode(); - void *parentNode = __hpvm__getParentNode(thisNode); - int lx = __hpvm__getNodeInstanceID_x(thisNode); - int gx = __hpvm__getNodeInstanceID_x(parentNode); - int dimx = __hpvm__getNumNodeInstances_x(thisNode); + void *thisNode = __visc__getNode(); + void *parentNode = __visc__getParentNode(thisNode); + int lx = __visc__getNodeInstanceID_x(thisNode); + int gx = __visc__getNodeInstanceID_x(parentNode); + int dimx = __visc__getNumNodeInstances_x(thisNode); float *all_y_data = all_x_data + NUM_ELEMENTS * (NUM_SETS + 1); float *all_z_data = all_y_data + NUM_ELEMENTS * (NUM_SETS + 1); @@ -170,7 +170,7 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms, unsigned int warpnum = tid / (WARP_SIZE / HISTS_PER_WARP); if ((distance < binb[min]) && (distance >= 
binb[max]) && (!do_self || (tid + j > k)) && ((tid + j) < NUM_ELEMENTS)) { - __hpvm__atomic_add((int *)&(warp_hists(bin_index, warpnum)), 1); + __visc__atomic_add((int *)&(warp_hists(bin_index, warpnum)), 1); } } } @@ -181,7 +181,7 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms, for (unsigned int offset = NUM_HISTOGRAMS >> 1; offset > 0; offset >>= 1) { for (unsigned int bin_base = 0; bin_base < NUM_BINS; bin_base += BLOCK_SIZE / (NUM_HISTOGRAMS >> 1)) { - __hpvm__barrier(); + __visc__barrier(); if (warp_index < offset && bin_base + bin_index < NUM_BINS) { unsigned long sum = warp_hists(bin_base + bin_index, warp_index) + @@ -191,7 +191,7 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms, } } - __hpvm__barrier(); + __visc__barrier(); // Put the results back in the real histogram // warp_hists(x, 0) holds sum of all locations of bin x @@ -207,26 +207,26 @@ void BlockingTPACF(hist_t *histograms, size_t bytes_histograms, float *binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, long block) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, all_x_data, binb, 1, histograms); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, all_x_data, binb, 1, histograms); - void *AllocationNode = __hpvm__createNodeND(0, Allocation); - void *TPACFLeafNode = __hpvm__createNodeND(1, TPACFLeaf, block); + void *AllocationNode = __visc__createNodeND(0, Allocation); + void *TPACFLeafNode = __visc__createNodeND(1, TPACFLeaf, block); // Bind Inputs - __hpvm__bindIn(AllocationNode, 8, 0, 0); // Bind block - __hpvm__bindIn(TPACFLeafNode, 0, 0, 0); // Bind histograms - __hpvm__bindIn(TPACFLeafNode, 1, 1, 0); // Bind bytes_histograms - __hpvm__bindIn(TPACFLeafNode, 2, 2, 0); // Bind all_x_data - __hpvm__bindIn(TPACFLeafNode, 3, 3, 0); // Bind bytes_all_data - __hpvm__bindIn(TPACFLeafNode, 4, 4, 0); // Bind binb - __hpvm__bindIn(TPACFLeafNode, 5, 5, 0); // Bind bytes_binb - __hpvm__bindIn(TPACFLeafNode, 6, 6, 0); // Bind NUM_SETS - 
__hpvm__bindIn(TPACFLeafNode, 7, 7, 0); // Bind NUM_ELEMENTS + __visc__bindIn(AllocationNode, 8, 0, 0); // Bind block + __visc__bindIn(TPACFLeafNode, 0, 0, 0); // Bind histograms + __visc__bindIn(TPACFLeafNode, 1, 1, 0); // Bind bytes_histograms + __visc__bindIn(TPACFLeafNode, 2, 2, 0); // Bind all_x_data + __visc__bindIn(TPACFLeafNode, 3, 3, 0); // Bind bytes_all_data + __visc__bindIn(TPACFLeafNode, 4, 4, 0); // Bind binb + __visc__bindIn(TPACFLeafNode, 5, 5, 0); // Bind bytes_binb + __visc__bindIn(TPACFLeafNode, 6, 6, 0); // Bind NUM_SETS + __visc__bindIn(TPACFLeafNode, 7, 7, 0); // Bind NUM_ELEMENTS // Create Edges - __hpvm__edge(AllocationNode, TPACFLeafNode, 1, 0, 8, 0); // Edge warp_hists - __hpvm__edge(AllocationNode, TPACFLeafNode, 1, 1, 9, + __visc__edge(AllocationNode, TPACFLeafNode, 1, 0, 8, 0); // Edge warp_hists + __visc__edge(AllocationNode, TPACFLeafNode, 1, 1, 9, 0); // Edge bytes_warp_hists } @@ -236,21 +236,21 @@ void TPACFRoot(hist_t *histograms, size_t bytes_histograms, float *all_x_data, float *binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, long block, long grid) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, all_x_data, binb, 1, histograms); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, all_x_data, binb, 1, histograms); - void *BlockingTPACFNode = __hpvm__createNodeND(1, BlockingTPACF, grid); + void *BlockingTPACFNode = __visc__createNodeND(1, BlockingTPACF, grid); // Bind Inputs - __hpvm__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms - __hpvm__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms - __hpvm__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data - __hpvm__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data - __hpvm__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb - __hpvm__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb - __hpvm__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS - __hpvm__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS - 
__hpvm__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block + __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms + __visc__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms + __visc__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data + __visc__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data + __visc__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb + __visc__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb + __visc__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS + __visc__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS + __visc__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block } void TPACFWrapper(hist_t *histograms, size_t bytes_histograms, @@ -258,22 +258,22 @@ void TPACFWrapper(hist_t *histograms, size_t bytes_histograms, // next arg is read-only constant float *binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, long block, long grid) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, all_x_data, binb, 1, histograms); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, all_x_data, binb, 1, histograms); - void *BlockingTPACFNode = __hpvm__createNodeND(0, TPACFRoot); + void *BlockingTPACFNode = __visc__createNodeND(0, TPACFRoot); // Bind Inputs - __hpvm__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms - __hpvm__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms - __hpvm__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data - __hpvm__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data - __hpvm__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb - __hpvm__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb - __hpvm__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS - __hpvm__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS - __hpvm__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block - __hpvm__bindIn(BlockingTPACFNode, 9, 9, 0); // Bind grid + __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms + __visc__bindIn(BlockingTPACFNode, 1, 1, 0); // 
Bind bytes_histograms + __visc__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data + __visc__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data + __visc__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb + __visc__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb + __visc__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS + __visc__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS + __visc__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block + __visc__bindIn(BlockingTPACFNode, 9, 9, 0); // Bind grid } // **===-----------------------------------------------------------===** @@ -324,14 +324,14 @@ int main(int argc, char **argv) { } pb_InitializeTimerSet(&timers); - __hpvm__init(); + __visc__init(); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // split into x, y, and z arrays // AOS to SOA transformation size_t bytes_h_x_data = 3 * f_mem_size; float *h_x_data = (float *)malloc(bytes_h_x_data); - llvm_hpvm_track_mem(h_x_data, bytes_h_x_data); + llvm_visc_track_mem(h_x_data, bytes_h_x_data); float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1); float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1); @@ -349,12 +349,12 @@ int main(int argc, char **argv) { // allocate system memory for final histograms size_t bytes_hists = NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t); hist_t *hists = (hist_t *)malloc(bytes_hists); - llvm_hpvm_track_mem(hists, bytes_hists); + llvm_visc_track_mem(hists, bytes_hists); // Initialize the boundary constants for bin search size_t bytes_binb = (NUM_BINS + 1) * sizeof(float); float *binb = (float *)malloc(bytes_binb); - llvm_hpvm_track_mem(binb, bytes_binb); + llvm_visc_track_mem(binb, bytes_binb); for (int k = 0; k < NUM_BINS + 1; k++) { binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) / @@ -369,17 +369,17 @@ int main(int argc, char **argv) { RootIn *graph_args = (RootIn *)malloc(sizeof(RootIn)); packData(graph_args, hists, bytes_hists, h_x_data, bytes_h_x_data, binb, bytes_binb, 
NUM_SETS, NUM_ELEMENTS, block, grid); - pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION); + pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION); - void *TPACF_DFG = __hpvm__launch(0, TPACFRoot, (void *)graph_args); - __hpvm__wait(TPACF_DFG); + void *TPACF_DFG = __visc__launch(0, TPACFRoot, (void *)graph_args); + __visc__wait(TPACF_DFG); pb_SwitchToTimer(&timers, pb_TimerID_COPY); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // **===-------------------------------------------------------------===** - llvm_hpvm_request_mem(hists, bytes_hists); + llvm_visc_request_mem(hists, bytes_hists); // references into output histograms hist_t *dd_hist = hists; hist_t *rr_hist = dd_hist + NUM_BINS; @@ -407,7 +407,7 @@ int main(int argc, char **argv) { pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); - __hpvm__cleanup(); + __visc__cleanup(); FILE *outfile; if ((outfile = fopen(params->outFile, "w")) == NULL) { diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc similarity index 100% rename from hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.cc rename to hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h similarity index 100% rename from hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.h rename to hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h diff --git a/hpvm/test/parboil/common/include/parboil.h b/hpvm/test/parboil/common/include/parboil.h index ba25726c02..30ad6721c3 100644 --- a/hpvm/test/parboil/common/include/parboil.h +++ b/hpvm/test/parboil/common/include/parboil.h @@ -102,23 +102,23 @@ enum pb_TimerID { * host activity: automatically filled in, * not intended for direct usage */ // GPU FUNCTION - hpvm_TimerID_INIT_CTX, - hpvm_TimerID_CLEAR_CTX, - hpvm_TimerID_COPY_SCALAR, - hpvm_TimerID_COPY_PTR, - hpvm_TimerID_MEM_FREE, - hpvm_TimerID_READ_OUTPUT, - 
hpvm_TimerID_SETUP, - hpvm_TimerID_MEM_TRACK, - hpvm_TimerID_MEM_UNTRACK, - hpvm_TimerID_MISC, + visc_TimerID_INIT_CTX, + visc_TimerID_CLEAR_CTX, + visc_TimerID_COPY_SCALAR, + visc_TimerID_COPY_PTR, + visc_TimerID_MEM_FREE, + visc_TimerID_READ_OUTPUT, + visc_TimerID_SETUP, + visc_TimerID_MEM_TRACK, + visc_TimerID_MEM_UNTRACK, + visc_TimerID_MISC, // LAUNCH FUNCTION - hpvm_TimerID_PTHREAD_CREATE, - hpvm_TimerID_ARG_PACK, - hpvm_TimerID_ARG_UNPACK, - hpvm_TimerID_COMPUTATION, - hpvm_TimerID_OUTPUT_PACK, - hpvm_TimerID_OUTPUT_UNPACK, + visc_TimerID_PTHREAD_CREATE, + visc_TimerID_ARG_PACK, + visc_TimerID_ARG_UNPACK, + visc_TimerID_COMPUTATION, + visc_TimerID_OUTPUT_PACK, + visc_TimerID_OUTPUT_UNPACK, pb_TimerID_LAST /* Number of timer IDs */ }; diff --git a/hpvm/test/parboil/common/mk/hpvm.mk b/hpvm/test/parboil/common/mk/visc.mk similarity index 81% rename from hpvm/test/parboil/common/mk/hpvm.mk rename to hpvm/test/parboil/common/mk/visc.mk index 1c59d4d8fd..eb11371ccd 100755 --- a/hpvm/test/parboil/common/mk/hpvm.mk +++ b/hpvm/test/parboil/common/mk/visc.mk @@ -9,38 +9,38 @@ CFLAGS=$(LANG_CFLAGS) $(PLATFORM_CFLAGS) $(APP_CFLAGS) CXXFLAGS=$(LANG_CXXFLAGS) $(PLATFORM_CXXFLAGS) $(APP_CXXFLAGS) LDFLAGS=$(LANG_LDFLAGS) $(PLATFORM_LDFLAGS) $(APP_LDFLAGS) -# HPVM +# VISC LIBCLC_LIB_PATH = $(LLVM_SRC_ROOT)/../libclc/built_libs -#HPVM_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/hpvm-rt -HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt +#VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt +VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt -HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.ll +VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll #LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx--nvidiacl.bc LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx64--nvidiacl.bc #LIBCLC_NVPTX_LIB = nvptx64--nvidiacl.bc LLVM_34_AS = /opt/llvm/bin/llvm-as -TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce +TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce KERNEL_GEN_FLAGS = 
-O3 -target nvptx64-nvidia-nvcl ifeq ($(TARGET),x86) DEVICE = SPIR_TARGET - HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG CFLAGS += -DOPENCL_CPU else ifeq ($(TARGET),seq) DEVICE = CPU_TARGET - HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG else ifeq ($(TARGET),seqx86) DEVICE = CPU_OR_SPIR_TARGET - HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG CFLAGS += -DOPENCL_CPU else ifeq ($(TARGET),seqgpu) DEVICE = CPU_OR_GPU_TARGET - HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG else DEVICE = GPU_TARGET - HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx 
-dfg2llvm-x86 -clearDFG endif CFLAGS += -DDEVICE=$(DEVICE) @@ -49,31 +49,31 @@ CXXFLAGS += -DDEVICE=$(DEVICE) HOST_LINKFLAGS = ifeq ($(TIMER),x86) - HPVM_OPTFLAGS += -hpvm-timers-x86 + VISC_OPTFLAGS += -visc-timers-x86 else ifeq ($(TIMER),ptx) - HPVM_OPTFLAGS += -hpvm-timers-ptx + VISC_OPTFLAGS += -visc-timers-ptx else ifeq ($(TIMER),gen) - TESTGEN_OPTFLAGS += -hpvm-timers-gen + TESTGEN_OPTFLAGS += -visc-timers-gen else ifeq ($(TIMER),spir) - TESTGEN_OPTFLAGS += -hpvm-timers-spir + TESTGEN_OPTFLAGS += -visc-timers-spir else ifeq ($(TIMER),no) else ifeq ($(TARGET),x86) - HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-spir + VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir else ifeq ($(TARGET),seq) - HPVM_OPTFLAGS += -hpvm-timers-x86 + VISC_OPTFLAGS += -visc-timers-x86 else ifeq ($(TARGET),seqx86) - HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-spir + VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir else ifeq ($(TARGET),seqgpu) - HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx + VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx else - HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx + VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx endif - TESTGEN_OPTFLAGS += -hpvm-timers-gen + TESTGEN_OPTFLAGS += -visc-timers-gen endif ifeq ($(DABSTRACTION),true) - HPVM_OPTFLAGS += -hpvm-eda + VISC_OPTFLAGS += -visc-eda endif # Rules common to all makefiles @@ -121,7 +121,7 @@ endif ######################################## OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS)) -TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS)) +TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) PARBOIL_OBJS = $(call INBUILDDIR,parboil.ll) KERNEL = $(TEST_OBJS).kernels.ll KERNEL_OPT = $(BUILDDIR)/$(APP).kernels.opt.ll @@ -190,14 +190,14 @@ $(KERNEL_OPT) : $(KERNEL) $(BIN) : $(HOST_LINKED) $(CXX) -O3 $(LDFLAGS) $< -o $@ -$(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(HPVM_RT_LIB) +$(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(VISC_RT_LIB) $(LLVM_LINK) $^ -S -o $@ -$(HPVM_RT_LIB) : 
$(HPVM_RT_PATH)/hpvm-rt.cpp +$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp make -C $(LLVM_LIB_PATH) -$(HOST) $(KERNEL): $(BUILDDIR)/$(HPVM_OBJS) - $(OPT) --debug $(HPVM_OPTFLAGS) -S $< -o $(HOST) +$(HOST) $(KERNEL): $(BUILDDIR)/$(VISC_OBJS) + $(OPT) --debug $(VISC_OPTFLAGS) -S $< -o $(HOST) $(RUNDIR) : mkdir -p $(RUNDIR) @@ -214,11 +214,11 @@ $(BUILDDIR)/%.ll : $(SRCDIR)/%.cc $(BUILDDIR)/%.ll : $(SRCDIR)/%.cpp $(CXX) $(CXXFLAGS) -S -emit-llvm $< -o $@ -$(BUILDDIR)/%.hpvm.ll: $(BUILDDIR)/%.ll +$(BUILDDIR)/%.visc.ll: $(BUILDDIR)/%.ll $(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@ cat $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil/RUN.parboil.script $@ > $@.tmp - mv $@.tmp $(BUILDDIR)/$(APP).hpvm.ll - #@cp $(HPVM_OBJS) $(BUILDDIR)/$(HPVM_OBJS) + mv $@.tmp $(BUILDDIR)/$(APP).visc.ll + #@cp $(VISC_OBJS) $(BUILDDIR)/$(VISC_OBJS) $(BUILDDIR)/%.o : $(SRCDIR)/%.c $(CC) $(CFLAGS) -c $< -o $@ diff --git a/hpvm/test/parboil/common/platform/hpvm.default.mk b/hpvm/test/parboil/common/platform/visc.default.mk similarity index 61% rename from hpvm/test/parboil/common/platform/hpvm.default.mk rename to hpvm/test/parboil/common/platform/visc.default.mk index ca90d453a3..03a9b0874a 100644 --- a/hpvm/test/parboil/common/platform/hpvm.default.mk +++ b/hpvm/test/parboil/common/platform/visc.default.mk @@ -12,20 +12,20 @@ #OPENCL_LIB_PATH=$(OPENCL_PATH)/lib/x86_64 #build -HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build +VISC_BUILD_DIR = $(LLVM_SRC_ROOT)/../build # gcc (default) -CC = $(HPVM_BUILD_DIR)/bin/clang -OCLBE = $(HPVM_BUILD_DIR)/bin/llvm-cbe -PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -I../../../include +CC = $(VISC_BUILD_DIR)/bin/clang +OCLBE = $(VISC_BUILD_DIR)/bin/llvm-cbe +PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -I../../../include -CXX = $(HPVM_BUILD_DIR)/bin/clang++ -PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -I../../../include +CXX = $(VISC_BUILD_DIR)/bin/clang++ +PLATFORM_CXXFLAGS = 
-I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -I../../../include -LINKER = $(HPVM_BUILD_DIR)/bin/clang++ +LINKER = $(VISC_BUILD_DIR)/bin/clang++ PLATFORM_LDFLAGS = -lm -lpthread -lOpenCL -LLVM_LIB_PATH = $(HPVM_BUILD_DIR)/lib -LLVM_BIN_PATH = $(HPVM_BUILD_DIR)/bin +LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib +LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin OPT = $(LLVM_BIN_PATH)/opt LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link diff --git a/hpvm/test/parboil/driver/options.py b/hpvm/test/parboil/driver/options.py index e15883b753..b80fc16168 100644 --- a/hpvm/test/parboil/driver/options.py +++ b/hpvm/test/parboil/driver/options.py @@ -264,7 +264,7 @@ def time_options(progname, cmd, args): label_ptx = 'NVPTX_Timer' #label_ptx = 'SPIR_Timer' label_x86 = 'X86_Timer' - label_gen = 'GenHPVM_Timer' + label_gen = 'GenVISC_Timer' timings[label_f] = {} timings[label_f]['IO'] = addTime([(label_pb, 'IO')], timings) timings[label_f]['Memory Track'] = addTime([(label_pb, 'Mem_Track')], timings) @@ -297,11 +297,11 @@ def time_options(progname, cmd, args): timerName = 'Parboil' timings[timerName] = {} continue - if line.startswith('Printing HPVM Timer'): - regex = re.search('Printing HPVM Timer: *(?P<name>[a-zA-Z0-9 _]+)', line) + if line.startswith('Printing VISC Timer'): + regex = re.search('Printing VISC Timer: *(?P<name>[a-zA-Z0-9 _]+)', line) timerName = regex.group('name').strip() timings[timerName] = {} - if timerName != 'NVPTX_Timer' and timerName != 'X86_Timer' and timerName != 'GenHPVM_Timer' and timerName != 'KernelTimer' and timerName != 'SPIR_Timer': + if timerName != 'NVPTX_Timer' and timerName != 'X86_Timer' and timerName != 'GenVISC_Timer' and timerName != 'KernelTimer' and timerName != 'SPIR_Timer': print "Warning: Found unknown timer " + timerName continue m = re.search('(?P<timerID>[a-zA-Z _/]+) *: *(?P<value>[0-9]*\.[0-9]*) *$', line) @@ -352,67 +352,67 @@ def time_options(progname, cmd, args): globals.verbose = opts.verbose configs = [ - ('spmv', { 'VERSION' : 
["opencl_nvidia", "hpvm"], + ('spmv', { 'VERSION' : ["opencl_nvidia", "visc"], 'TEST' : [("large", 10)] } ) - ,('sgemm', { 'VERSION' : ["opencl_nvidia", "hpvm_sh"], + ,('sgemm', { 'VERSION' : ["opencl_nvidia", "visc_sh"], 'TEST' : [("4K", 10)] } ) - ,('lbm', { 'VERSION' : ["opencl_nvidia", "hpvm"], + ,('lbm', { 'VERSION' : ["opencl_nvidia", "visc"], 'TEST' : [("long", 10)] } ) - ,('stencil', { 'VERSION' : ["opencl_base", "hpvm"], + ,('stencil', { 'VERSION' : ["opencl_base", "visc"], 'TEST' : [("large", 10)] } ) - ,('bfs', { 'VERSION' : ["opencl_nvidia", "hpvm"], + ,('bfs', { 'VERSION' : ["opencl_nvidia", "visc"], 'TEST' : [("1M", 10), ("SF", 10)] } ) - ,('tpacf', { 'VERSION' : ["opencl_base", "hpvm"], + ,('tpacf', { 'VERSION' : ["opencl_base", "visc"], 'TEST' : [("large", 10)] } ) - ,('cutcp', { 'VERSION' : ["opencl_nvidia", "hpvm"], + ,('cutcp', { 'VERSION' : ["opencl_nvidia", "visc"], 'TEST' : [("large", 10)] } ) - #('histo', { 'VERSION' : ["opencl_nvidia", "hpvm"], + #('histo', { 'VERSION' : ["opencl_nvidia", "visc"], #'TEST' : [("default", 10), ("large", 10)] #} #) - #('spmv', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], + #('spmv', { 'VERSION' : ["opencl_cpu_baseline", "visc"], #'TEST' : [("large", 10), ("huge", 10)] #} #) - #('sgemm', { 'VERSION' : ["opencl_cpu_sm", "hpvm_sh"], + #('sgemm', { 'VERSION' : ["opencl_cpu_sm", "visc_sh"], #'TEST' : [("medium", 1), ("4K", 1)] #} #) - #('lbm', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], + #('lbm', { 'VERSION' : ["opencl_cpu_baseline", "visc"], #'TEST' : [("short", 10), ("long", 10)] #} #) - #,('stencil', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], + #,('stencil', { 'VERSION' : ["opencl_cpu_baseline", "visc"], #'TEST' : [("default", 10), ("large", 10)] #} #) - #('bfs', { 'VERSION' : ["opencl_cpu_baseline", "hpvm_base"], + #('bfs', { 'VERSION' : ["opencl_cpu_baseline", "visc_base"], #'TEST' : [("1M", 5), ("SF", 5)] #} #) - #,('tpacf', { 'VERSION' : ["opencl_cpu_base", "hpvm"], + #,('tpacf', { 'VERSION' : 
["opencl_cpu_base", "visc"], #'TEST' : [("medium", 1), ("large", 1)] #} #) - #,('cutcp', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], + #,('cutcp', { 'VERSION' : ["opencl_cpu_baseline", "visc"], #'TEST' : [("small", 1), ("large", 1)] #} #) - #,('histo', { 'VERSION' : ["opencl_cpu_baseline", "hpvm"], + #,('histo', { 'VERSION' : ["opencl_cpu_baseline", "visc"], #'TEST' : [("default", 1), ("large", 1)] #} #) diff --git a/hpvm/test/parboil/parboilParser.py b/hpvm/test/parboil/parboilParser.py index 5ea1346349..0d1f10b686 100755 --- a/hpvm/test/parboil/parboilParser.py +++ b/hpvm/test/parboil/parboilParser.py @@ -77,7 +77,7 @@ def parseCSVFile(filename): file.close() #print csvDict['a']['b']['c']['d']['e'] - #print csvDict['sgemm']['hpvm']['c']['d']['e'] + #print csvDict['sgemm']['visc']['c']['d']['e'] #print csvDict['sgemm']['opencl_base']['c']['d']['e'] #print csvDict['sgemm']['opencl_base']['small']['d']['e'] #print csvDict['sgemm']['opencl_base']['small']['Final']['e'] @@ -96,14 +96,14 @@ def parseCSVFile(filename): # returns a list of available tests for the given application -# the tests are found based on the hpvm version, because it exists +# the tests are found based on the visc version, because it exists # for all apps in the dict def getTests(app, csvDict): - return csvDict[app]["hpvm"].keys() + return csvDict[app]["visc"].keys() -def isHPVMVersion(version): - return version.startswith("hpvm") +def isViscVersion(version): + return version.startswith("visc") def getAllVersions(csvDict): @@ -142,7 +142,7 @@ def printTimerDecomposition(csvDict, version): # get apps apps = csvDict.keys() - isHPVM = isHPVMVersion(version) + isVisc = isViscVersion(version) # get tests for each app tests = dict() @@ -150,7 +150,7 @@ def printTimerDecomposition(csvDict, version): tests[app] = csvDict[app][version].keys() # list of timer-category pairs - if isHPVM: + if isVisc: timers =[('Final', 'Kernel'), ('Final', 'Load Program Binary'), ('Final', 'Argument Unpack'), @@ -170,7 
+170,7 @@ def printTimerDecomposition(csvDict, version): ('Parboil', 'Clear_Ctx'), ('Final', 'Timer Wall - IO'), ('Final', 'IO'), - ('GenHPVM_Timer', 'Timer Wall Time')] + ('GenVISC_Timer', 'Timer Wall Time')] else: timers =[('Final', 'Init_Ctx'), ('Final', 'Arg_Unpack'), diff --git a/hpvm/test/pipeline/Makefile b/hpvm/test/pipeline/Makefile index 3fc794393c..421c9a8532 100644 --- a/hpvm/test/pipeline/Makefile +++ b/hpvm/test/pipeline/Makefile @@ -23,12 +23,12 @@ CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) EXE = pipeline-$(TARGET) INCLUDES += -I$(SRC_DIR) -I$(CAM_PIPE_SRC_DIR) -INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(HPVM_BUILD_DIR)/include +INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include ## BEGIN HPVM MAKEFILE SRCDIR_OBJS= io.ll OBJS_SRC=src/io.cc -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP = $(EXE) APP_CFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize APP_CXXFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize @@ -39,21 +39,21 @@ OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS) CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS) LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS) -HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt -HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.ll +VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt +VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll -TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce +TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce ifeq ($(TARGET),seq) DEVICE = CPU_TARGET - HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG - HPVM_OPTFLAGS += -hpvm-timers-x86 + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS += -visc-timers-x86 else DEVICE = GPU_TARGET - HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load 
LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG - HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx endif - TESTGEN_OPTFLAGS += -hpvm-timers-gen + TESTGEN_OPTFLAGS += -visc-timers-gen CFLAGS += -DDEVICE=$(DEVICE) CXXFLAGS += -DDEVICE=$(DEVICE) @@ -64,7 +64,7 @@ INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1)) .PRECIOUS: $(BUILD_DIR)/%.ll OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS)) -TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS)) +TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) KERNEL = $(TEST_OBJS).kernels.ll ifeq ($(TARGET),seq) @@ -91,14 +91,14 @@ $(KERNEL_OCL) : $(KERNEL) $(EXE) : $(HOST_LINKED) $(CXX) -O3 $(LDFLAGS) $< -o $@ -$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB) +$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB) $(LLVM_LINK) $^ -S -o $@ -$(HPVM_RT_LIB) : $(HPVM_RT_PATH)/hpvm-rt.cpp +$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp make -C $(LLVM_LIB_PATH) -$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS) - $(OPT) -debug $(HPVM_OPTFLAGS) -S $< -o $(HOST) +$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS) + $(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST) $(BUILD_DIR): mkdir -p $(BUILD_DIR) @@ -109,7 +109,7 @@ $(BUILD_DIR)/%.ll : $(SRC_DIR)/%.cc $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.cc $(CC) $(CXXFLAGS) -emit-llvm -S -o $@ $< -$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll - $(OPT) -debug-only=genhpvm $(TESTGEN_OPTFLAGS) $< -S -o $@ +$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll + $(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@ ## END HPVM MAKEFILE diff --git a/hpvm/test/pipeline/copyToVersions.sh b/hpvm/test/pipeline/copyToVersions.sh index 67551aff2f..3b9c19bad6 100755 --- a/hpvm/test/pipeline/copyToVersions.sh +++ 
b/hpvm/test/pipeline/copyToVersions.sh @@ -1,12 +1,12 @@ -declare -a versionList=("hpvmGPU" "hpvmVector" "hpvmScalar" "hpvmGPU-Scalar-MaxG" "hpvmVector-Scalar-MaxG" "hpvmGPU-Scalar-ZC" "hpvmVector-Scalar-ZC") +declare -a versionList=("viscGPU" "viscVector" "viscScalar" "viscGPU-Scalar-MaxG" "viscVector-Scalar-MaxG" "viscGPU-Scalar-ZC" "viscVector-Scalar-ZC") declare -a fileList=("Makefile" "io.cc" "main.cc") for version in "${versionList[@]}"; do echo $version for filename in "${fileList[@]}"; do - echo cp ./src/hpvm_parallel/$filename ./src/$version/ - cp ./src/hpvm_parallel/$filename ./src/$version/ + echo cp ./src/visc_parallel/$filename ./src/$version/ + cp ./src/visc_parallel/$filename ./src/$version/ done echo done diff --git a/hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll b/hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll similarity index 95% rename from hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll rename to hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll index 8056cc12ee..06ec055bb7 100644 --- a/hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll +++ b/hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll @@ -1,4 +1,4 @@ -; ModuleID = 'build/Gradient_default/main.hpvm.ll' +; ModuleID = 'build/Gradient_default/main.visc.ll' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -167,9 +167,9 @@ entry: ; Function Attrs: nounwind uwtable define %emptyStruct @squareRoot(float* nocapture in %Gx, i64 %bytesGx, float* nocapture in %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n, i32 %dummyH, i32 %dummyV) #2 { entry: - %call3 = tail call i8* @llvm.hpvm.getNode() - %call14 = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3) - %call25 = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3) + 
%call3 = tail call i8* @llvm.visc.getNode() + %call14 = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3) + %call25 = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3) %cmp = icmp slt i32 %call14, %n %cmp3 = icmp slt i32 %call25, %m %or.cond = and i1 %cmp, %cmp3 @@ -198,51 +198,51 @@ if.end: ; preds = %if.then, %entry ; Function Attrs: nounwind uwtable define %emptyStruct.23 @WrapperSquareRoot(float* nocapture in %Gx, i64 %bytesGx, float* nocapture in %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n, i32 %dummyH, i32 %dummyV) #2 { entry: - %squareRoot.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot to i8*), i32 %m, i32 %n) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 0, i32 0, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 1, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 2, i32 2, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 3, i32 3, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 4, i32 4, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 5, i32 5, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 6, i32 6, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 7, i32 7, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 8, i32 8, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 9, i32 9, i1 false) + %squareRoot.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot to i8*), i32 %m, i32 %n) + tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 0, i32 0, i1 false) + tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 1, i32 1, i1 false) + tail call void 
@llvm.visc.bind.input(i8* %squareRoot.node, i32 2, i32 2, i1 false) + tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 3, i32 3, i1 false) + tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 4, i32 4, i1 false) + tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 5, i32 5, i1 false) + tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 6, i32 6, i1 false) + tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 7, i32 7, i1 false) + tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 8, i32 8, i1 false) + tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 9, i32 9, i1 false) ret %emptyStruct.23 undef } ; Function Attrs: nounwind uwtable define %emptyStruct.24 @Gradient(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %Sx, i64 %bytesSx, float* nocapture in %Sy, i64 %bytesSy, float* nocapture out %Gx, i64 %bytesGx, float* nocapture out %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n) #2 { entry: - %WrapperHorizontal_WrapperVertical.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%WrapperHorizontal.WrapperVertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperHorizontal_WrapperVertical to i8*)) - %WrapperSquareRoot.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperSquareRoot to i8*)) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 0, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 2, i32 2, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 3, i32 3, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 
6, i32 4, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 7, i32 5, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 6, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 7, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 8, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 9, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 4, i32 10, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 5, i32 11, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 8, i32 12, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 9, i32 13, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 14, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 15, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 6, i32 0, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 7, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 8, i32 2, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 9, i32 3, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 10, i32 4, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 11, i32 5, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 12, i32 6, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 13, i32 7, i1 false) - %output.repl = tail call i8* 
@llvm.hpvm.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 0, i32 8, i1 false) - %output1.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 1, i32 9, i1 false) + %WrapperHorizontal_WrapperVertical.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%WrapperHorizontal.WrapperVertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperHorizontal_WrapperVertical to i8*)) + %WrapperSquareRoot.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperSquareRoot to i8*)) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 0, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 1, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 2, i32 2, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 3, i32 3, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 6, i32 4, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 7, i32 5, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 6, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 7, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 8, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 9, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 4, i32 10, i1 false) + tail call void @llvm.visc.bind.input(i8* 
%WrapperHorizontal_WrapperVertical.node, i32 5, i32 11, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 8, i32 12, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 9, i32 13, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 14, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 15, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 6, i32 0, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 7, i32 1, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 8, i32 2, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 9, i32 3, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 10, i32 4, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 11, i32 5, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 12, i32 6, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 13, i32 7, i1 false) + %output.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 0, i32 8, i1 false) + %output1.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 1, i32 9, i1 false) ret %emptyStruct.24 undef } @@ -866,7 +866,7 @@ cond.false: ; preds = %land.lhs.true58, %l cond.end: ; preds = %land.lhs.true58 call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 - call void @llvm.hpvm.init() + call void @llvm.visc.init() %103 = load i32** %p.i.i.i.i, align 8, !tbaa !5 %104 = load i32* %103, align 4, !tbaa !9 %arrayidx.i296 = getelementptr inbounds i32* %103, i64 1 @@ -1137,15 +1137,15 @@ 
cond.false87: ; preds = %_Z12getNextFrameRN2 unreachable cond.end88: ; preds = %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit335 - call void @llvm_hpvm_track_mem(i8* %150, i64 %mul65) #1 - call void @llvm_hpvm_track_mem(i8* %106, i64 36) #1 - call void @llvm_hpvm_track_mem(i8* %113, i64 36) #1 + call void @llvm_visc_track_mem(i8* %150, i64 %mul65) #1 + call void @llvm_visc_track_mem(i8* %106, i64 36) #1 + call void @llvm_visc_track_mem(i8* %113, i64 36) #1 %176 = load i8** %data73, align 8, !tbaa !5 - call void @llvm_hpvm_track_mem(i8* %176, i64 %mul65) #1 + call void @llvm_visc_track_mem(i8* %176, i64 %mul65) #1 %177 = load i8** %data74, align 8, !tbaa !5 - call void @llvm_hpvm_track_mem(i8* %177, i64 %mul65) #1 + call void @llvm_visc_track_mem(i8* %177, i64 %mul65) #1 %178 = load i8** %data75, align 8, !tbaa !5 - call void @llvm_hpvm_track_mem(i8* %178, i64 %mul65) #1 + call void @llvm_visc_track_mem(i8* %178, i64 %mul65) #1 %179 = load i8** %data, align 8, !tbaa !5 %180 = bitcast i8* %179 to float* store float* %180, float** %I1.i, align 1, !tbaa !5 @@ -1154,8 +1154,8 @@ cond.end88: ; preds = %_Z12getNextFrameRN2 for.body: ; preds = %for.body, %cond.end88 %j.0480 = phi i32 [ 0, %cond.end88 ], [ %inc, %for.body ] - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @Gradient to i8*), i8* %call66, i1 false) - call void @llvm.hpvm.wait(i8* %graphID) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @Gradient to i8*), i8* %call66, i1 false) + call void @llvm.visc.wait(i8* %graphID) %inc = add i32 %j.0480, 1 %exitcond = icmp eq i32 %inc, 2994 br i1 %exitcond, label %for.end, label %for.body @@ -1163,19 +1163,19 @@ for.body: ; preds = %for.body, %cond.end for.end: ; preds = %for.body call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 
%181 = load i8** %data75, align 8, !tbaa !5 - call void @llvm_hpvm_request_mem(i8* %181, i64 %mul65) #1 + call void @llvm_visc_request_mem(i8* %181, i64 %mul65) #1 %182 = load i8** %data, align 8, !tbaa !5 - call void @llvm_hpvm_untrack_mem(i8* %182) #1 - call void @llvm_hpvm_untrack_mem(i8* %106) #1 - call void @llvm_hpvm_untrack_mem(i8* %113) #1 + call void @llvm_visc_untrack_mem(i8* %182) #1 + call void @llvm_visc_untrack_mem(i8* %106) #1 + call void @llvm_visc_untrack_mem(i8* %113) #1 %183 = load i8** %data73, align 8, !tbaa !5 - call void @llvm_hpvm_untrack_mem(i8* %183) #1 + call void @llvm_visc_untrack_mem(i8* %183) #1 %184 = load i8** %data74, align 8, !tbaa !5 - call void @llvm_hpvm_untrack_mem(i8* %184) #1 + call void @llvm_visc_untrack_mem(i8* %184) #1 %185 = load i8** %data75, align 8, !tbaa !5 - call void @llvm_hpvm_untrack_mem(i8* %185) #1 + call void @llvm_visc_untrack_mem(i8* %185) #1 call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() call void @pb_FreeParameters(%struct.pb_Parameters* %call3) #1 %u.i.i.i342 = getelementptr inbounds %"class.cv::Mat"* %out, i64 0, i32 9 %186 = load %"struct.cv::UMatData"** %u.i.i.i342, align 8, !tbaa !5 @@ -1647,13 +1647,13 @@ declare noalias i8* @malloc(i64) #5 declare void @_ZN2cv12VideoCaptureD1Ev(%"class.cv::VideoCapture"*) #0 -declare void @llvm_hpvm_track_mem(i8*, i64) #0 +declare void @llvm_visc_track_mem(i8*, i64) #0 declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 -declare void @llvm_hpvm_request_mem(i8*, i64) #0 +declare void @llvm_visc_request_mem(i8*, i64) #0 -declare void @llvm_hpvm_untrack_mem(i8*) #0 +declare void @llvm_visc_untrack_mem(i8*) #0 declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 @@ -1713,50 +1713,50 @@ entry: declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 ; Function Attrs: nounwind readnone -declare i8* @llvm.hpvm.getNode() #7 +declare i8* @llvm.visc.getNode() #7 ; 
Function Attrs: nounwind readnone -declare i32 @llvm.hpvm.getNodeInstanceID.x(i8*) #7 +declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #7 ; Function Attrs: nounwind readnone -declare i32 @llvm.hpvm.getNodeInstanceID.y(i8*) #7 +declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #7 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #1 +declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #1 +declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #1 +declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #1 ; Function Attrs: nounwind readonly declare float @llvm.sqrt.f32(float) #8 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #1 +declare i8* @llvm.visc.createNode(i8*) #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #1 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #1 +declare i8* @llvm.visc.launch(i8*, i8*, i1) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #1 +declare void @llvm.visc.wait(i8*) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind define %horizontal.vertical.ty @horizontal_vertical(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_Sx, i64 %n1_bytesSx, float* nocapture out %n1_Gx, i64 %n1_bytesGx, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_Sy, i64 %n2_bytesSy, float* nocapture out %n2_Gy, i64 %n2_bytesGy, i32 %n2_m, i32 %n2_n) #1 { entry: - %call3.i = tail call i8* @llvm.hpvm.getNode() #1 - %call14.i = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* 
%call3.i) #1 - %call25.i = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3.i) #1 + %call3.i = tail call i8* @llvm.visc.getNode() #1 + %call14.i = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3.i) #1 + %call25.i = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3.i) #1 %mul.i = mul nsw i32 %call25.i, %n1_n %add.i = add nsw i32 %mul.i, %call14.i %cmp.i = icmp slt i32 %call14.i, %n1_n @@ -2139,25 +2139,25 @@ vertical.exit: ; preds = %if.end42.2.i67.us, ; Function Attrs: nounwind define %WrapperHorizontal.WrapperVertical.ty @WrapperHorizontal_WrapperVertical(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_Sx, i64 %n1_bytesSx, float* nocapture out %n1_Gx, i64 %n1_bytesGx, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_Sy, i64 %n2_bytesSy, float* nocapture out %n2_Gy, i64 %n2_bytesGy, i32 %n2_m, i32 %n2_n) #1 { entry: - %horizontal_vertical.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical to i8*), i32 %n1_m, i32 %n1_n) - tail call void @llvm.hpvm.bind.output(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 7, i32 7, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 6, i32 6, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 5, i32 5, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 4, i32 4, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 3, i32 3, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 2, i32 2, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 0, 
i32 0, i1 false) - tail call void @llvm.hpvm.bind.output(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 15, i32 15, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 14, i32 14, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 13, i32 13, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 12, i32 12, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 11, i32 11, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 10, i32 10, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 9, i32 9, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 8, i32 8, i1 false) + %horizontal_vertical.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical to i8*), i32 %n1_m, i32 %n1_n) + tail call void @llvm.visc.bind.output(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 7, i32 7, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 6, i32 6, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 5, i32 5, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 4, i32 4, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 3, i32 3, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 2, i32 2, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false) + tail call void 
@llvm.visc.bind.output(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 15, i32 15, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 14, i32 14, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 13, i32 13, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 12, i32 12, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 11, i32 11, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 10, i32 10, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 9, i32 9, i1 false) + tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 8, i32 8, i1 false) ret %WrapperHorizontal.WrapperVertical.ty undef } @@ -2172,9 +2172,9 @@ attributes #7 = { nounwind readnone } attributes #8 = { nounwind readonly } attributes #9 = { noreturn nounwind } -!hpvm_hint_gpu = !{!0, !1} -!hpvm_hint_cpu = !{!2, !3, !4} -!hpvm_hint_spir = !{} +!visc_hint_gpu = !{!0, !1} +!visc_hint_cpu = !{!2, !3, !4} +!visc_hint_spir = !{} !0 = metadata !{%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot} !1 = metadata !{%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical} diff --git a/hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll b/hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll similarity index 95% rename from hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll rename to hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll index aa4a0d19a0..4b04586251 100644 --- a/hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll +++ b/hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll @@ -1,4 +1,4 @@ -; ModuleID = 
'build/Laplacian_default/main.hpvm.ll' +; ModuleID = 'build/Laplacian_default/main.visc.ll' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -170,9 +170,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) #1 ; Function Attrs: nounwind uwtable define %emptyStruct @lincomb(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %D, i64 %bytesD, float* nocapture in %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n, i32 %dummyD, i32 %dummyE) #2 { entry: - %call3 = tail call i8* @llvm.hpvm.getNode() - %call14 = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3) - %call25 = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3) + %call3 = tail call i8* @llvm.visc.getNode() + %call14 = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3) + %call25 = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3) %cmp = icmp slt i32 %call14, %n %cmp3 = icmp slt i32 %call25, %m %or.cond = and i1 %cmp, %cmp3 @@ -202,55 +202,55 @@ if.end: ; preds = %if.then, %entry ; Function Attrs: nounwind uwtable define %emptyStruct.23 @WrapperLincomb(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %D, i64 %bytesD, float* nocapture in %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n, i32 %dummyD, i32 %dummyE) #2 { entry: - %lincomb.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb to i8*), i32 %m, i32 %n) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 0, i32 0, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 1, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 2, i32 2, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 3, i32 3, i1 false) - tail call void 
@llvm.hpvm.bind.input(i8* %lincomb.node, i32 4, i32 4, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 5, i32 5, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 6, i32 6, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 7, i32 7, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 8, i32 8, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 9, i32 9, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 10, i32 10, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 11, i32 11, i1 false) + %lincomb.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb to i8*), i32 %m, i32 %n) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 0, i32 0, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 1, i32 1, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 2, i32 2, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 3, i32 3, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 4, i32 4, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 5, i32 5, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 6, i32 6, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 7, i32 7, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 8, i32 8, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 9, i32 9, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 10, i32 10, i1 false) + tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 11, i32 11, i1 false) ret %emptyStruct.23 undef } ; Function Attrs: nounwind uwtable define %emptyStruct.24 @LaplacianEstimate(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %B, i64 
%bytesB, float* nocapture out %D, i64 %bytesD, float* nocapture out %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n) #2 { entry: - %WrapperDilate_WrapperErode.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%WrapperDilate.WrapperErode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperDilate_WrapperErode to i8*)) - %WrapperLincomb.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperLincomb to i8*)) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 0, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 2, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 3, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 4, i32 4, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 5, i32 5, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 6, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 7, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 8, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 9, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 10, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 11, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 6, i32 12, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 7, i32 
13, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 14, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 15, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 0, i32 0, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 1, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 4, i32 2, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 5, i32 3, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 6, i32 4, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 7, i32 5, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 8, i32 6, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 9, i32 7, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 10, i32 8, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 11, i32 9, i1 false) - %output.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 0, i32 10, i1 false) - %output1.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 1, i32 11, i1 false) + %WrapperDilate_WrapperErode.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%WrapperDilate.WrapperErode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperDilate_WrapperErode to i8*)) + %WrapperLincomb.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperLincomb to i8*)) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 0, i1 false) + tail call void 
@llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 1, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 2, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 3, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 4, i32 4, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 5, i32 5, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 6, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 7, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 8, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 9, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 10, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 11, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 6, i32 12, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 7, i32 13, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 14, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 15, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 0, i32 0, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 1, i32 1, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 4, i32 2, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 5, i32 3, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 6, i32 4, i1 false) + tail call void @llvm.visc.bind.input(i8* 
%WrapperLincomb.node, i32 7, i32 5, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 8, i32 6, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 9, i32 7, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 10, i32 8, i1 false) + tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 11, i32 9, i1 false) + %output.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 0, i32 10, i1 false) + %output1.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 1, i32 11, i1 false) ret %emptyStruct.24 undef } @@ -873,7 +873,7 @@ cond.false: ; preds = %land.lhs.true58, %l cond.end: ; preds = %land.lhs.true58 call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1 - call void @llvm.hpvm.init() + call void @llvm.visc.init() %103 = load i32** %p.i.i.i.i, align 8, !tbaa !5 %104 = load i32* %103, align 4, !tbaa !9 %arrayidx.i290 = getelementptr inbounds i32* %103, i64 1 @@ -1062,18 +1062,18 @@ _Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332: ; preds = %if.then.i328, call void @llvm.lifetime.end(i64 24, i8* %134) #1 %data = getelementptr inbounds %"class.cv::Mat"* %src, i64 0, i32 4 %139 = load i8** %data, align 8, !tbaa !5 - call void @llvm_hpvm_track_mem(i8* %139, i64 %mul65) #1 + call void @llvm_visc_track_mem(i8* %139, i64 %mul65) #1 %arraydecay = getelementptr inbounds [9 x float]* %B, i64 0, i64 0 - call void @llvm_hpvm_track_mem(i8* %106, i64 36) #1 + call void @llvm_visc_track_mem(i8* %106, i64 36) #1 %data81 = getelementptr inbounds %"class.cv::Mat"* %D, i64 0, i32 4 %140 = load i8** %data81, align 8, !tbaa !5 - call void @llvm_hpvm_track_mem(i8* %140, i64 %mul65) #1 + call void @llvm_visc_track_mem(i8* %140, i64 %mul65) #1 %data82 = getelementptr inbounds %"class.cv::Mat"* %E, i64 0, i32 4 %141 = load i8** %data82, align 8, 
!tbaa !5 - call void @llvm_hpvm_track_mem(i8* %141, i64 %mul65) #1 + call void @llvm_visc_track_mem(i8* %141, i64 %mul65) #1 %data83 = getelementptr inbounds %"class.cv::Mat"* %L, i64 0, i32 4 %142 = load i8** %data83, align 8, !tbaa !5 - call void @llvm_hpvm_track_mem(i8* %142, i64 %mul65) #1 + call void @llvm_visc_track_mem(i8* %142, i64 %mul65) #1 %143 = load i8** %data, align 8, !tbaa !5 %144 = bitcast i8* %143 to float* %145 = load i8** %data81, align 8, !tbaa !5 @@ -1126,8 +1126,8 @@ _Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332: ; preds = %if.then.i328, for.body: ; preds = %for.body, %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332 %j.0474 = phi i32 [ 0, %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332 ], [ %inc, %for.body ] - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @LaplacianEstimate to i8*), i8* %call66, i1 false) - call void @llvm.hpvm.wait(i8* %graphID) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @LaplacianEstimate to i8*), i8* %call66, i1 false) + call void @llvm.visc.wait(i8* %graphID) %inc = add nsw i32 %j.0474, 1 %exitcond = icmp eq i32 %inc, 2994 br i1 %exitcond, label %for.end, label %for.body @@ -1135,18 +1135,18 @@ for.body: ; preds = %for.body, %_Z12getN for.end: ; preds = %for.body call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1 %165 = load i8** %data83, align 8, !tbaa !5 - call void @llvm_hpvm_request_mem(i8* %165, i64 %mul65) #1 + call void @llvm_visc_request_mem(i8* %165, i64 %mul65) #1 %166 = load i8** %data, align 8, !tbaa !5 - call void @llvm_hpvm_untrack_mem(i8* %166) #1 - call void @llvm_hpvm_untrack_mem(i8* %106) #1 + call void @llvm_visc_untrack_mem(i8* %166) #1 + call void @llvm_visc_untrack_mem(i8* %106) #1 %167 = load i8** %data81, align 8, !tbaa !5 - call void @llvm_hpvm_untrack_mem(i8* 
%167) #1 + call void @llvm_visc_untrack_mem(i8* %167) #1 %168 = load i8** %data82, align 8, !tbaa !5 - call void @llvm_hpvm_untrack_mem(i8* %168) #1 + call void @llvm_visc_untrack_mem(i8* %168) #1 %169 = load i8** %data83, align 8, !tbaa !5 - call void @llvm_hpvm_untrack_mem(i8* %169) #1 + call void @llvm_visc_untrack_mem(i8* %169) #1 call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() call void @pb_FreeParameters(%struct.pb_Parameters* %call3) #1 %u.i.i.i336 = getelementptr inbounds %"class.cv::Mat"* %out, i64 0, i32 9 %170 = load %"struct.cv::UMatData"** %u.i.i.i336, align 8, !tbaa !5 @@ -1614,13 +1614,13 @@ declare noalias i8* @malloc(i64) #5 declare void @_ZN2cv12VideoCaptureD1Ev(%"class.cv::VideoCapture"*) #0 -declare void @llvm_hpvm_track_mem(i8*, i64) #0 +declare void @llvm_visc_track_mem(i8*, i64) #0 declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0 -declare void @llvm_hpvm_request_mem(i8*, i64) #0 +declare void @llvm_visc_request_mem(i8*, i64) #0 -declare void @llvm_hpvm_untrack_mem(i8*) #0 +declare void @llvm_visc_untrack_mem(i8*) #0 declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0 @@ -1677,47 +1677,47 @@ declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 ; Function Attrs: nounwind readnone -declare i8* @llvm.hpvm.getNode() #7 +declare i8* @llvm.visc.getNode() #7 ; Function Attrs: nounwind readnone -declare i32 @llvm.hpvm.getNodeInstanceID.x(i8*) #7 +declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #7 ; Function Attrs: nounwind readnone -declare i32 @llvm.hpvm.getNodeInstanceID.y(i8*) #7 +declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #7 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #1 +declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #1 +declare void 
@llvm.visc.bind.input(i8*, i32, i32, i1) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #1 +declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #1 +declare i8* @llvm.visc.createNode(i8*) #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #1 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #1 +declare i8* @llvm.visc.launch(i8*, i8*, i1) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #1 +declare void @llvm.visc.wait(i8*) #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind define %dilate.erode.ty @dilate_erode(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_B, i64 %n1_bytesB, float* nocapture out %n1_D, i64 %n1_bytesD, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_B, i64 %n2_bytesB, float* nocapture out %n2_E, i64 %n2_bytesE, i32 %n2_m, i32 %n2_n) #1 { entry: - %call3.i = tail call i8* @llvm.hpvm.getNode() #1 - %call14.i = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3.i) #1 - %call25.i = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3.i) #1 + %call3.i = tail call i8* @llvm.visc.getNode() #1 + %call14.i = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3.i) #1 + %call25.i = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3.i) #1 %cmp.i = icmp slt i32 %call14.i, %n1_n %cmp3.i = icmp slt i32 %call25.i, %n1_m %or.cond.i = and i1 %cmp.i, %cmp3.i @@ -2070,25 +2070,25 @@ erode.exit: ; preds = %dilate.exit, %cond. 
; Function Attrs: nounwind define %WrapperDilate.WrapperErode.ty @WrapperDilate_WrapperErode(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_B, i64 %n1_bytesB, float* nocapture out %n1_D, i64 %n1_bytesD, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_B, i64 %n2_bytesB, float* nocapture out %n2_E, i64 %n2_bytesE, i32 %n2_m, i32 %n2_n) #1 { entry: - %dilate_erode.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode to i8*), i32 %n1_m, i32 %n1_n) - tail call void @llvm.hpvm.bind.output(i8* %dilate_erode.node, i32 0, i32 0, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 7, i32 7, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 6, i32 6, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 5, i32 5, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 4, i32 4, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 3, i32 3, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 2, i32 2, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 1, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 0, i32 0, i1 false) - tail call void @llvm.hpvm.bind.output(i8* %dilate_erode.node, i32 1, i32 1, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 15, i32 15, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 14, i32 14, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 13, i32 13, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 12, i32 12, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 11, i32 11, i1 false) - tail call void 
@llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 10, i32 10, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 9, i32 9, i1 false) - tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 8, i32 8, i1 false) + %dilate_erode.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode to i8*), i32 %n1_m, i32 %n1_n) + tail call void @llvm.visc.bind.output(i8* %dilate_erode.node, i32 0, i32 0, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 7, i32 7, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 6, i32 6, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 5, i32 5, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 4, i32 4, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 3, i32 3, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 2, i32 2, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 1, i32 1, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 0, i32 0, i1 false) + tail call void @llvm.visc.bind.output(i8* %dilate_erode.node, i32 1, i32 1, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 15, i32 15, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 14, i32 14, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 13, i32 13, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 12, i32 12, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 11, i32 11, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 10, i32 10, i1 false) + tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 9, i32 9, i1 false) + tail call void 
@llvm.visc.bind.input(i8* %dilate_erode.node, i32 8, i32 8, i1 false) ret %WrapperDilate.WrapperErode.ty undef } @@ -2103,9 +2103,9 @@ attributes #7 = { nounwind readnone } attributes #8 = { noreturn nounwind } attributes #9 = { nounwind readonly } -!hpvm_hint_gpu = !{!0, !1} -!hpvm_hint_cpu = !{!2, !3, !4} -!hpvm_hint_spir = !{} +!visc_hint_gpu = !{!0, !1} +!visc_hint_cpu = !{!2, !3, !4} +!visc_hint_spir = !{} !0 = metadata !{%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb} !1 = metadata !{%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode} diff --git a/hpvm/test/pipeline/run.sh b/hpvm/test/pipeline/run.sh index 5ac734026b..0c8435764b 100755 --- a/hpvm/test/pipeline/run.sh +++ b/hpvm/test/pipeline/run.sh @@ -4,7 +4,7 @@ echo Pipeline Script $1 $2 version=$1 pos=$2 -if [[ ($version == *"GPU"*) || ($version == "hpvm_parallel") ]] +if [[ ($version == *"GPU"*) || ($version == "visc_parallel") ]] then target="" elif [[ $version == *"Vector"* ]] diff --git a/hpvm/test/pipeline/runscript.sh b/hpvm/test/pipeline/runscript.sh index c95af8f831..5a2933e788 100755 --- a/hpvm/test/pipeline/runscript.sh +++ b/hpvm/test/pipeline/runscript.sh @@ -2,21 +2,21 @@ echo Pipeline Script # Compile all version -make VERSION=hpvmGPU clean -make VERSION=hpvmVector TARGET=x86 clean -make VERSION=hpvmScalar TARGET=seq clean +make VERSION=viscGPU clean +make VERSION=viscVector TARGET=x86 clean +make VERSION=viscScalar TARGET=seq clean -make VERSION=hpvmGPU -make VERSION=hpvmVector TARGET=x86 -make VERSION=hpvmScalar TARGET=seq +make VERSION=viscGPU +make VERSION=viscVector TARGET=x86 +make VERSION=viscScalar TARGET=seq #Run all version -make VERSION=hpvmGPU run & +make VERSION=viscGPU run & ID_GPU=$! -make VERSION=hpvmVector TARGET=x86 run & +make VERSION=viscVector TARGET=x86 run & ID_Vector=$! 
-make VERSION=hpvmScalar TARGET=seq run +make VERSION=viscScalar TARGET=seq run ID_Scalar=$! #echo Wait 60 seconds diff --git a/hpvm/test/pipeline/src/Makefile b/hpvm/test/pipeline/src/Makefile index 55acb2e098..ec39b86f1c 100644 --- a/hpvm/test/pipeline/src/Makefile +++ b/hpvm/test/pipeline/src/Makefile @@ -1,8 +1,8 @@ # (c) 2010 The Board of Trustees of the University of Illinois. -LANGUAGE=hpvm +LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o -HPVM_OBJS=main.hpvm.ll +VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include diff --git a/hpvm/test/pipeline/src/main.cc b/hpvm/test/pipeline/src/main.cc index ef9d8412c7..9314833d25 100644 --- a/hpvm/test/pipeline/src/main.cc +++ b/hpvm/test/pipeline/src/main.cc @@ -13,7 +13,6 @@ #include "opencv2/ocl/ocl.hpp" #include "opencv2/opencv.hpp" #include <cassert> -#include <hpvm.h> #include <iostream> #include <malloc.h> #include <math.h> @@ -21,6 +20,7 @@ #include <stdlib.h> #include <string.h> #include <sys/time.h> +#include <visc.h> #define NUM_RUNS 100 #define DEPTH 3 @@ -147,12 +147,12 @@ void packData(struct InStruct *args, float *I, size_t bytesI, float *Is, void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs, float *Is, size_t bytesIs, long m, long n) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(2, I, Gs, 1, Is); + __visc__hint(visc::DEVICE); + __visc__attributes(2, I, Gs, 1, Is); - void *thisNode = __hpvm__getNode(); - long gx = __hpvm__getNodeInstanceID_x(thisNode); - long gy = __hpvm__getNodeInstanceID_y(thisNode); + void *thisNode = __visc__getNode(); + long gx = __visc__getNodeInstanceID_x(thisNode); + long gy = __visc__getNodeInstanceID_y(thisNode); int gloc = gx + gy * n; @@ -187,26 +187,26 @@ void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs, Is[gloc] = smoothedVal; } - __hpvm__return(2, bytesIs, bytesIs); + __visc__return(2, bytesIs, bytesIs); } void 
WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs, float *Is, size_t bytesIs, long m, long n) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, I, Gs, 1, Is); - void *GSNode = __hpvm__createNodeND(2, gaussianSmoothing, m, n); - __hpvm__bindIn(GSNode, 0, 0, 0); // Bind I - __hpvm__bindIn(GSNode, 1, 1, 0); // Bind bytesI - __hpvm__bindIn(GSNode, 2, 2, 0); // Bind Gs - __hpvm__bindIn(GSNode, 3, 3, 0); // Bind bytesGs - __hpvm__bindIn(GSNode, 4, 4, 0); // Bind Is - __hpvm__bindIn(GSNode, 5, 5, 0); // Bind bytesIs - __hpvm__bindIn(GSNode, 6, 6, 0); // Bind m - __hpvm__bindIn(GSNode, 7, 7, 0); // Bind n - - __hpvm__bindOut(GSNode, 0, 0, 0); // bind output bytesIs - __hpvm__bindOut(GSNode, 1, 1, 0); // bind output bytesIs + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, I, Gs, 1, Is); + void *GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n); + __visc__bindIn(GSNode, 0, 0, 0); // Bind I + __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI + __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs + __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs + __visc__bindIn(GSNode, 4, 4, 0); // Bind Is + __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs + __visc__bindIn(GSNode, 6, 6, 0); // Bind m + __visc__bindIn(GSNode, 7, 7, 0); // Bind n + + __visc__bindOut(GSNode, 0, 0, 0); // bind output bytesIs + __visc__bindOut(GSNode, 1, 1, 0); // bind output bytesIs } /* Compute a non-linear laplacian estimate of input image I of size m x n */ @@ -220,14 +220,14 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs, void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB, float *L, size_t bytesL, long m, long n) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(2, Is, B, 1, L); + __visc__hint(visc::DEVICE); + __visc__attributes(2, Is, B, 1, L); // 3x3 image area float imageArea[SZB * SZB]; - void *thisNode = __hpvm__getNode(); - long gx = __hpvm__getNodeInstanceID_x(thisNode); - long gy = 
__hpvm__getNodeInstanceID_y(thisNode); + void *thisNode = __visc__getNode(); + long gx = __visc__getNodeInstanceID_x(thisNode); + long gy = __visc__getNodeInstanceID_y(thisNode); int i, j; if ((gx < n) && (gy < m)) { @@ -300,25 +300,25 @@ void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB, float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB + 1]; L[gy * n + gx] = laplacian; } - __hpvm__return(1, bytesL); + __visc__return(1, bytesL); } void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB, float *L, size_t bytesL, long m, long n) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, Is, B, 1, L); - void *LNode = __hpvm__createNodeND(2, laplacianEstimate, m, n); - __hpvm__bindIn(LNode, 0, 0, 0); // Bind Is - __hpvm__bindIn(LNode, 1, 1, 0); // Bind bytesIs - __hpvm__bindIn(LNode, 2, 2, 0); // Bind B - __hpvm__bindIn(LNode, 3, 3, 0); // Bind bytesB - __hpvm__bindIn(LNode, 4, 4, 0); // Bind L - __hpvm__bindIn(LNode, 5, 5, 0); // Bind bytesL - __hpvm__bindIn(LNode, 6, 6, 0); // Bind m - __hpvm__bindIn(LNode, 7, 7, 0); // Bind n - - __hpvm__bindOut(LNode, 0, 0, 0); // bind output bytesL + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, Is, B, 1, L); + void *LNode = __visc__createNodeND(2, laplacianEstimate, m, n); + __visc__bindIn(LNode, 0, 0, 0); // Bind Is + __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs + __visc__bindIn(LNode, 2, 2, 0); // Bind B + __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB + __visc__bindIn(LNode, 4, 4, 0); // Bind L + __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL + __visc__bindIn(LNode, 6, 6, 0); // Bind m + __visc__bindIn(LNode, 7, 7, 0); // Bind n + + __visc__bindOut(LNode, 0, 0, 0); // bind output bytesL } /* Compute the zero crossings of input image L of size m x n */ @@ -331,16 +331,16 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B, */ void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB, float *S, size_t bytesS, long 
m, long n) { - __hpvm__hint(hpvm::DEVICE); - //__hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, L, B, 1, S); + __visc__hint(visc::DEVICE); + //__visc__hint(visc::CPU_TARGET); + __visc__attributes(2, L, B, 1, S); // 3x3 image area float imageArea[SZB][SZB]; - void *thisNode = __hpvm__getNode(); - long gx = __hpvm__getNodeInstanceID_x(thisNode); - long gy = __hpvm__getNodeInstanceID_y(thisNode); + void *thisNode = __visc__getNode(); + long gx = __visc__getNodeInstanceID_x(thisNode); + long gy = __visc__getNodeInstanceID_y(thisNode); int i, j; if ((gx < n) && (gy < m)) { @@ -416,25 +416,25 @@ void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB, float pixelSign = dilatedPixel - erodedPixel; S[gy * n + gx] = pixelSign; } - __hpvm__return(1, bytesS); + __visc__return(1, bytesS); } void WrapperComputeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB, float *S, size_t bytesS, long m, long n) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, L, B, 1, S); - void *ZCNode = __hpvm__createNodeND(2, computeZeroCrossings, m, n); - __hpvm__bindIn(ZCNode, 0, 0, 0); // Bind L - __hpvm__bindIn(ZCNode, 1, 1, 0); // Bind bytesL - __hpvm__bindIn(ZCNode, 2, 2, 0); // Bind B - __hpvm__bindIn(ZCNode, 3, 3, 0); // Bind bytesB - __hpvm__bindIn(ZCNode, 4, 4, 0); // Bind S - __hpvm__bindIn(ZCNode, 5, 5, 0); // Bind bytesS - __hpvm__bindIn(ZCNode, 6, 6, 0); // Bind m - __hpvm__bindIn(ZCNode, 7, 7, 0); // Bind n - - __hpvm__bindOut(ZCNode, 0, 0, 0); // bind output bytesS + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, L, B, 1, S); + void *ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n); + __visc__bindIn(ZCNode, 0, 0, 0); // Bind L + __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL + __visc__bindIn(ZCNode, 2, 2, 0); // Bind B + __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB + __visc__bindIn(ZCNode, 4, 4, 0); // Bind S + __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS + __visc__bindIn(ZCNode, 6, 6, 0); // Bind m + 
__visc__bindIn(ZCNode, 7, 7, 0); // Bind n + + __visc__bindOut(ZCNode, 0, 0, 0); // bind output bytesS } /* @@ -458,12 +458,12 @@ void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx, float *Sy, size_t bytesSy, float *G, size_t bytesG, long m, long n) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(3, Is, Sx, Sy, 1, G); + __visc__hint(visc::DEVICE); + __visc__attributes(3, Is, Sx, Sy, 1, G); - void *thisNode = __hpvm__getNode(); - long gx = __hpvm__getNodeInstanceID_x(thisNode); - long gy = __hpvm__getNodeInstanceID_y(thisNode); + void *thisNode = __visc__getNode(); + long gx = __visc__getNodeInstanceID_x(thisNode); + long gy = __visc__getNodeInstanceID_y(thisNode); int gloc = gx + gy * n; @@ -498,27 +498,27 @@ void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx, G[gloc] = sqrt(Gx * Gx + Gy * Gy); } - __hpvm__return(1, bytesG); + __visc__return(1, bytesG); } void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx, float *Sy, size_t bytesSy, float *G, size_t bytesG, long m, long n) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(3, Is, Sx, Sy, 1, G); - void *CGNode = __hpvm__createNodeND(2, computeGradient, m, n); - __hpvm__bindIn(CGNode, 0, 0, 0); // Bind Is - __hpvm__bindIn(CGNode, 1, 1, 0); // Bind bytesIs - __hpvm__bindIn(CGNode, 2, 2, 0); // Bind Sx - __hpvm__bindIn(CGNode, 3, 3, 0); // Bind bytesSx - __hpvm__bindIn(CGNode, 4, 4, 0); // Bind Sy - __hpvm__bindIn(CGNode, 5, 5, 0); // Bind bytesSy - __hpvm__bindIn(CGNode, 6, 6, 0); // Bind G - __hpvm__bindIn(CGNode, 7, 7, 0); // Bind bytesG - __hpvm__bindIn(CGNode, 8, 8, 0); // Bind m - __hpvm__bindIn(CGNode, 9, 9, 0); // Bind n - - __hpvm__bindOut(CGNode, 0, 0, 0); // bind output bytesG + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, Is, Sx, Sy, 1, G); + void *CGNode = __visc__createNodeND(2, computeGradient, m, n); + __visc__bindIn(CGNode, 0, 0, 0); // Bind Is + __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs + 
__visc__bindIn(CGNode, 2, 2, 0); // Bind Sx + __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx + __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy + __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy + __visc__bindIn(CGNode, 6, 6, 0); // Bind G + __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG + __visc__bindIn(CGNode, 8, 8, 0); // Bind m + __visc__bindIn(CGNode, 9, 9, 0); // Bind n + + __visc__bindOut(CGNode, 0, 0, 0); // bind output bytesG } /* @@ -531,13 +531,13 @@ void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx, void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG, size_t bytesMaxG, long m, long n) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(1, G, 1, maxG); + __visc__hint(visc::CPU_TARGET); + __visc__attributes(1, G, 1, maxG); - void *thisNode = __hpvm__getNode(); + void *thisNode = __visc__getNode(); - long lx = __hpvm__getNodeInstanceID_x(thisNode); // threadIdx.x - long dimx = __hpvm__getNumNodeInstances_x(thisNode); // blockDim.x + long lx = __visc__getNodeInstanceID_x(thisNode); // threadIdx.x + long dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x // Assume a single thread block // Thread block iterates over all elements @@ -556,39 +556,39 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG, *maxG = G[lx]; } - __hpvm__return(1, bytesMaxG); + __visc__return(1, bytesMaxG); } void computeMaxGradientTB(float *G, size_t bytesG, float *maxG, size_t bytesMaxG, long m, long n, long block_x) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, G, maxG, 1, maxG); - void *CMGLeafNode = __hpvm__createNodeND(1, computeMaxGradientLeaf, block_x); - __hpvm__bindIn(CMGLeafNode, 0, 0, 0); // Bind G - __hpvm__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG - __hpvm__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG - __hpvm__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG - __hpvm__bindIn(CMGLeafNode, 4, 4, 0); // Bind m - __hpvm__bindIn(CMGLeafNode, 5, 5, 0); // Bind n - - __hpvm__bindOut(CMGLeafNode, 0, 0, 0); // 
bind output bytesMaxG + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, G, maxG, 1, maxG); + void *CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x); + __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G + __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG + __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG + __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG + __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m + __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n + + __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG } void WrapperComputeMaxGradient(float *G, size_t bytesG, float *maxG, size_t bytesMaxG, long m, long n, long block_x, long grid_x) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(2, G, maxG, 1, maxG); - void *CMGTBNode = __hpvm__createNodeND(1, computeMaxGradientTB, grid_x); - __hpvm__bindIn(CMGTBNode, 0, 0, 0); // Bind G - __hpvm__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG - __hpvm__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG - __hpvm__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG - __hpvm__bindIn(CMGTBNode, 4, 4, 0); // Bind m - __hpvm__bindIn(CMGTBNode, 5, 5, 0); // Bind n - __hpvm__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x - - __hpvm__bindOut(CMGTBNode, 0, 0, 0); // bind output bytesMaxG + __visc__hint(visc::CPU_TARGET); + __visc__attributes(2, G, maxG, 1, maxG); + void *CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x); + __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G + __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG + __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG + __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG + __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m + __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n + __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x + + __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output bytesMaxG } /* Reject the zero crossings where the gradient is below a threshold */ @@ -604,39 +604,39 @@ void WrapperComputeMaxGradient(float *G, size_t 
bytesG, float *maxG, void rejectZeroCrossings(float *S, size_t bytesS, float *G, size_t bytesG, float *maxG, size_t bytesMaxG, float *E, size_t bytesE, long m, long n) { - __hpvm__hint(hpvm::DEVICE); - __hpvm__attributes(3, S, G, maxG, 1, E); + __visc__hint(visc::DEVICE); + __visc__attributes(3, S, G, maxG, 1, E); - void *thisNode = __hpvm__getNode(); - int gx = __hpvm__getNodeInstanceID_x(thisNode); - int gy = __hpvm__getNodeInstanceID_y(thisNode); + void *thisNode = __visc__getNode(); + int gx = __visc__getNodeInstanceID_x(thisNode); + int gy = __visc__getNodeInstanceID_y(thisNode); float mG = *maxG; if ((gx < n) && (gy < m)) { E[gy * n + gx] = ((S[gy * n + gx] > 0.0) && (G[gy * n + gx] > THETA * mG)) ? 1.0 : 0.0; } - __hpvm__return(1, bytesE); + __visc__return(1, bytesE); } void WrapperRejectZeroCrossings(float *S, size_t bytesS, float *G, size_t bytesG, float *maxG, size_t bytesMaxG, float *E, size_t bytesE, long m, long n) { - __hpvm__hint(hpvm::CPU_TARGET); - __hpvm__attributes(3, S, G, maxG, 1, E); - void *RZCNode = __hpvm__createNodeND(2, rejectZeroCrossings, m, n); - __hpvm__bindIn(RZCNode, 0, 0, 0); // Bind S - __hpvm__bindIn(RZCNode, 1, 1, 0); // Bind bytesS - __hpvm__bindIn(RZCNode, 2, 2, 0); // Bind G - __hpvm__bindIn(RZCNode, 3, 3, 0); // Bind bytesG - __hpvm__bindIn(RZCNode, 4, 4, 0); // Bind maxG - __hpvm__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG - __hpvm__bindIn(RZCNode, 6, 6, 0); // Bind E - __hpvm__bindIn(RZCNode, 7, 7, 0); // Bind bytesE - __hpvm__bindIn(RZCNode, 8, 8, 0); // Bind m - __hpvm__bindIn(RZCNode, 9, 9, 0); // Bind n - - __hpvm__bindOut(RZCNode, 0, 0, 0); // bind output bytesE + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, S, G, maxG, 1, E); + void *RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n); + __visc__bindIn(RZCNode, 0, 0, 0); // Bind S + __visc__bindIn(RZCNode, 1, 1, 0); // Bind bytesS + __visc__bindIn(RZCNode, 2, 2, 0); // Bind G + __visc__bindIn(RZCNode, 3, 3, 0); // Bind bytesG + 
__visc__bindIn(RZCNode, 4, 4, 0); // Bind maxG + __visc__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG + __visc__bindIn(RZCNode, 6, 6, 0); // Bind E + __visc__bindIn(RZCNode, 7, 7, 0); // Bind bytesE + __visc__bindIn(RZCNode, 8, 8, 0); // Bind m + __visc__bindIn(RZCNode, 9, 9, 0); // Bind n + + __visc__bindOut(RZCNode, 0, 0, 0); // bind output bytesE } // Pipelined Root node @@ -656,80 +656,80 @@ void edgeDetection(float *I, size_t bytesI, // 0 long block_x, // 24 long grid_x // 25 ) { - __hpvm__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E); - __hpvm__hint(hpvm::CPU_TARGET); - void *GSNode = __hpvm__createNodeND(0, WrapperGaussianSmoothing); - void *LNode = __hpvm__createNodeND(0, WrapperlaplacianEstimate); - void *CZCNode = __hpvm__createNodeND(0, WrapperComputeZeroCrossings); - void *CGNode = __hpvm__createNodeND(0, WrapperComputeGradient); - void *CMGNode = __hpvm__createNodeND(0, WrapperComputeMaxGradient); - void *RZCNode = __hpvm__createNodeND(0, WrapperRejectZeroCrossings); + __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E); + __visc__hint(visc::CPU_TARGET); + void *GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing); + void *LNode = __visc__createNodeND(0, WrapperlaplacianEstimate); + void *CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings); + void *CGNode = __visc__createNodeND(0, WrapperComputeGradient); + void *CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient); + void *RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings); // Gaussian Inputs - __hpvm__bindIn(GSNode, 0, 0, 1); // Bind I - __hpvm__bindIn(GSNode, 1, 1, 1); // Bind bytesI - __hpvm__bindIn(GSNode, 14, 2, 1); // Bind Gs - __hpvm__bindIn(GSNode, 15, 3, 1); // Bind bytesGs - __hpvm__bindIn(GSNode, 2, 4, 1); // Bind Is - __hpvm__bindIn(GSNode, 3, 5, 1); // Bind bytesIs - __hpvm__bindIn(GSNode, 22, 6, 1); // Bind m - __hpvm__bindIn(GSNode, 23, 7, 1); // Bind n + __visc__bindIn(GSNode, 0, 0, 1); // Bind I + __visc__bindIn(GSNode, 1, 1, 
1); // Bind bytesI + __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs + __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs + __visc__bindIn(GSNode, 2, 4, 1); // Bind Is + __visc__bindIn(GSNode, 3, 5, 1); // Bind bytesIs + __visc__bindIn(GSNode, 22, 6, 1); // Bind m + __visc__bindIn(GSNode, 23, 7, 1); // Bind n // Laplacian Inputs - __hpvm__bindIn(LNode, 2, 0, 1); // Bind Is - __hpvm__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs - __hpvm__bindIn(LNode, 16, 2, 1); // Bind B - __hpvm__bindIn(LNode, 17, 3, 1); // Bind bytesB - __hpvm__bindIn(LNode, 4, 4, 1); // Bind L - __hpvm__bindIn(LNode, 5, 5, 1); // Bind bytesL - __hpvm__bindIn(LNode, 22, 6, 1); // Bind m - __hpvm__bindIn(LNode, 23, 7, 1); // Bind n + __visc__bindIn(LNode, 2, 0, 1); // Bind Is + __visc__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs + __visc__bindIn(LNode, 16, 2, 1); // Bind B + __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(LNode, 4, 4, 1); // Bind L + __visc__bindIn(LNode, 5, 5, 1); // Bind bytesL + __visc__bindIn(LNode, 22, 6, 1); // Bind m + __visc__bindIn(LNode, 23, 7, 1); // Bind n // Compute ZC Inputs - __hpvm__bindIn(CZCNode, 4, 0, 1); // Bind L - __hpvm__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL - __hpvm__bindIn(CZCNode, 16, 2, 1); // Bind B - __hpvm__bindIn(CZCNode, 17, 3, 1); // Bind bytesB - __hpvm__bindIn(CZCNode, 6, 4, 1); // Bind S - __hpvm__bindIn(CZCNode, 7, 5, 1); // Bind bytesS - __hpvm__bindIn(CZCNode, 22, 6, 1); // Bind m - __hpvm__bindIn(CZCNode, 23, 7, 1); // Bind n + __visc__bindIn(CZCNode, 4, 0, 1); // Bind L + __visc__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL + __visc__bindIn(CZCNode, 16, 2, 1); // Bind B + __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB + __visc__bindIn(CZCNode, 6, 4, 1); // Bind S + __visc__bindIn(CZCNode, 7, 5, 1); // Bind bytesS + __visc__bindIn(CZCNode, 22, 6, 1); // Bind m + __visc__bindIn(CZCNode, 23, 7, 1); // Bind n // Gradient Inputs - __hpvm__bindIn(CGNode, 2, 0, 1); // Bind Is - __hpvm__edge(GSNode, CGNode, 1, 1, 
1, 1); // Get bytesIs - __hpvm__bindIn(CGNode, 18, 2, 1); // Bind Sx - __hpvm__bindIn(CGNode, 19, 3, 1); // Bind bytesSx - __hpvm__bindIn(CGNode, 20, 4, 1); // Bind Sy - __hpvm__bindIn(CGNode, 21, 5, 1); // Bind bytesSy - __hpvm__bindIn(CGNode, 8, 6, 1); // Bind G - __hpvm__bindIn(CGNode, 9, 7, 1); // Bind bytesG - __hpvm__bindIn(CGNode, 22, 8, 1); // Bind m - __hpvm__bindIn(CGNode, 23, 9, 1); // Bind n + __visc__bindIn(CGNode, 2, 0, 1); // Bind Is + __visc__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs + __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx + __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx + __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy + __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy + __visc__bindIn(CGNode, 8, 6, 1); // Bind G + __visc__bindIn(CGNode, 9, 7, 1); // Bind bytesG + __visc__bindIn(CGNode, 22, 8, 1); // Bind m + __visc__bindIn(CGNode, 23, 9, 1); // Bind n // Max Gradient Inputs - __hpvm__bindIn(CMGNode, 8, 0, 1); // Bind G - __hpvm__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG - __hpvm__bindIn(CMGNode, 10, 2, 1); // Bind maxG - __hpvm__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG - __hpvm__bindIn(CMGNode, 22, 4, 1); // Bind m - __hpvm__bindIn(CMGNode, 23, 5, 1); // Bind n - __hpvm__bindIn(CMGNode, 24, 6, 1); // Bind block_x - __hpvm__bindIn(CMGNode, 25, 7, 1); // Bind grid_x + __visc__bindIn(CMGNode, 8, 0, 1); // Bind G + __visc__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG + __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG + __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG + __visc__bindIn(CMGNode, 22, 4, 1); // Bind m + __visc__bindIn(CMGNode, 23, 5, 1); // Bind n + __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x + __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x // Reject ZC Inputs - __hpvm__bindIn(RZCNode, 6, 0, 1); // Bind S - __hpvm__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS - __hpvm__bindIn(RZCNode, 8, 2, 1); // Bind G - __hpvm__bindIn(RZCNode, 9, 3, 1); // Bind bytesG - __hpvm__bindIn(RZCNode, 10, 4, 1); 
// Bind maxG - __hpvm__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG - __hpvm__bindIn(RZCNode, 12, 6, 1); // Bind E - __hpvm__bindIn(RZCNode, 13, 7, 1); // Bind bytesE - __hpvm__bindIn(RZCNode, 22, 8, 1); // Bind m - __hpvm__bindIn(RZCNode, 23, 9, 1); // Bind n - - __hpvm__bindOut(RZCNode, 0, 0, 1); // Bind output + __visc__bindIn(RZCNode, 6, 0, 1); // Bind S + __visc__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS + __visc__bindIn(RZCNode, 8, 2, 1); // Bind G + __visc__bindIn(RZCNode, 9, 3, 1); // Bind bytesG + __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG + __visc__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG + __visc__bindIn(RZCNode, 12, 6, 1); // Bind E + __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE + __visc__bindIn(RZCNode, 22, 8, 1); // Bind m + __visc__bindIn(RZCNode, 23, 9, 1); // Bind n + + __visc__bindOut(RZCNode, 0, 0, 1); // Bind output } } @@ -796,7 +796,7 @@ int main(int argc, char *argv[]) { assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous()); - __hpvm__init(); + __visc__init(); // copy A to device memory I_sz = src.size[0] * src.size[1] * sizeof(float); @@ -843,7 +843,7 @@ int main(int argc, char *argv[]) { for (unsigned j = 0; j < NUM_RUNS; j++) { std::cout << "Run: " << j << "\n"; - void *DFG = __hpvm__launch(1, edgeDetection, (void *)args); + void *DFG = __visc__launch(1, edgeDetection, (void *)args); cap = VideoCapture(inFile); getNextFrame(cap, src); @@ -855,25 +855,25 @@ int main(int argc, char *argv[]) { *maxG = 0.0; - llvm_hpvm_track_mem(src.data, I_sz); - llvm_hpvm_track_mem(Is.data, I_sz); - llvm_hpvm_track_mem(L.data, I_sz); - llvm_hpvm_track_mem(S.data, I_sz); - llvm_hpvm_track_mem(G.data, I_sz); - llvm_hpvm_track_mem(maxG, bytesMaxG); - llvm_hpvm_track_mem(E.data, I_sz); - llvm_hpvm_track_mem(Gs, bytesGs); - llvm_hpvm_track_mem(B, bytesB); - llvm_hpvm_track_mem(Sx, bytesSx); - llvm_hpvm_track_mem(Sy, bytesSy); - - __hpvm__push(DFG, 
args); - void *ret = __hpvm__pop(DFG); + llvm_visc_track_mem(src.data, I_sz); + llvm_visc_track_mem(Is.data, I_sz); + llvm_visc_track_mem(L.data, I_sz); + llvm_visc_track_mem(S.data, I_sz); + llvm_visc_track_mem(G.data, I_sz); + llvm_visc_track_mem(maxG, bytesMaxG); + llvm_visc_track_mem(E.data, I_sz); + llvm_visc_track_mem(Gs, bytesGs); + llvm_visc_track_mem(B, bytesB); + llvm_visc_track_mem(Sx, bytesSx); + llvm_visc_track_mem(Sy, bytesSy); + + __visc__push(DFG, args); + void *ret = __visc__pop(DFG); std::cout << "Returned size: " << *(size_t *)ret << " expected " << I_sz << '\n'; - llvm_hpvm_request_mem(maxG, bytesMaxG); - llvm_hpvm_request_mem(E.data, I_sz); + llvm_visc_request_mem(maxG, bytesMaxG); + llvm_visc_request_mem(E.data, I_sz); Mat in, out; resize(src, in, Size(HEIGHT, WIDTH)); @@ -882,26 +882,26 @@ int main(int argc, char *argv[]) { imshow(input_window, in); waitKey(1); - llvm_hpvm_untrack_mem(src.data); - llvm_hpvm_untrack_mem(Is.data); - llvm_hpvm_untrack_mem(L.data); - llvm_hpvm_untrack_mem(S.data); - llvm_hpvm_untrack_mem(G.data); - llvm_hpvm_untrack_mem(maxG); - llvm_hpvm_untrack_mem(E.data); - llvm_hpvm_untrack_mem(Gs); - llvm_hpvm_untrack_mem(B); - llvm_hpvm_untrack_mem(Sx); - llvm_hpvm_untrack_mem(Sy); + llvm_visc_untrack_mem(src.data); + llvm_visc_untrack_mem(Is.data); + llvm_visc_untrack_mem(L.data); + llvm_visc_untrack_mem(S.data); + llvm_visc_untrack_mem(G.data); + llvm_visc_untrack_mem(maxG); + llvm_visc_untrack_mem(E.data); + llvm_visc_untrack_mem(Gs); + llvm_visc_untrack_mem(B); + llvm_visc_untrack_mem(Sx); + llvm_visc_untrack_mem(Sy); getNextFrame(cap, src); } } else { - __hpvm__push(DFG, args); - __hpvm__pop(DFG); + __visc__push(DFG, args); + __visc__pop(DFG); } - __hpvm__wait(DFG); + __visc__wait(DFG); } - __hpvm__cleanup(); + __visc__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/CreateNodeAndEdge.c b/hpvm/test/unitTests/CreateNodeAndEdge.c index c3f58c95d6..1b6b1cff21 100644 --- a/hpvm/test/unitTests/CreateNodeAndEdge.c 
+++ b/hpvm/test/unitTests/CreateNodeAndEdge.c @@ -1,4 +1,4 @@ -#include "hpvm.h" +#include "visc.h" #include <stdio.h> struct Root { @@ -7,33 +7,33 @@ struct Root { }; void Func1(int *In, int *Out) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(1, In, 1, Out); + __visc__hint(CPU_TARGET); + __visc__attributes(1, In, 1, Out); - __hpvm__return(1, Out); + __visc__return(1, Out); } void Func2(int *BindIn, int *SrcIn, int *Out) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(2, BindIn, SrcIn, 1, Out); + __visc__hint(CPU_TARGET); + __visc__attributes(2, BindIn, SrcIn, 1, Out); - __hpvm__return(1, Out); + __visc__return(1, Out); } void PipeRoot(int *In, int *Out) { - __hpvm__hint(CPU_TARGET); + __visc__hint(CPU_TARGET); - __hpvm__attributes(1, In, 1, Out); + __visc__attributes(1, In, 1, Out); - void *SrcNode = __hpvm__createNodeND(0, Func1); - void *DestNode = __hpvm__createNodeND(0, Func2); + void *SrcNode = __visc__createNodeND(0, Func1); + void *DestNode = __visc__createNodeND(0, Func2); - __hpvm__bindIn(SrcNode, 0, 0, 0); + __visc__bindIn(SrcNode, 0, 0, 0); - __hpvm__bindIn(DestNode, 0, 0, 0); - __hpvm__edge(SrcNode, DestNode, 1, 0, 1, 0); + __visc__bindIn(DestNode, 0, 0, 0); + __visc__edge(SrcNode, DestNode, 1, 0, 1, 0); - __hpvm__bindOut(SrcNode, 0, 0, 0); + __visc__bindOut(SrcNode, 0, 0, 0); } int main(void) { @@ -41,10 +41,10 @@ int main(void) { int Out = 0; struct Root RootArgs = {(int *)&In, (int *)&Out}; - __hpvm__init(); - void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)&RootArgs); - __hpvm__wait(PipeDFG); - __hpvm__cleanup(); + __visc__init(); + void *PipeDFG = __visc__launch(0, PipeRoot, (void *)&RootArgs); + __visc__wait(PipeDFG); + __visc__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/Makefile b/hpvm/test/unitTests/Makefile index 15580e9300..539ee5e8fb 100644 --- a/hpvm/test/unitTests/Makefile +++ b/hpvm/test/unitTests/Makefile @@ -2,8 +2,8 @@ PASSES := .PHONY: clean -LLVM_INSTALL:=/home/psrivas2/Hetero/HPVM/Code/trunk/llvm-install 
-LIBCLC:=/home/psrivas2/Hetero/HPVM/Code/trunk/libclc +LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install +LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc HOST:=gemm_opencl KERNELS:=matrixMul LLVM_CC:=$(LLVM_INSTALL)/bin/clang diff --git a/hpvm/test/unitTests/MallocIntrinsic.c b/hpvm/test/unitTests/MallocIntrinsic.c index 173f6b3b16..cfd041a991 100644 --- a/hpvm/test/unitTests/MallocIntrinsic.c +++ b/hpvm/test/unitTests/MallocIntrinsic.c @@ -1,4 +1,4 @@ -#include "hpvm.h" +#include "visc.h" #include <stdlib.h> struct Root { @@ -7,12 +7,12 @@ struct Root { }; void PipeRoot(int *In, int *Out) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(1, In, 1, Out); + __visc__hint(CPU_TARGET); + __visc__attributes(1, In, 1, Out); - Out = (int *)__hpvm__malloc(*In); + Out = (int *)__visc__malloc(*In); - __hpvm__return(1, Out); + __visc__return(1, Out); } int main(void) { @@ -26,12 +26,12 @@ int main(void) { RootArgs->input = (int *)&In; RootArgs->output = (int *)&Out; - __hpvm__init(); + __visc__init(); - void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs); - __hpvm__wait(PipeDFG); + void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs); + __visc__wait(PipeDFG); - __hpvm__cleanup(); + __visc__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/PipelineIntrinsics.c b/hpvm/test/unitTests/PipelineIntrinsics.c index 43ba0ef56c..2a9bf83402 100644 --- a/hpvm/test/unitTests/PipelineIntrinsics.c +++ b/hpvm/test/unitTests/PipelineIntrinsics.c @@ -1,4 +1,4 @@ -#include "hpvm.h" +#include "visc.h" #include <stdlib.h> struct Root { @@ -7,9 +7,9 @@ struct Root { }; void PipeRoot(int *In, int *Out) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(1, In, 1, Out); - __hpvm__return(1, Out); + __visc__hint(CPU_TARGET); + __visc__attributes(1, In, 1, Out); + __visc__return(1, Out); } int main(void) { @@ -23,12 +23,12 @@ int main(void) { RootArgs->input = (int *)&In; RootArgs->output = (int *)&Out; - __hpvm__init(); + __visc__init(); - void *PipeDFG = 
__hpvm__launch(0, PipeRoot, (void *)RootArgs); - __hpvm__wait(PipeDFG); + void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs); + __visc__wait(PipeDFG); - __hpvm__cleanup(); + __visc__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c index c2deed9867..36fc02d22b 100644 --- a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c +++ b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c @@ -1,4 +1,4 @@ -#include "hpvm.h" +#include "visc.h" #include <stdlib.h> struct Root { @@ -7,24 +7,24 @@ struct Root { }; void PipeRoot(int *In, int *Out) { - __hpvm__hint(CPU_TARGET); - __hpvm__attributes(1, In, 1, Out); - __hpvm__return(1, Out); + __visc__hint(CPU_TARGET); + __visc__attributes(1, In, 1, Out); + __visc__return(1, Out); } int main(void) { int In, Out; - __hpvm__init(); + __visc__init(); struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root)); RootArgs->input = (int *)&In; RootArgs->output = (int *)&Out; - void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs); - __hpvm__wait(PipeDFG); + void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs); + __visc__wait(PipeDFG); - __hpvm__cleanup(); + __visc__cleanup(); return 0; } diff --git a/hpvm/test/unitTests/temp/3level.ll b/hpvm/test/unitTests/temp/3level.ll index 2e3753f140..168e7b4232 100644 --- a/hpvm/test/unitTests/temp/3level.ll +++ b/hpvm/test/unitTests/temp/3level.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/3level.ll' @@ -13,31 +13,31 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant 
[4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getNode() #0 +declare i8* @llvm.visc.getNode() #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { @@ -47,18 +47,18 @@ entry: %0 = load i8** %arrayidx, align 8, !tbaa !0 %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0 %conv.i = trunc i64 %call.i to i32 - call void @llvm.hpvm.init() + call void @llvm.visc.init() %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.visc.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 
%output1 = extractvalue %rtype %outputstruct, 0 %output2 = extractvalue %rtype %outputstruct, 1 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output1) #0 %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output2) #0 ret i32 0 @@ -83,21 +83,21 @@ define %rtype_internal @foo(i32 %id) { } define %rtype_internal @subNode(i32 %id) { - %foo_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @foo to i8*)) - call void @llvm.hpvm.bind.input(i8* %foo_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %foo_node, i32 0, i32 0) + %foo_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @foo to i8*)) + call void @llvm.visc.bind.input(i8* %foo_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %foo_node, i32 0, i32 0) ret %rtype_internal zeroinitializer } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @producer to i8*)) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @consumer to i8*)) - %sub_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @subNode to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) - call void @llvm.hpvm.bind.input(i8* %sub_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %sub_node, i32 0, i32 1) + %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @producer to i8*)) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @consumer to i8*)) + %sub_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @subNode to i8*)) + %edge = call i8* 
@llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %sub_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %sub_node, i32 0, i32 1) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/Makefile b/hpvm/test/unitTests/temp/Makefile index 15580e9300..539ee5e8fb 100644 --- a/hpvm/test/unitTests/temp/Makefile +++ b/hpvm/test/unitTests/temp/Makefile @@ -2,8 +2,8 @@ PASSES := .PHONY: clean -LLVM_INSTALL:=/home/psrivas2/Hetero/HPVM/Code/trunk/llvm-install -LIBCLC:=/home/psrivas2/Hetero/HPVM/Code/trunk/libclc +LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install +LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc HOST:=gemm_opencl KERNELS:=matrixMul LLVM_CC:=$(LLVM_INSTALL)/bin/clang diff --git a/hpvm/test/unitTests/temp/query2D.ll b/hpvm/test/unitTests/temp/query2D.ll index 48358a3527..c994c2a3ff 100644 --- a/hpvm/test/unitTests/temp/query2D.ll +++ b/hpvm/test/unitTests/temp/query2D.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/query2D.ll' @@ -12,46 +12,46 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* 
@llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0 +declare i8* @llvm.visc.createNode1D(i8*, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #0 +declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getNode() #0 +declare i8* @llvm.visc.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getParentNode(i8*) #0 +declare i8* @llvm.visc.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumDims(i8*) #0 +declare i32 @llvm.visc.getNumDims(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0 +declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { @@ -61,25 +61,25 @@ entry: %0 = load i8** %arrayidx, align 8, !tbaa !0 %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0 %conv.i = trunc i64 %call.i to i32 - call void @llvm.hpvm.init() + call void @llvm.visc.init() %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root 
to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.visc.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.hpvm.getNode() - %dim = call i32 @llvm.hpvm.getNumNodeInstances.x(i8* %this_node) + %this_node = call i8* @llvm.visc.getNode() + %dim = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) %sum2 = add i32 %sum, %dim %output = insertvalue %rtype undef, i32 %sum2, 0 ret %rtype %output @@ -92,11 +92,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %dimension) { - %p_node = call i8* @llvm.hpvm.createNode2D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 %dimension) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 %dimension) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/query3D.ll b/hpvm/test/unitTests/temp/query3D.ll index 
d2ff16ef56..438fe60a3b 100644 --- a/hpvm/test/unitTests/temp/query3D.ll +++ b/hpvm/test/unitTests/temp/query3D.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/query3D.ll' @@ -12,57 +12,57 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0 +declare i8* @llvm.visc.createNode1D(i8*, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #0 +declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode3D(i8*, i32, i32, i32) #0 +declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) 
+declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getNode() #0 +declare i8* @llvm.visc.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getParentNode(i8*) #0 +declare i8* @llvm.visc.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumDims(i8*) #0 +declare i32 @llvm.visc.getNumDims(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0 +declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumNodeInstances.y(i8*) #0 +declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0 ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -71,21 +71,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.visc.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.hpvm.getNode() - %dim = call i32 @llvm.hpvm.getNumNodeInstances.y(i8* %this_node) + %this_node = call i8* 
@llvm.visc.getNode() + %dim = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node) %sum2 = add i32 %sum, %dim %output = insertvalue %rtype undef, i32 %sum2, 0 ret %rtype %output @@ -98,11 +98,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %dimension) { - %p_node = call i8* @llvm.hpvm.createNode3D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 10, i32 30) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 10, i32 30) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/queryNodeInst.ll b/hpvm/test/unitTests/temp/queryNodeInst.ll index 4e3dd75530..24d6a3f0d3 100644 --- a/hpvm/test/unitTests/temp/queryNodeInst.ll +++ b/hpvm/test/unitTests/temp/queryNodeInst.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll' @@ -12,40 +12,40 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: 
nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getNode() #0 +declare i8* @llvm.visc.getNode() #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumDims(i8*) #0 +declare i32 @llvm.visc.getNumDims(i8*) #0 ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -54,21 +54,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.visc.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 
0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.hpvm.getNode() - %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node) + %this_node = call i8* @llvm.visc.getNode() + %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node) %sum2 = add i32 %sum, %numDim %output = insertvalue %rtype undef, i32 %sum, 0 ret %rtype %output @@ -81,11 +81,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/queryNumDim.ll b/hpvm/test/unitTests/temp/queryNumDim.ll index caa0978dab..500e2ff41b 100644 --- a/hpvm/test/unitTests/temp/queryNumDim.ll +++ b/hpvm/test/unitTests/temp/queryNumDim.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; 
RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll' @@ -12,42 +12,42 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getNode() #0 +declare i8* @llvm.visc.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getParentNode(i8*) #0 +declare i8* @llvm.visc.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumDims(i8*) #0 +declare i32 @llvm.visc.getNumDims(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -56,21 +56,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 
%conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.visc.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.hpvm.getNode() - %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node) + %this_node = call i8* @llvm.visc.getNode() + %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node) %sum2 = add i32 %sum, %numDim %output = insertvalue %rtype undef, i32 %sum, 0 ret %rtype %output @@ -83,11 +83,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* 
%c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/queryNumNodeInst.ll b/hpvm/test/unitTests/temp/queryNumNodeInst.ll index 07418ff725..48add92f16 100644 --- a/hpvm/test/unitTests/temp/queryNumNodeInst.ll +++ b/hpvm/test/unitTests/temp/queryNumNodeInst.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll' @@ -12,48 +12,48 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0 +declare i8* @llvm.visc.createNode1D(i8*, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getNode() #0 +declare i8* @llvm.visc.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getParentNode(i8*) #0 +declare i8* @llvm.visc.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumDims(i8*) #0 +declare i32 
@llvm.visc.getNumDims(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0 +declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -62,21 +62,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.visc.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.hpvm.getNode() - %dim = call i32 @llvm.hpvm.getNumNodeInstances.x(i8* %this_node) + %this_node = call i8* @llvm.visc.getNode() + %dim = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) %sum2 = add i32 %sum, %dim %output = insertvalue %rtype undef, i32 %sum2, 0 ret %rtype %output @@ -89,11 +89,11 @@ define %rtype @consumer(i32 %id) 
{ } define %rtype @Root(i32 %dimension) { - %p_node = call i8* @llvm.hpvm.createNode1D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.visc.createNode1D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/singleNode.ll b/hpvm/test/unitTests/temp/singleNode.ll index 99e5318131..20713e955f 100644 --- a/hpvm/test/unitTests/temp/singleNode.ll +++ b/hpvm/test/unitTests/temp/singleNode.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/singleNode.ll' @@ -12,43 +12,43 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* 
@llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0 %conv.i = trunc i64 %call.i to i32 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) - call void @llvm.hpvm.cleanup() + call void @llvm.visc.wait(i8* %graphID) + call void @llvm.visc.cleanup() ret i32 0 } @@ -59,8 +59,8 @@ define %rtype @foo() { } define %rtype @Root() { - %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo to i8*)) - call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0) + %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo to i8*)) + call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/singleNodeStream.ll 
b/hpvm/test/unitTests/temp/singleNodeStream.ll index aa0243603c..fce75df671 100644 --- a/hpvm/test/unitTests/temp/singleNodeStream.ll +++ b/hpvm/test/unitTests/temp/singleNodeStream.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll' @@ -14,39 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #0 +declare i8* @llvm.visc.launch(i8*, i8*, i1) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.push(i8*, i8*) #0 +declare void @llvm.visc.push(i8*, i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.pop(i8*) #0 +declare i8* @llvm.visc.pop(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) +declare void @llvm.visc.bind.input(i8*, i32, i32, i1) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) +declare void @llvm.visc.bind.output(i8*, i32, i32, i1) ; Function Attrs: nounwind uwtable 
define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr = alloca %struct.arg %num = alloca i32 %arrayidx = getelementptr inbounds i8** %argv, i64 1 @@ -60,27 +60,27 @@ entry: %args = bitcast %struct.arg* %in.addr to i8* ; Launch the pipeline - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rptype (i32*, i64)* @Root to i8*), i8* %args, i1 1) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rptype (i32*, i64)* @Root to i8*), i8* %args, i1 1) ; Push arguments into the pipeline - call void @llvm.hpvm.push(i8* %graphID, i8* %args) - call void @llvm.hpvm.push(i8* %graphID, i8* %args) - call void @llvm.hpvm.push(i8* %graphID, i8* %args) - call void @llvm.hpvm.push(i8* %graphID, i8* %args) + call void @llvm.visc.push(i8* %graphID, i8* %args) + call void @llvm.visc.push(i8* %graphID, i8* %args) + call void @llvm.visc.push(i8* %graphID, i8* %args) + call void @llvm.visc.push(i8* %graphID, i8* %args) ; Pop out arguments and read the output - %graph_output = call i8* @llvm.hpvm.pop(i8* %graphID) - %graph_output1 = call i8* @llvm.hpvm.pop(i8* %graphID) - %graph_output2 = call i8* @llvm.hpvm.pop(i8* %graphID) - %graph_output3 = call i8* @llvm.hpvm.pop(i8* %graphID) + %graph_output = call i8* @llvm.visc.pop(i8* %graphID) + %graph_output1 = call i8* @llvm.visc.pop(i8* %graphID) + %graph_output2 = call i8* @llvm.visc.pop(i8* %graphID) + %graph_output3 = call i8* @llvm.visc.pop(i8* %graphID) %output.addr = bitcast i8* %graph_output to %rptype* %outputstruct = load %rptype* %output.addr %output = extractvalue %rptype %outputstruct, 0 %output_val = load i32* %output %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8]* @out.str, i64 0, i64 0), i32 %output_val) #0 - call void @llvm.hpvm.wait(i8* %graphID) - call void @llvm.hpvm.cleanup() + call void @llvm.visc.wait(i8* %graphID) + call void @llvm.visc.cleanup() ret i32 0 } @@ -95,11 +95,11 @@ define %rptype @producer(i32* 
%id, i64 %size) { } define %rptype @Root(i32* %id, i64 %size) { - %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*)) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0, i1 1) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 1, i32 1, i1 1) - call void @llvm.hpvm.bind.output(i8* %p_node, i32 0, i32 0, i1 1) - call void @llvm.hpvm.bind.output(i8* %p_node, i32 1, i32 1, i1 1) + %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*)) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0, i1 1) + call void @llvm.visc.bind.input(i8* %p_node, i32 1, i32 1, i1 1) + call void @llvm.visc.bind.output(i8* %p_node, i32 0, i32 0, i1 1) + call void @llvm.visc.bind.output(i8* %p_node, i32 1, i32 1, i1 1) ret %rptype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoLaunch.ll b/hpvm/test/unitTests/temp/twoLaunch.ll index ee602f58d8..48c973a7e6 100644 --- a/hpvm/test/unitTests/temp/twoLaunch.ll +++ b/hpvm/test/unitTests/temp/twoLaunch.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/singleNode.ll' @@ -12,33 +12,33 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* 
@llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr_1 = alloca %struct.arg %in.addr_2= alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 @@ -47,12 +47,12 @@ entry: %conv.i = trunc i64 %call.i to i32 %args_1 = bitcast %struct.arg* %in.addr_1 to i8* %args_2 = bitcast %struct.arg* %in.addr_2 to i8* - %graphID_1 = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root_1 to i8*), i8* %args_1) - %graphID_2 = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root_2 to i8*), i8* %args_2) + %graphID_1 = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root_1 to i8*), i8* %args_1) + %graphID_2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root_2 to i8*), i8* %args_2) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID_1) - call void @llvm.hpvm.wait(i8* %graphID_2) - call void @llvm.hpvm.cleanup() + call void @llvm.visc.wait(i8* %graphID_1) + call void @llvm.visc.wait(i8* %graphID_2) + call void @llvm.visc.cleanup() ret i32 0 } @@ -70,14 +70,14 @@ define %rtype @foo_2() { } define %rtype @Root_1() { - %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo_1 to i8*)) - call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0) + 
%node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo_1 to i8*)) + call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0) ret %rtype zeroinitializer } define %rtype @Root_2() { - %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo_2 to i8*)) - call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0) + %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo_2 to i8*)) + call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoNode.ll b/hpvm/test/unitTests/temp/twoNode.ll index 74e4c64d59..5e2899830b 100644 --- a/hpvm/test/unitTests/temp/twoNode.ll +++ b/hpvm/test/unitTests/temp/twoNode.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll' @@ -11,33 +11,33 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: 
nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -46,10 +46,10 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) - call void @llvm.hpvm.cleanup() + call void @llvm.visc.wait(i8* %graphID) + call void @llvm.visc.cleanup() ret i32 0 } @@ -66,10 +66,10 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @producer to i8*)) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @producer to i8*)) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoNodeConnect.ll 
b/hpvm/test/unitTests/temp/twoNodeConnect.ll index 6b23ad691b..06652b94e0 100644 --- a/hpvm/test/unitTests/temp/twoNodeConnect.ll +++ b/hpvm/test/unitTests/temp/twoNodeConnect.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll' @@ -11,33 +11,33 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ 
-46,14 +46,14 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.visc.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() ret i32 0 } @@ -70,11 +70,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoNodeQuery.ll b/hpvm/test/unitTests/temp/twoNodeQuery.ll index 247d1830da..2e1ea0dba4 100644 --- a/hpvm/test/unitTests/temp/twoNodeQuery.ll +++ b/hpvm/test/unitTests/temp/twoNodeQuery.ll @@ -1,5 +1,5 
@@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeQuery.ll' @@ -11,42 +11,42 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32) +declare void @llvm.visc.bind.input(i8*, i32, i32) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32) +declare void @llvm.visc.bind.output(i8*, i32, i32) ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getNode() #0 +declare i8* @llvm.visc.getNode() #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.getParentNode(i8*) #0 +declare i8* @llvm.visc.getParentNode(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.hpvm.getNumDims(i8*) #0 +declare i32 @llvm.visc.getNumDims(i8*) #0 ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void 
@llvm.visc.init() %in.addr = alloca %struct.arg %arrayidx = getelementptr inbounds i8** %argv, i64 1 %0 = load i8** %arrayidx, align 8, !tbaa !0 @@ -55,21 +55,21 @@ entry: %1 = bitcast %struct.arg* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - call void @llvm.hpvm.wait(i8* %graphID) + call void @llvm.visc.wait(i8* %graphID) %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output = extractvalue %rtype %outputstruct, 0 %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 - call void @llvm.hpvm.cleanup() + call void @llvm.visc.cleanup() ret i32 0 } define %rtype @producer(i32 %id) { %sum = add i32 4, %id - %this_node = call i8* @llvm.hpvm.getNode() - %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node) + %this_node = call i8* @llvm.visc.getNode() + %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node) %sum2 = add i32 %sum, %numDim %output = insertvalue %rtype undef, i32 %sum, 0 ret %rtype %output @@ -82,11 +82,11 @@ define %rtype @consumer(i32 %id) { } define %rtype @Root(i32 %id) { - %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0) + %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*)) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast 
(%rtype (i32)* @consumer to i8*)) + %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0) + call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0) ret %rtype zeroinitializer } diff --git a/hpvm/test/unitTests/temp/twoNodeStream.ll b/hpvm/test/unitTests/temp/twoNodeStream.ll index f9820abd19..6e99259518 100644 --- a/hpvm/test/unitTests/temp/twoNodeStream.ll +++ b/hpvm/test/unitTests/temp/twoNodeStream.ll @@ -1,5 +1,5 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin ; RUN: %t.bin 5 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll' @@ -14,39 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ; Function Attrs: nounwind -declare void @llvm.hpvm.init() #1 +declare void @llvm.visc.init() #1 ; Function Attrs: nounwind -declare void @llvm.hpvm.cleanup() #1 +declare void @llvm.visc.cleanup() #1 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createNode(i8*) #0 +declare i8* @llvm.visc.createNode(i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #0 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #0 +declare i8* @llvm.visc.launch(i8*, i8*, i1) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.push(i8*, i8*) #0 +declare void @llvm.visc.push(i8*, i8*) #0 ; Function Attrs: nounwind -declare i8* @llvm.hpvm.pop(i8*) #0 +declare i8* @llvm.visc.pop(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.wait(i8*) #0 +declare void 
@llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) +declare void @llvm.visc.bind.input(i8*, i32, i32, i1) ; Function Attrs: nounwind -declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) +declare void @llvm.visc.bind.output(i8*, i32, i32, i1) ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture %argv) #1 { entry: - call void @llvm.hpvm.init() + call void @llvm.visc.init() %in.addr = alloca %struct.arg %num = alloca i32 %arrayidx = getelementptr inbounds i8** %argv, i64 1 @@ -60,21 +60,21 @@ entry: %args = bitcast %struct.arg* %in.addr to i8* ; Launch the pipeline - %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rctype (i32*, i64)* @Root to i8*), i8* %args, i1 1) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rctype (i32*, i64)* @Root to i8*), i8* %args, i1 1) ; Push arguments into the pipeline - call void @llvm.hpvm.push(i8* %graphID, i8* %args) + call void @llvm.visc.push(i8* %graphID, i8* %args) ; Pop out arguments and read the output - %graph_output = call i8* @llvm.hpvm.pop(i8* %graphID) + %graph_output = call i8* @llvm.visc.pop(i8* %graphID) %output.addr = bitcast i8* %graph_output to %rctype* %outputstruct = load %rctype* %output.addr %output = extractvalue %rctype %outputstruct, 0 %output_val = load i32* %output %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8]* @out.str, i64 0, i64 0), i32 %output_val) #0 - call void @llvm.hpvm.wait(i8* %graphID) - call void @llvm.hpvm.cleanup() + call void @llvm.visc.wait(i8* %graphID) + call void @llvm.visc.cleanup() ret i32 0 } @@ -97,14 +97,14 @@ define %rctype @consumer(i32* %id, i64 %size) { } define %rctype @Root(i32* %id, i64 %size) { - %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*)) - %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rctype (i32*, i64)* @consumer to i8*)) - %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 
false, i32 0, i32 0, i1 1) - %edge2 = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 1, i32 1, i1 1) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0, i1 1) - call void @llvm.hpvm.bind.input(i8* %p_node, i32 1, i32 1, i1 0) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0, i1 1) - call void @llvm.hpvm.bind.output(i8* %c_node, i32 1, i32 1, i1 1) + %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*)) + %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rctype (i32*, i64)* @consumer to i8*)) + %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0, i1 1) + %edge2 = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 1, i32 1, i1 1) + call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0, i1 1) + call void @llvm.visc.bind.input(i8* %p_node, i32 1, i32 1, i1 0) + call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0, i1 1) + call void @llvm.visc.bind.output(i8* %c_node, i32 1, i32 1, i1 1) ret %rctype zeroinitializer } -- GitLab