From 654a6b2ef1223676a4abcbabaca4ec426d2dab7f Mon Sep 17 00:00:00 2001
From: Ben Schreiber <bjschre2@illinois.edu>
Date: Tue, 21 Jan 2020 00:26:29 -0600
Subject: [PATCH] Remove unused parboil benchmarks

---
 .../test/parboil/benchmarks/histo/DESCRIPTION |    3 -
 hpvm/test/parboil/benchmarks/histo/Makefile   |   38 -
 .../parboil/benchmarks/histo/histo.host.ll    | 1362 ------------
 .../parboil/benchmarks/histo/histo.nvptx.s    | 1846 -----------------
 .../benchmarks/histo/src/base/Makefile        |    3 -
 .../parboil/benchmarks/histo/src/base/bmp.h   |   96 -
 .../parboil/benchmarks/histo/src/base/main.c  |  123 --
 .../parboil/benchmarks/histo/src/base/util.c  |   90 -
 .../parboil/benchmarks/histo/src/base/util.h  |    9 -
 .../benchmarks/histo/src/cuda/Makefile        |    5 -
 .../parboil/benchmarks/histo/src/cuda/bmp.h   |   96 -
 .../benchmarks/histo/src/cuda/histo_final.cu  |  108 -
 .../histo/src/cuda/histo_intermediates.cu     |   66 -
 .../benchmarks/histo/src/cuda/histo_main.cu   |  189 --
 .../histo/src/cuda/histo_prescan.cu           |   85 -
 .../parboil/benchmarks/histo/src/cuda/main.cu |  232 ---
 .../parboil/benchmarks/histo/src/cuda/util.cu |   90 -
 .../parboil/benchmarks/histo/src/cuda/util.h  |   28 -
 .../benchmarks/histo/src/cuda_base/Makefile   |    5 -
 .../benchmarks/histo/src/cuda_base/bmp.h      |   88 -
 .../histo/src/cuda_base/histo_final.cu        |   95 -
 .../src/cuda_base/histo_intermediates.cu      |   58 -
 .../histo/src/cuda_base/histo_main.cu         |  184 --
 .../histo/src/cuda_base/histo_prescan.cu      |   77 -
 .../benchmarks/histo/src/cuda_base/main.cu    |  260 ---
 .../benchmarks/histo/src/cuda_base/util.cu    |   82 -
 .../benchmarks/histo/src/cuda_base/util.h     |   20 -
 .../benchmarks/histo/src/omp_base/Makefile    |    5 -
 .../benchmarks/histo/src/omp_base/bmp.h       |   96 -
 .../benchmarks/histo/src/omp_base/main.c      |  127 --
 .../benchmarks/histo/src/omp_base/util.c      |   90 -
 .../benchmarks/histo/src/omp_base/util.h      |    9 -
 .../benchmarks/histo/src/opencl_base/Makefile |    4 -
 .../histo/src/opencl_base/OpenCL_common.cpp   |  299 ---
 .../histo/src/opencl_base/OpenCL_common.h     |   22 -
 .../benchmarks/histo/src/opencl_base/bmp.h    |   96 -
 .../histo/src/opencl_base/histo_final.cl      |  103 -
 .../src/opencl_base/histo_intermediates.cl    |  189 --
 .../histo/src/opencl_base/histo_main.cl       |  193 --
 .../histo/src/opencl_base/histo_prescan.cl    |   85 -
 .../benchmarks/histo/src/opencl_base/main.cpp |  458 ----
 .../benchmarks/histo/src/opencl_base/util.cpp |   90 -
 .../benchmarks/histo/src/opencl_base/util.h   |   17 -
 .../histo/src/opencl_cpu_baseline/Makefile    |    4 -
 .../src/opencl_cpu_baseline/OpenCL_common.cpp |  299 ---
 .../src/opencl_cpu_baseline/OpenCL_common.h   |   22 -
 .../histo/src/opencl_cpu_baseline/bmp.h       |   96 -
 .../src/opencl_cpu_baseline/histo_final.cl    |  108 -
 .../histo_intermediates.cl                    |   63 -
 .../src/opencl_cpu_baseline/histo_main.cl     |  189 --
 .../src/opencl_cpu_baseline/histo_prescan.cl  |   85 -
 .../histo/src/opencl_cpu_baseline/kernel.cl   |  468 -----
 .../src/opencl_cpu_baseline/kernel_x64.ll     |  884 --------
 .../src/opencl_cpu_baseline/kernel_x64.spir   |  Bin 8576 -> 0 bytes
 .../histo/src/opencl_cpu_baseline/main.cpp    |  486 -----
 .../histo/src/opencl_cpu_baseline/util.cpp    |   90 -
 .../histo/src/opencl_cpu_baseline/util.h      |   17 -
 .../histo/src/opencl_nvidia/Makefile          |    4 -
 .../histo/src/opencl_nvidia/OpenCL_common.cpp |  299 ---
 .../histo/src/opencl_nvidia/OpenCL_common.h   |   22 -
 .../benchmarks/histo/src/opencl_nvidia/bmp.h  |   96 -
 .../histo/src/opencl_nvidia/histo_final.cl    |  108 -
 .../src/opencl_nvidia/histo_intermediates.cl  |   63 -
 .../histo/src/opencl_nvidia/histo_main.cl     |  189 --
 .../histo/src/opencl_nvidia/histo_prescan.cl  |   85 -
 .../histo/src/opencl_nvidia/kernel.cl         |  456 ----
 .../histo/src/opencl_nvidia/main.cpp          |  480 -----
 .../histo/src/opencl_nvidia/util.cpp          |   90 -
 .../benchmarks/histo/src/opencl_nvidia/util.h |   17 -
 .../benchmarks/histo/src/threaded/FauxBlock.c |  271 ---
 .../benchmarks/histo/src/threaded/FauxBlock.h | 1233 -----------
 .../benchmarks/histo/src/threaded/Makefile    |    5 -
 .../benchmarks/histo/src/threaded/bmp.h       |   96 -
 .../benchmarks/histo/src/threaded/main.c      |  252 ---
 .../benchmarks/histo/src/threaded/util.c      |   90 -
 .../benchmarks/histo/src/threaded/util.h      |   16 -
 .../benchmarks/histo/src/visc/Makefile        |    8 -
 .../parboil/benchmarks/histo/src/visc/bmp.h   |   96 -
 .../benchmarks/histo/src/visc/main.cpp        | 1275 ------------
 .../benchmarks/histo/src/visc/util.cpp        |   90 -
 .../parboil/benchmarks/histo/src/visc/util.h  |   17 -
 .../benchmarks/histo/src/visc_one/main.cpp    | 1272 ------------
 .../benchmarks/histo/tools/compare-output     |   13 -
 hpvm/test/parboil/benchmarks/kmeans/Makefile  |   42 -
 .../benchmarks/kmeans/src/opencl/Makefile     |    7 -
 .../benchmarks/kmeans/src/opencl/README       |    9 -
 .../benchmarks/kmeans/src/opencl/cluster.c    |  156 --
 .../benchmarks/kmeans/src/opencl/getopt.c     | 1184 -----------
 .../benchmarks/kmeans/src/opencl/getopt.h     |  191 --
 .../benchmarks/kmeans/src/opencl/kmeans.cl    |   56 -
 .../benchmarks/kmeans/src/opencl/kmeans.cpp   |  357 ----
 .../benchmarks/kmeans/src/opencl/kmeans.h     |   64 -
 .../kmeans/src/opencl/kmeans_clustering.c     |  177 --
 .../benchmarks/kmeans/src/opencl/read_input.c |  329 ---
 .../benchmarks/kmeans/src/opencl/rmse.c       |   95 -
 .../parboil/benchmarks/kmeans/src/opencl/run  |    1 -
 .../benchmarks/kmeans/src/opencl/unistd.h     |  945 ---------
 .../benchmarks/kmeans/src/visc/Makefile       |    8 -
 .../parboil/benchmarks/kmeans/src/visc/README |    9 -
 .../benchmarks/kmeans/src/visc/cluster.c      |  157 --
 .../benchmarks/kmeans/src/visc/getopt.c       | 1184 -----------
 .../benchmarks/kmeans/src/visc/getopt.h       |  191 --
 .../benchmarks/kmeans/src/visc/kmeans.cl      |   56 -
 .../benchmarks/kmeans/src/visc/kmeans.cpp     |  239 ---
 .../benchmarks/kmeans/src/visc/kmeans.h       |   64 -
 .../kmeans/src/visc/kmeans_clustering.c       |  181 --
 .../benchmarks/kmeans/src/visc/read_input.c   |  332 ---
 .../parboil/benchmarks/kmeans/src/visc/rmse.c |   95 -
 .../parboil/benchmarks/kmeans/src/visc/run    |    1 -
 .../benchmarks/kmeans/src/visc/unistd.h       |  945 ---------
 .../benchmarks/kmeans/tools/compare-output    |   65 -
 .../parboil/benchmarks/linear-svm/Makefile    |   33 -
 .../benchmarks/linear-svm/linear-svm.visc.ll  |  501 -----
 .../linear-svm/src/visc_cm/Makefile           |    9 -
 .../benchmarks/linear-svm/src/visc_cm/io.cc   |   91 -
 .../benchmarks/linear-svm/src/visc_cm/main.cc |  187 --
 hpvm/test/parboil/benchmarks/llvm-40-34.py    |   37 -
 .../benchmarks/merge-tests/DESCRIPTION        |   10 -
 .../parboil/benchmarks/merge-tests/Makefile   |   34 -
 .../merge-tests/src/2DLeaf/Makefile           |    9 -
 .../benchmarks/merge-tests/src/2DLeaf/io.cc   |   91 -
 .../benchmarks/merge-tests/src/2DLeaf/main.cc |  286 ---
 .../merge-tests/src/2ILeaf/Makefile           |    9 -
 .../benchmarks/merge-tests/src/2ILeaf/io.cc   |   91 -
 .../benchmarks/merge-tests/src/2ILeaf/main.cc |  284 ---
 .../merge-tests/src/2ILeafD/Makefile          |    9 -
 .../benchmarks/merge-tests/src/2ILeafD/io.cc  |   91 -
 .../merge-tests/src/2ILeafD/main.cc           |  296 ---
 .../merge-tests/src/2ILeafS/Makefile          |    9 -
 .../benchmarks/merge-tests/src/2ILeafS/io.cc  |   91 -
 .../merge-tests/src/2ILeafS/main.cc           |  296 ---
 .../merge-tests/src/2LevelACAC/Makefile       |    9 -
 .../merge-tests/src/2LevelACAC/io.cc          |   91 -
 .../merge-tests/src/2LevelACAC/main.cc        |  400 ----
 .../merge-tests/src/2LevelACC/Makefile        |    9 -
 .../merge-tests/src/2LevelACC/io.cc           |   91 -
 .../merge-tests/src/2LevelACC/main.cc         |  378 ----
 .../merge-tests/src/2LevelCAC/Makefile        |    9 -
 .../merge-tests/src/2LevelCAC/io.cc           |   91 -
 .../merge-tests/src/2LevelCAC/main.cc         |  378 ----
 .../merge-tests/src/2LevelCC/Makefile         |    9 -
 .../benchmarks/merge-tests/src/2LevelCC/io.cc |   91 -
 .../merge-tests/src/2LevelCC/main.cc          |  381 ----
 .../merge-tests/src/2LevelICC/Makefile        |    9 -
 .../merge-tests/src/2LevelICC/io.cc           |   91 -
 .../merge-tests/src/2LevelICC/main.cc         |  373 ----
 .../merge-tests/tools/compare-output          |   42 -
 .../benchmarks/mri-gridding/DESCRIPTION       |    4 -
 .../mri-gridding/src/base/CPU_kernels.c       |  205 --
 .../mri-gridding/src/base/CPU_kernels.h       |   23 -
 .../benchmarks/mri-gridding/src/base/Makefile |    4 -
 .../mri-gridding/src/base/UDTypes.h           |   38 -
 .../benchmarks/mri-gridding/src/base/main.c   |  194 --
 .../mri-gridding/src/cuda-base/CPU_kernels.c  |  205 --
 .../mri-gridding/src/cuda-base/CPU_kernels.h  |   25 -
 .../src/cuda-base/CUDA_interface.cu           |  268 ---
 .../src/cuda-base/CUDA_interface.h            |   20 -
 .../mri-gridding/src/cuda-base/GPU_kernels.cu |  164 --
 .../mri-gridding/src/cuda-base/Makefile       |    7 -
 .../mri-gridding/src/cuda-base/UDTypes.h      |   38 -
 .../mri-gridding/src/cuda-base/main.cu        |  245 ---
 .../src/cuda-base/scanLargeArray.cu           |  267 ---
 .../src/cuda-base/scanLargeArray.h            |    9 -
 .../mri-gridding/src/cuda-base/sort.cu        |  254 ---
 .../mri-gridding/src/cuda-base/sort.h         |    9 -
 .../mri-gridding/src/cuda/CPU_kernels.c       |  353 ----
 .../mri-gridding/src/cuda/CPU_kernels.h       |   25 -
 .../mri-gridding/src/cuda/CUDA_interface.cu   |  308 ---
 .../mri-gridding/src/cuda/CUDA_interface.h    |   20 -
 .../mri-gridding/src/cuda/GPU_kernels.cu      |  252 ---
 .../benchmarks/mri-gridding/src/cuda/Makefile |    7 -
 .../mri-gridding/src/cuda/UDTypes.h           |   38 -
 .../benchmarks/mri-gridding/src/cuda/main.cu  |  246 ---
 .../mri-gridding/src/cuda/scanLargeArray.cu   |  267 ---
 .../mri-gridding/src/cuda/scanLargeArray.h    |    9 -
 .../benchmarks/mri-gridding/src/cuda/sort.cu  |  254 ---
 .../benchmarks/mri-gridding/src/cuda/sort.h   |    9 -
 .../mri-gridding/src/omp_base/CPU_kernels.c   |  214 --
 .../mri-gridding/src/omp_base/CPU_kernels.h   |   23 -
 .../mri-gridding/src/omp_base/Makefile        |    6 -
 .../mri-gridding/src/omp_base/UDTypes.h       |   38 -
 .../mri-gridding/src/omp_base/main.c          |  194 --
 .../src/opencl_base/CPU_kernels.c             |  205 --
 .../src/opencl_base/CPU_kernels.h             |   25 -
 .../src/opencl_base/GPU_kernels.cl            |  176 --
 .../mri-gridding/src/opencl_base/Makefile     |    5 -
 .../src/opencl_base/OpenCL_common.cpp         |  294 ---
 .../src/opencl_base/OpenCL_common.h           |   26 -
 .../src/opencl_base/OpenCL_interface.cpp      |  345 ---
 .../src/opencl_base/OpenCL_interface.h        |   26 -
 .../mri-gridding/src/opencl_base/UDTypes.h    |   38 -
 .../mri-gridding/src/opencl_base/main.cpp     |  352 ----
 .../src/opencl_base/scanLargeArray.cl         |  198 --
 .../src/opencl_base/scanLargeArray.cpp        |  184 --
 .../src/opencl_base/scanLargeArray.h          |   11 -
 .../mri-gridding/src/opencl_base/sort.cl      |  227 --
 .../mri-gridding/src/opencl_base/sort.cpp     |  150 --
 .../mri-gridding/src/opencl_base/sort.h       |   11 -
 .../src/opencl_nvidia/CPU_kernels.c           |  353 ----
 .../src/opencl_nvidia/CPU_kernels.h           |   25 -
 .../src/opencl_nvidia/GPU_kernels.cl          |  264 ---
 .../mri-gridding/src/opencl_nvidia/Makefile   |    5 -
 .../src/opencl_nvidia/OpenCL_common.cpp       |  294 ---
 .../src/opencl_nvidia/OpenCL_common.h         |   26 -
 .../src/opencl_nvidia/OpenCL_interface.cpp    |  382 ----
 .../src/opencl_nvidia/OpenCL_interface.h      |   26 -
 .../mri-gridding/src/opencl_nvidia/UDTypes.h  |   38 -
 .../mri-gridding/src/opencl_nvidia/main.cpp   |  355 ----
 .../src/opencl_nvidia/scanLargeArray.cl       |  198 --
 .../src/opencl_nvidia/scanLargeArray.cpp      |  185 --
 .../src/opencl_nvidia/scanLargeArray.h        |   11 -
 .../mri-gridding/src/opencl_nvidia/sort.cl    |  225 --
 .../mri-gridding/src/opencl_nvidia/sort.cpp   |  149 --
 .../mri-gridding/src/opencl_nvidia/sort.h     |   11 -
 .../mri-gridding/src/visc/CPU_kernels.c       |  353 ----
 .../mri-gridding/src/visc/CPU_kernels.h       |   25 -
 .../mri-gridding/src/visc/GPU_kernels.cl      |  264 ---
 .../benchmarks/mri-gridding/src/visc/Makefile |    5 -
 .../mri-gridding/src/visc/OpenCL_common.cpp   |  294 ---
 .../mri-gridding/src/visc/OpenCL_common.h     |   26 -
 .../src/visc/OpenCL_interface.cpp             |  382 ----
 .../mri-gridding/src/visc/OpenCL_interface.h  |   26 -
 .../mri-gridding/src/visc/UDTypes.h           |   38 -
 .../benchmarks/mri-gridding/src/visc/main.cpp |  355 ----
 .../mri-gridding/src/visc/scanLargeArray.cl   |  198 --
 .../mri-gridding/src/visc/scanLargeArray.cpp  |  185 --
 .../mri-gridding/src/visc/scanLargeArray.h    |   11 -
 .../benchmarks/mri-gridding/src/visc/sort.cl  |  225 --
 .../benchmarks/mri-gridding/src/visc/sort.cpp |  149 --
 .../benchmarks/mri-gridding/src/visc/sort.h   |   11 -
 .../mri-gridding/tools/compare-output         |   71 -
 .../test/parboil/benchmarks/mri-q/DESCRIPTION |    5 -
 hpvm/test/parboil/benchmarks/mri-q/Makefile   |   35 -
 .../benchmarks/mri-q/recycle/base/Makefile    |    8 -
 .../benchmarks/mri-q/recycle/base/computeQ.cc |   69 -
 .../benchmarks/mri-q/recycle/base/file.cc     |   76 -
 .../benchmarks/mri-q/recycle/base/file.h      |   22 -
 .../benchmarks/mri-q/recycle/base/main.c      |  135 --
 .../parboil/benchmarks/mri-q/src/cpu/Makefile |    7 -
 .../benchmarks/mri-q/src/cpu/computeQ.cc      |   69 -
 .../parboil/benchmarks/mri-q/src/cpu/file.cc  |   76 -
 .../parboil/benchmarks/mri-q/src/cpu/file.h   |   22 -
 .../parboil/benchmarks/mri-q/src/cpu/main.c   |  135 --
 .../benchmarks/mri-q/src/cuda/Makefile        |    7 -
 .../benchmarks/mri-q/src/cuda/computeQ.cu     |  145 --
 .../parboil/benchmarks/mri-q/src/cuda/file.cc |   76 -
 .../parboil/benchmarks/mri-q/src/cuda/file.h  |   22 -
 .../parboil/benchmarks/mri-q/src/cuda/main.cu |  212 --
 .../benchmarks/mri-q/src/omp_base/Makefile    |    7 -
 .../benchmarks/mri-q/src/omp_base/computeQ.cc |   71 -
 .../benchmarks/mri-q/src/omp_base/file.cc     |   76 -
 .../benchmarks/mri-q/src/omp_base/file.h      |   22 -
 .../benchmarks/mri-q/src/omp_base/main.c      |  136 --
 .../benchmarks/mri-q/src/opencl/Makefile      |    7 -
 .../benchmarks/mri-q/src/opencl/computeQ.c    |  118 --
 .../benchmarks/mri-q/src/opencl/computeQ.h    |   14 -
 .../benchmarks/mri-q/src/opencl/file.cc       |   78 -
 .../benchmarks/mri-q/src/opencl/file.h        |   22 -
 .../benchmarks/mri-q/src/opencl/kernels.cl    |   69 -
 .../benchmarks/mri-q/src/opencl/macros.h      |   21 -
 .../benchmarks/mri-q/src/opencl/main.c        |  285 ---
 .../parboil/benchmarks/mri-q/src/opencl/ocl.c |   50 -
 .../parboil/benchmarks/mri-q/src/opencl/ocl.h |   21 -
 .../mri-q/src/opencl_nvidia/Makefile          |    7 -
 .../mri-q/src/opencl_nvidia/computeQ.c        |   88 -
 .../mri-q/src/opencl_nvidia/computeQ.h        |   14 -
 .../mri-q/src/opencl_nvidia/file.cc           |   78 -
 .../benchmarks/mri-q/src/opencl_nvidia/file.h |   22 -
 .../mri-q/src/opencl_nvidia/kernels.cl        |   64 -
 .../mri-q/src/opencl_nvidia/macros.h          |   21 -
 .../benchmarks/mri-q/src/opencl_nvidia/main.c |  300 ---
 .../benchmarks/mri-q/src/opencl_nvidia/ocl.c  |   49 -
 .../benchmarks/mri-q/src/opencl_nvidia/ocl.h  |   21 -
 .../benchmarks/mri-q/src/visc/Makefile        |    7 -
 .../benchmarks/mri-q/src/visc/macros.h        |   21 -
 .../benchmarks/mri-q/tools/compare-output     |   46 -
 .../test/parboil/benchmarks/nodeSwap/Makefile |   39 -
 .../benchmarks/nodeSwap/src/opencl/Makefile   |    8 -
 .../benchmarks/nodeSwap/src/opencl/io.cc      |   91 -
 .../benchmarks/nodeSwap/src/opencl/kernel.cl  |   19 -
 .../nodeSwap/src/opencl/kernel_offline.cl     |   32 -
 .../benchmarks/nodeSwap/src/opencl/main.cc    |  212 --
 .../benchmarks/nodeSwap/tools/compare-output  |   41 -
 hpvm/test/parboil/benchmarks/sad/DESCRIPTION  |    3 -
 hpvm/test/parboil/benchmarks/sad/Makefile     |   37 -
 .../parboil/benchmarks/sad/src/base/Makefile  |    4 -
 .../parboil/benchmarks/sad/src/base/file.c    |   55 -
 .../parboil/benchmarks/sad/src/base/file.h    |   22 -
 .../parboil/benchmarks/sad/src/base/image.c   |   56 -
 .../parboil/benchmarks/sad/src/base/image.h   |   25 -
 .../parboil/benchmarks/sad/src/base/main.c    |  318 ---
 .../parboil/benchmarks/sad/src/base/sad.h     |   83 -
 .../parboil/benchmarks/sad/src/base/sad_cpu.c |  214 --
 .../parboil/benchmarks/sad/src/cpu/Makefile   |    4 -
 .../parboil/benchmarks/sad/src/cpu/file.c     |   55 -
 .../parboil/benchmarks/sad/src/cpu/file.h     |   22 -
 .../parboil/benchmarks/sad/src/cpu/image.c    |   56 -
 .../parboil/benchmarks/sad/src/cpu/image.h    |   25 -
 .../parboil/benchmarks/sad/src/cpu/main.c     |  318 ---
 .../test/parboil/benchmarks/sad/src/cpu/sad.h |   83 -
 .../parboil/benchmarks/sad/src/cpu/sad_cpu.c  |  294 ---
 .../parboil/benchmarks/sad/src/cuda/Makefile  |    4 -
 .../parboil/benchmarks/sad/src/cuda/file.c    |   55 -
 .../parboil/benchmarks/sad/src/cuda/file.h    |   22 -
 .../parboil/benchmarks/sad/src/cuda/image.c   |   56 -
 .../parboil/benchmarks/sad/src/cuda/image.h   |   25 -
 .../benchmarks/sad/src/cuda/largerBlocks.cu   |  125 --
 .../benchmarks/sad/src/cuda/largerBlocks.h    |   10 -
 .../parboil/benchmarks/sad/src/cuda/main.cu   |  406 ----
 .../parboil/benchmarks/sad/src/cuda/sad.h     |   83 -
 .../parboil/benchmarks/sad/src/cuda/sad4.cu   |  223 --
 .../parboil/benchmarks/sad/src/cuda/sad4.h    |   52 -
 .../benchmarks/sad/src/cuda_base/Makefile     |    4 -
 .../benchmarks/sad/src/cuda_base/file.c       |   55 -
 .../benchmarks/sad/src/cuda_base/file.h       |   22 -
 .../benchmarks/sad/src/cuda_base/image.c      |   56 -
 .../benchmarks/sad/src/cuda_base/image.h      |   25 -
 .../sad/src/cuda_base/largerBlocks.cu         |  139 --
 .../sad/src/cuda_base/largerBlocks.h          |   10 -
 .../benchmarks/sad/src/cuda_base/main.cu      |  406 ----
 .../benchmarks/sad/src/cuda_base/sad.h        |   83 -
 .../benchmarks/sad/src/cuda_base/sad4.cu      |   99 -
 .../benchmarks/sad/src/cuda_base/sad4.h       |   52 -
 .../benchmarks/sad/src/opencl_base/Makefile   |    4 -
 .../sad/src/opencl_base/OpenCL_common.cpp     |  296 ---
 .../sad/src/opencl_base/OpenCL_common.h       |   22 -
 .../benchmarks/sad/src/opencl_base/file.c     |   55 -
 .../benchmarks/sad/src/opencl_base/file.h     |   22 -
 .../benchmarks/sad/src/opencl_base/image.c    |   56 -
 .../benchmarks/sad/src/opencl_base/image.h    |   25 -
 .../benchmarks/sad/src/opencl_base/main.cpp   |  519 -----
 .../benchmarks/sad/src/opencl_base/sad.h      |   83 -
 .../sad/src/opencl_base/sad_kernel.cl         |  333 ---
 .../sad/src/opencl_base/sad_kernel.h          |   57 -
 .../benchmarks/sad/src/opencl_nvidia/Makefile |    4 -
 .../sad/src/opencl_nvidia/OpenCL_common.cpp   |  209 --
 .../sad/src/opencl_nvidia/OpenCL_common.h     |   21 -
 .../benchmarks/sad/src/opencl_nvidia/file.c   |   55 -
 .../benchmarks/sad/src/opencl_nvidia/file.h   |   22 -
 .../benchmarks/sad/src/opencl_nvidia/image.c  |   56 -
 .../benchmarks/sad/src/opencl_nvidia/image.h  |   25 -
 .../benchmarks/sad/src/opencl_nvidia/main.cpp |  517 -----
 .../sad/src/opencl_nvidia/main_debug.cpp      |  695 -------
 .../benchmarks/sad/src/opencl_nvidia/sad.h    |   83 -
 .../sad/src/opencl_nvidia/sad_kernel.cl       |  372 ----
 .../sad/src/opencl_nvidia/sad_kernel.h        |   57 -
 .../parboil/benchmarks/sad/src/visc/Makefile  |    4 -
 .../parboil/benchmarks/sad/src/visc/file.c    |   55 -
 .../parboil/benchmarks/sad/src/visc/file.h    |   22 -
 .../parboil/benchmarks/sad/src/visc/image.c   |   56 -
 .../parboil/benchmarks/sad/src/visc/image.h   |   25 -
 .../parboil/benchmarks/sad/src/visc/main.cpp  |  436 ----
 .../parboil/benchmarks/sad/src/visc/sad.h     |   83 -
 .../benchmarks/sad/src/visc/sad_kernel.cl     |  339 ---
 .../benchmarks/sad/src/visc/sad_kernel.h      |   57 -
 .../benchmarks/sad/tools/compare-output       |   38 -
 .../benchmarks/sad/tools/compute-one-sad.py   |   45 -
 .../parboil/benchmarks/saxpy_test/Makefile    |  177 --
 .../parboil/benchmarks/saxpy_test/src/defs.h  |  224 --
 .../parboil/benchmarks/saxpy_test/src/main.c  |  135 --
 .../parboil/benchmarks/saxpy_test/src/visc.h  |  110 -
 361 files changed, 52789 deletions(-)
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/DESCRIPTION
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/histo.host.ll
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/histo.nvptx.s
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/base/bmp.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/base/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/base/util.c
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/base/util.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda/bmp.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda/histo_final.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda/histo_intermediates.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda/histo_main.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda/histo_prescan.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda/main.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda/util.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda/util.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda_base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda_base/bmp.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_final.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_intermediates.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_main.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_prescan.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda_base/main.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda_base/util.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/cuda_base/util.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/omp_base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/omp_base/bmp.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/omp_base/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/omp_base/util.c
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/omp_base/util.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/OpenCL_common.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/OpenCL_common.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/bmp.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_final.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_intermediates.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_main.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_prescan.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/util.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_base/util.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/OpenCL_common.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/OpenCL_common.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/bmp.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_final.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_intermediates.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_main.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_prescan.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel_x64.ll
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel_x64.spir
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/util.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/util.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/OpenCL_common.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/OpenCL_common.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/bmp.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_final.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_intermediates.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_main.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_prescan.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/kernel.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/util.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/util.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/threaded/FauxBlock.c
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/threaded/FauxBlock.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/threaded/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/threaded/bmp.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/threaded/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/threaded/util.c
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/threaded/util.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/visc/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/visc/bmp.h
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/visc/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/visc/util.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/histo/src/visc/util.h
 delete mode 100755 hpvm/test/parboil/benchmarks/histo/src/visc_one/main.cpp
 delete mode 100755 hpvm/test/parboil/benchmarks/histo/tools/compare-output
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/Makefile
 delete mode 100755 hpvm/test/parboil/benchmarks/kmeans/src/opencl/Makefile
 delete mode 100755 hpvm/test/parboil/benchmarks/kmeans/src/opencl/README
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/cluster.c
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/getopt.c
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/getopt.h
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.h
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans_clustering.c
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/read_input.c
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/rmse.c
 delete mode 100755 hpvm/test/parboil/benchmarks/kmeans/src/opencl/run
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/opencl/unistd.h
 delete mode 100755 hpvm/test/parboil/benchmarks/kmeans/src/visc/Makefile
 delete mode 100755 hpvm/test/parboil/benchmarks/kmeans/src/visc/README
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/cluster.c
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/getopt.c
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/getopt.h
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.h
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans_clustering.c
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/read_input.c
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/rmse.c
 delete mode 100755 hpvm/test/parboil/benchmarks/kmeans/src/visc/run
 delete mode 100644 hpvm/test/parboil/benchmarks/kmeans/src/visc/unistd.h
 delete mode 100755 hpvm/test/parboil/benchmarks/kmeans/tools/compare-output
 delete mode 100644 hpvm/test/parboil/benchmarks/linear-svm/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/linear-svm/linear-svm.visc.ll
 delete mode 100644 hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/main.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/llvm-40-34.py
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/DESCRIPTION
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/main.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/main.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/main.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/main.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/main.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/main.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/main.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/main.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/main.cc
 delete mode 100755 hpvm/test/parboil/benchmarks/merge-tests/tools/compare-output
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/DESCRIPTION
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/base/CPU_kernels.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/base/CPU_kernels.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/base/UDTypes.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/base/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CPU_kernels.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CPU_kernels.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CUDA_interface.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CUDA_interface.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/GPU_kernels.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/UDTypes.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/main.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/scanLargeArray.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/scanLargeArray.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/sort.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/sort.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CPU_kernels.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CPU_kernels.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CUDA_interface.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CUDA_interface.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/GPU_kernels.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/UDTypes.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/main.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/scanLargeArray.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/scanLargeArray.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/sort.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/sort.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/CPU_kernels.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/CPU_kernels.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/UDTypes.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/CPU_kernels.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/CPU_kernels.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/GPU_kernels.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_common.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_common.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_interface.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_interface.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/UDTypes.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/CPU_kernels.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/CPU_kernels.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/GPU_kernels.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_common.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_common.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_interface.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_interface.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/UDTypes.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/CPU_kernels.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/CPU_kernels.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/GPU_kernels.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_common.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_common.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_interface.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_interface.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/UDTypes.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.h
 delete mode 100755 hpvm/test/parboil/benchmarks/mri-gridding/tools/compare-output
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/DESCRIPTION
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/recycle/base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/recycle/base/computeQ.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/recycle/base/file.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/recycle/base/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/recycle/base/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cpu/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cpu/computeQ.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cpu/file.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cpu/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cpu/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cuda/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cuda/computeQ.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cuda/file.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cuda/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/cuda/main.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/omp_base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/omp_base/computeQ.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/omp_base/file.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/omp_base/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/omp_base/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/computeQ.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/computeQ.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/file.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/kernels.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/macros.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/ocl.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl/ocl.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/computeQ.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/computeQ.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/file.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/kernels.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/macros.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/ocl.c
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/ocl.h
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/visc/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/mri-q/src/visc/macros.h
 delete mode 100755 hpvm/test/parboil/benchmarks/mri-q/tools/compare-output
 delete mode 100644 hpvm/test/parboil/benchmarks/nodeSwap/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/io.cc
 delete mode 100644 hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/kernel.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/kernel_offline.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/main.cc
 delete mode 100755 hpvm/test/parboil/benchmarks/nodeSwap/tools/compare-output
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/DESCRIPTION
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/base/file.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/base/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/base/image.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/base/image.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/base/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/base/sad.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/base/sad_cpu.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cpu/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cpu/file.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cpu/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cpu/image.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cpu/image.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cpu/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cpu/sad.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cpu/sad_cpu.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/file.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/image.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/image.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/largerBlocks.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/largerBlocks.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/main.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/sad.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/sad4.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda/sad4.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/file.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/image.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/image.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/largerBlocks.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/largerBlocks.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/main.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad4.cu
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad4.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/OpenCL_common.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/OpenCL_common.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/file.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/image.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/image.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad_kernel.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad_kernel.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/OpenCL_common.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/OpenCL_common.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/file.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/image.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/image.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/main_debug.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad_kernel.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad_kernel.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/visc/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/visc/file.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/visc/file.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/visc/image.c
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/visc/image.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/visc/main.cpp
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/visc/sad.h
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/visc/sad_kernel.cl
 delete mode 100644 hpvm/test/parboil/benchmarks/sad/src/visc/sad_kernel.h
 delete mode 100755 hpvm/test/parboil/benchmarks/sad/tools/compare-output
 delete mode 100755 hpvm/test/parboil/benchmarks/sad/tools/compute-one-sad.py
 delete mode 100644 hpvm/test/parboil/benchmarks/saxpy_test/Makefile
 delete mode 100644 hpvm/test/parboil/benchmarks/saxpy_test/src/defs.h
 delete mode 100644 hpvm/test/parboil/benchmarks/saxpy_test/src/main.c
 delete mode 100644 hpvm/test/parboil/benchmarks/saxpy_test/src/visc.h

diff --git a/hpvm/test/parboil/benchmarks/histo/DESCRIPTION b/hpvm/test/parboil/benchmarks/histo/DESCRIPTION
deleted file mode 100644
index 30d9ebd0c0..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/DESCRIPTION
+++ /dev/null
@@ -1,3 +0,0 @@
-The histogram application computes the two dimensional histogram of a two dimensional input. The input has the property of being concentrated mostly in the center of the histogram.
-
-Currently maintained by John Stratton <stratton@crhc.illinois.edu>
diff --git a/hpvm/test/parboil/benchmarks/histo/Makefile b/hpvm/test/parboil/benchmarks/histo/Makefile
deleted file mode 100644
index 16a0b85e34..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/Makefile
+++ /dev/null
@@ -1,38 +0,0 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil
-APP = histo
-
-# Default compile visc
-ifeq ($(VERSION),)
-  VERSION = visc
-endif
-
-# Default use small test case
-ifeq ($(TEST),)
-  TEST = default
-endif
-
-ifeq ($(PLATFORM),)
-PLATFORM=default
-endif
-
-BIN = $(addsuffix -$(VERSION), $(APP))
-
-SRCDIR = src/$(VERSION)
-BUILDDIR = build/$(VERSION)_$(PLATFORM)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
-
-INPUT = $(DATASET_DIR)/$(TEST)/input/img.bin
-REF_OUTPUT = $(DATASET_DIR)/$(TEST)/output/ref.bmp
-RUNDIR = run/$(VERSION)/$(TEST)
-OUTPUT = $(RUNDIR)/ref.bmp
-
-ifeq ($(TEST),default)
-  ITERATIONS = 20
-else ifeq ($(TEST),large)
-  ITERATIONS = 10000
-endif
-
-ARGS = -i $(INPUT) -o $(OUTPUT) $(ITERATIONS) 4
-TOOL = tools/compare-output
-#TOOL=echo
-include $(PARBOIL_ROOT)/common/mk/Makefile
diff --git a/hpvm/test/parboil/benchmarks/histo/histo.host.ll b/hpvm/test/parboil/benchmarks/histo/histo.host.ll
deleted file mode 100644
index 1af31e3b04..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/histo.host.ll
+++ /dev/null
@@ -1,1362 +0,0 @@
-; ModuleID = 'build/visc_default/main.visc.ll'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
-%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
-%struct.PrescanRootIn = type <{ i32*, i64, i32, i32*, i64, i32, i32 }>
-%struct.IntermediatesRootIn = type <{ i32*, i64, i32, i32, i32, i8*, i64, i32, i32 }>
-%struct.MainRootIn = type <{ i8*, i64, i32, i32, i32, i32, i32, i32*, i64, i32*, i64, i32*, i64, i32, i32, i32, i32 }>
-%struct.FinalRootIn = type <{ i32, i32, i32, i32, i32*, i64, i32*, i64, i32*, i64, i32*, i64, i32, i32 }>
-%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] }
-%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* }
-%struct.pb_Timer = type { i32, i64, i64 }
-%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* }
-%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* }
-%struct.pb_Parameters = type { i8*, i8** }
-%struct.out._Z17PrescanAllocationi = type <{ i8*, i64, i8*, i64 }>
-%emptyStruct.9 = type <{}>
-%emptyStruct.10 = type <{}>
-%struct.uint2 = type <{ i32, i32 }>
-%struct.uchar4 = type <{ i8, i8, i8, i8 }>
-%struct.out._Z14MainAllocationi = type <{ i8*, i64 }>
-%emptyStruct.11 = type <{}>
-%emptyStruct.12 = type <{}>
-%emptyStruct = type <{}>
-%emptyStruct.2 = type <{}>
-%emptyStruct.3 = type <{}>
-%emptyStruct.4 = type <{}>
-%emptyStruct.5 = type <{}>
-%emptyStruct.6 = type <{}>
-%emptyStruct.7 = type <{}>
-%emptyStruct.8 = type <{}>
-
-@.str = private unnamed_addr constant [21 x i8] c"Input file expected\0A\00", align 1
-@stderr = external global %struct._IO_FILE*
-@_ZZ4mainE12viscOverhead = private unnamed_addr constant [13 x i8] c"viscOverhead\00", align 1
-@_ZZ4mainE8prescans = private unnamed_addr constant [14 x i8] c"PreScanKernel\00", align 1
-@_ZZ4mainE11postpremems = private unnamed_addr constant [12 x i8] c"PostPreMems\00", align 1
-@_ZZ4mainE13intermediates = private unnamed_addr constant [20 x i8] c"IntermediatesKernel\00", align 16
-@_ZZ4mainE5mains = private unnamed_addr constant [11 x i8] c"MainKernel\00", align 1
-@_ZZ4mainE6finals = private unnamed_addr constant [12 x i8] c"FinalKernel\00", align 1
-@.str1 = private unnamed_addr constant [45 x i8] c"Expected at least one command line argument\0A\00", align 1
-@.str2 = private unnamed_addr constant [3 x i8] c"rb\00", align 1
-@.str3 = private unnamed_addr constant [53 x i8] c"Error reading input and output dimensions from file\0A\00", align 1
-@.str4 = private unnamed_addr constant [37 x i8] c"Error reading input array from file\0A\00", align 1
-@viscTimerSet_GenVISC = common global i8* null
-@0 = internal constant [14 x i8] c"GenVISC_Timer\00"
-@viscTimerSet_NVPTX = common global i8* null
-@1 = internal constant [12 x i8] c"NVPTX_Timer\00"
-@Filename = internal constant [40 x i8] c"build/visc_default/main.visc.ll.nvptx.s\00"
-@KernelName = internal constant [21 x i8] c"histo_prescan_kernel\00"
-@graph_Z11PrescanLeafPjmiS_mPfmS0_m.addr = common global i8* null
-@Filename4 = internal constant [40 x i8] c"build/visc_default/main.visc.ll.nvptx.s\00"
-@KernelName5 = internal constant [27 x i8] c"histo_intermediates_kernel\00"
-@graph_Z17IntermediatesLeafP5uint2mjjjP6uchar4m.addr = common global i8* null
-@Filename8 = internal constant [40 x i8] c"build/visc_default/main.visc.ll.nvptx.s\00"
-@KernelName9 = internal constant [18 x i8] c"histo_main_kernel\00"
-@graph_Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m.addr = common global i8* null
-@Filename11 = internal constant [40 x i8] c"build/visc_default/main.visc.ll.nvptx.s\00"
-@KernelName12 = internal constant [19 x i8] c"histo_final_kernel\00"
-@graph_Z9FinalLeafjjjjPjmS_mS_mS_m.addr = common global i8* null
-@viscTimerSet_X86 = common global i8* null
-@2 = internal constant [10 x i8] c"X86_Timer\00"
-
-; Function Attrs: nounwind uwtable
-define void @_Z15PrescanPackDataP13PrescanRootInPjmiS1_mii(%struct.PrescanRootIn* nocapture %args, i32* %input, i64 %bytes_input, i32 %size, i32* %minmax, i64 %bytes_minmax, i32 %block, i32 %grid) #0 {
-entry:
-  %input1 = getelementptr inbounds %struct.PrescanRootIn* %args, i64 0, i32 0
-  store i32* %input, i32** %input1, align 1, !tbaa !12
-  %bytes_input2 = getelementptr inbounds %struct.PrescanRootIn* %args, i64 0, i32 1
-  store i64 %bytes_input, i64* %bytes_input2, align 1, !tbaa !15
-  %size3 = getelementptr inbounds %struct.PrescanRootIn* %args, i64 0, i32 2
-  store i32 %size, i32* %size3, align 1, !tbaa !16
-  %minmax4 = getelementptr inbounds %struct.PrescanRootIn* %args, i64 0, i32 3
-  store i32* %minmax, i32** %minmax4, align 1, !tbaa !12
-  %bytes_minmax5 = getelementptr inbounds %struct.PrescanRootIn* %args, i64 0, i32 4
-  store i64 %bytes_minmax, i64* %bytes_minmax5, align 1, !tbaa !15
-  %block6 = getelementptr inbounds %struct.PrescanRootIn* %args, i64 0, i32 5
-  store i32 %block, i32* %block6, align 1, !tbaa !16
-  %grid7 = getelementptr inbounds %struct.PrescanRootIn* %args, i64 0, i32 6
-  store i32 %grid, i32* %grid7, align 1, !tbaa !16
-  ret void
-}
-
-; Function Attrs: nounwind
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #1
-
-; Function Attrs: nounwind uwtable
-define void @_Z21IntermediatesPackDataP19IntermediatesRootInPjmjjjPhmii(%struct.IntermediatesRootIn* nocapture %args, i32* %input, i64 %bytes_input, i32 %height, i32 %width, i32 %input_pitch, i8* %sm_mappings, i64 %bytes_sm_mappings, i32 %block, i32 %grid) #0 {
-entry:
-  %input1 = getelementptr inbounds %struct.IntermediatesRootIn* %args, i64 0, i32 0
-  store i32* %input, i32** %input1, align 1, !tbaa !12
-  %bytes_input2 = getelementptr inbounds %struct.IntermediatesRootIn* %args, i64 0, i32 1
-  store i64 %bytes_input, i64* %bytes_input2, align 1, !tbaa !15
-  %height3 = getelementptr inbounds %struct.IntermediatesRootIn* %args, i64 0, i32 2
-  store i32 %height, i32* %height3, align 1, !tbaa !16
-  %width4 = getelementptr inbounds %struct.IntermediatesRootIn* %args, i64 0, i32 3
-  store i32 %width, i32* %width4, align 1, !tbaa !16
-  %input_pitch5 = getelementptr inbounds %struct.IntermediatesRootIn* %args, i64 0, i32 4
-  store i32 %input_pitch, i32* %input_pitch5, align 1, !tbaa !16
-  %sm_mappings6 = getelementptr inbounds %struct.IntermediatesRootIn* %args, i64 0, i32 5
-  store i8* %sm_mappings, i8** %sm_mappings6, align 1, !tbaa !12
-  %bytes_sm_mappings7 = getelementptr inbounds %struct.IntermediatesRootIn* %args, i64 0, i32 6
-  store i64 %bytes_sm_mappings, i64* %bytes_sm_mappings7, align 1, !tbaa !15
-  %block8 = getelementptr inbounds %struct.IntermediatesRootIn* %args, i64 0, i32 7
-  store i32 %block, i32* %block8, align 1, !tbaa !16
-  %grid9 = getelementptr inbounds %struct.IntermediatesRootIn* %args, i64 0, i32 8
-  store i32 %grid, i32* %grid9, align 1, !tbaa !16
-  ret void
-}
-
-; Function Attrs: nounwind uwtable
-define void @_Z12MainPackDataP10MainRootInPhmjjjjjPjmS2_mS2_miiii(%struct.MainRootIn* nocapture %args, i8* %sm_mappings, i64 %bytes_sm_mappings, i32 %num_elements, i32 %sm_range_min, i32 %sm_range_max, i32 %histo_height, i32 %histo_width, i32* %global_subhisto, i64 %bytes_global_subhisto, i32* %global_histo, i64 %bytes_global_histo, i32* %global_overflow, i64 %bytes_global_overflow, i32 %blockx, i32 %blocky, i32 %gridx, i32 %gridy) #0 {
-entry:
-  %sm_mappings1 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 0
-  store i8* %sm_mappings, i8** %sm_mappings1, align 1, !tbaa !12
-  %bytes_sm_mappings2 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 1
-  store i64 %bytes_sm_mappings, i64* %bytes_sm_mappings2, align 1, !tbaa !15
-  %num_elements3 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 2
-  store i32 %num_elements, i32* %num_elements3, align 1, !tbaa !16
-  %sm_range_min4 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 3
-  store i32 %sm_range_min, i32* %sm_range_min4, align 1, !tbaa !16
-  %sm_range_max5 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 4
-  store i32 %sm_range_max, i32* %sm_range_max5, align 1, !tbaa !16
-  %histo_height6 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 5
-  store i32 %histo_height, i32* %histo_height6, align 1, !tbaa !16
-  %histo_width7 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 6
-  store i32 %histo_width, i32* %histo_width7, align 1, !tbaa !16
-  %global_subhisto8 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 7
-  store i32* %global_subhisto, i32** %global_subhisto8, align 1, !tbaa !12
-  %bytes_global_subhisto9 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 8
-  store i64 %bytes_global_subhisto, i64* %bytes_global_subhisto9, align 1, !tbaa !15
-  %global_histo10 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 9
-  store i32* %global_histo, i32** %global_histo10, align 1, !tbaa !12
-  %bytes_global_histo11 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 10
-  store i64 %bytes_global_histo, i64* %bytes_global_histo11, align 1, !tbaa !15
-  %global_overflow12 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 11
-  store i32* %global_overflow, i32** %global_overflow12, align 1, !tbaa !12
-  %bytes_global_overflow13 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 12
-  store i64 %bytes_global_overflow, i64* %bytes_global_overflow13, align 1, !tbaa !15
-  %blockx14 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 13
-  store i32 %blockx, i32* %blockx14, align 1, !tbaa !16
-  %blocky15 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 14
-  store i32 %blocky, i32* %blocky15, align 1, !tbaa !16
-  %gridx16 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 15
-  store i32 %gridx, i32* %gridx16, align 1, !tbaa !16
-  %gridy17 = getelementptr inbounds %struct.MainRootIn* %args, i64 0, i32 16
-  store i32 %gridy, i32* %gridy17, align 1, !tbaa !16
-  ret void
-}
-
-; Function Attrs: nounwind uwtable
-define void @_Z13FinalPackDataP11FinalRootInjjjjPjmS1_mS1_mS1_mii(%struct.FinalRootIn* nocapture %args, i32 %sm_range_min, i32 %sm_range_max, i32 %histo_height, i32 %histo_width, i32* %global_subhisto, i64 %bytes_global_subhisto, i32* %global_histo, i64 %bytes_global_histo, i32* %global_overflow, i64 %bytes_global_overflow, i32* %final_histo, i64 %bytes_final_histo, i32 %block, i32 %grid) #0 {
-entry:
-  %sm_range_min1 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 0
-  store i32 %sm_range_min, i32* %sm_range_min1, align 1, !tbaa !16
-  %sm_range_max2 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 1
-  store i32 %sm_range_max, i32* %sm_range_max2, align 1, !tbaa !16
-  %histo_height3 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 2
-  store i32 %histo_height, i32* %histo_height3, align 1, !tbaa !16
-  %histo_width4 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 3
-  store i32 %histo_width, i32* %histo_width4, align 1, !tbaa !16
-  %global_subhisto5 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 4
-  store i32* %global_subhisto, i32** %global_subhisto5, align 1, !tbaa !12
-  %bytes_global_subhisto6 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 5
-  store i64 %bytes_global_subhisto, i64* %bytes_global_subhisto6, align 1, !tbaa !15
-  %global_histo7 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 6
-  store i32* %global_histo, i32** %global_histo7, align 1, !tbaa !12
-  %bytes_global_histo8 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 7
-  store i64 %bytes_global_histo, i64* %bytes_global_histo8, align 1, !tbaa !15
-  %global_overflow9 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 8
-  store i32* %global_overflow, i32** %global_overflow9, align 1, !tbaa !12
-  %bytes_global_overflow10 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 9
-  store i64 %bytes_global_overflow, i64* %bytes_global_overflow10, align 1, !tbaa !15
-  %final_histo11 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 10
-  store i32* %final_histo, i32** %final_histo11, align 1, !tbaa !12
-  %bytes_final_histo12 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 11
-  store i64 %bytes_final_histo, i64* %bytes_final_histo12, align 1, !tbaa !15
-  %block13 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 12
-  store i32 %block, i32* %block13, align 1, !tbaa !16
-  %grid14 = getelementptr inbounds %struct.FinalRootIn* %args, i64 0, i32 13
-  store i32 %grid, i32* %grid14, align 1, !tbaa !16
-  ret void
-}
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** %argv) #0 {
-entry:
-  %argc.addr = alloca i32, align 4
-  %timers = alloca %struct.pb_TimerSet, align 8
-  %viscOverhead = alloca [13 x i8], align 1
-  %prescans = alloca [14 x i8], align 1
-  %postpremems = alloca [12 x i8], align 1
-  %memsets = alloca i64, align 8
-  %intermediates = alloca [20 x i8], align 16
-  %mains = alloca [11 x i8], align 1
-  %finals = alloca [12 x i8], align 1
-  %img_width = alloca i32, align 4
-  %img_height = alloca i32, align 4
-  %histo_width = alloca i32, align 4
-  %histo_height = alloca i32, align 4
-  store i32 %argc, i32* %argc.addr, align 4, !tbaa !16
-  %0 = bitcast %struct.pb_TimerSet* %timers to i8*
-  call void @llvm.lifetime.start(i64 800, i8* %0) #1
-  %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1
-  %tobool = icmp eq %struct.pb_Parameters* %call, null
-  br i1 %tobool, label %cleanup, label %if.end
-
-if.end:                                           ; preds = %entry
-  %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1
-  %1 = load i8*** %inpFiles, align 8, !tbaa !12
-  %2 = load i8** %1, align 8, !tbaa !12
-  %tobool1 = icmp eq i8* %2, null
-  br i1 %tobool1, label %if.then2, label %if.end4
-
-if.then2:                                         ; preds = %if.end
-  %3 = load %struct._IO_FILE** @stderr, align 8, !tbaa !12
-  %4 = call i64 @fwrite(i8* getelementptr inbounds ([21 x i8]* @.str, i64 0, i64 0), i64 20, i64 1, %struct._IO_FILE* %3)
-  br label %cleanup
-
-if.end4:                                          ; preds = %if.end
-  %5 = getelementptr inbounds [13 x i8]* %viscOverhead, i64 0, i64 0
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* getelementptr inbounds ([13 x i8]* @_ZZ4mainE12viscOverhead, i64 0, i64 0), i64 13, i32 1, i1 false)
-  %6 = getelementptr inbounds [14 x i8]* %prescans, i64 0, i64 0
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* getelementptr inbounds ([14 x i8]* @_ZZ4mainE8prescans, i64 0, i64 0), i64 14, i32 1, i1 false)
-  %7 = getelementptr inbounds [12 x i8]* %postpremems, i64 0, i64 0
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* getelementptr inbounds ([12 x i8]* @_ZZ4mainE11postpremems, i64 0, i64 0), i64 12, i32 1, i1 false)
-  store i64 32497601398793549, i64* %memsets, align 8
-  %8 = getelementptr inbounds [20 x i8]* %intermediates, i64 0, i64 0
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %8, i8* getelementptr inbounds ([20 x i8]* @_ZZ4mainE13intermediates, i64 0, i64 0), i64 20, i32 16, i1 false)
-  %9 = getelementptr inbounds [11 x i8]* %mains, i64 0, i64 0
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* getelementptr inbounds ([11 x i8]* @_ZZ4mainE5mains, i64 0, i64 0), i64 11, i32 1, i1 false)
-  %10 = getelementptr inbounds [12 x i8]* %finals, i64 0, i64 0
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %10, i8* getelementptr inbounds ([12 x i8]* @_ZZ4mainE6finals, i64 0, i64 0), i64 12, i32 1, i1 false)
-  %11 = load i32* %argc.addr, align 4, !tbaa !16
-  %cmp = icmp sgt i32 %11, 1
-  br i1 %cmp, label %if.then5, label %if.else
-
-if.then5:                                         ; preds = %if.end4
-  %arrayidx6 = getelementptr inbounds i8** %argv, i64 1
-  %12 = load i8** %arrayidx6, align 8, !tbaa !12
-  %call7 = call i32 @atoi(i8* %12) #5
-  %13 = load i8*** %inpFiles, align 8, !tbaa !12
-  %14 = load i8** %13, align 8, !tbaa !12
-  %call12 = call %struct._IO_FILE* @fopen(i8* %14, i8* getelementptr inbounds ([3 x i8]* @.str2, i64 0, i64 0)) #1
-  %15 = bitcast i32* %img_width to i8*
-  %call13 = call i64 @fread(i8* %15, i64 4, i64 1, %struct._IO_FILE* %call12) #1
-  %16 = bitcast i32* %img_height to i8*
-  %call15 = call i64 @fread(i8* %16, i64 4, i64 1, %struct._IO_FILE* %call12) #1
-  %add17 = add i64 %call15, %call13
-  %17 = bitcast i32* %histo_width to i8*
-  %call19 = call i64 @fread(i8* %17, i64 4, i64 1, %struct._IO_FILE* %call12) #1
-  %add21 = add i64 %add17, %call19
-  %18 = bitcast i32* %histo_height to i8*
-  %call23 = call i64 @fread(i8* %18, i64 4, i64 1, %struct._IO_FILE* %call12) #1
-  %add25 = add i64 %add21, %call23
-  %conv26 = trunc i64 %add25 to i32
-  %cmp27 = icmp eq i32 %conv26, 4
-  br i1 %cmp27, label %if.end30, label %if.then28
-
-if.else:                                          ; preds = %if.end4
-  %19 = load %struct._IO_FILE** @stderr, align 8, !tbaa !12
-  %20 = call i64 @fwrite(i8* getelementptr inbounds ([45 x i8]* @.str1, i64 0, i64 0), i64 44, i64 1, %struct._IO_FILE* %19)
-  br label %cleanup
-
-if.then28:                                        ; preds = %if.then5
-  %21 = load %struct._IO_FILE** @stderr, align 8, !tbaa !12
-  %22 = call i64 @fwrite(i8* getelementptr inbounds ([53 x i8]* @.str3, i64 0, i64 0), i64 52, i64 1, %struct._IO_FILE* %21)
-  br label %cleanup
-
-if.end30:                                         ; preds = %if.then5
-  %23 = load i32* %img_width, align 4, !tbaa !16
-  %24 = load i32* %img_height, align 4, !tbaa !16
-  %mul = mul i32 %24, %23
-  %conv31 = zext i32 %mul to i64
-  %mul32 = shl nuw nsw i64 %conv31, 2
-  %call33 = call noalias i8* @malloc(i64 %mul32) #1
-  %25 = bitcast i8* %call33 to i32*
-  %call39 = call i64 @fread(i8* %call33, i64 4, i64 %conv31, %struct._IO_FILE* %call12) #1
-  %conv40 = trunc i64 %call39 to i32
-  %call41 = call i32 @fclose(%struct._IO_FILE* %call12) #1
-  %cmp43 = icmp eq i32 %conv40, %mul
-  br i1 %cmp43, label %if.end46, label %if.then44
-
-if.then44:                                        ; preds = %if.end30
-  %26 = load %struct._IO_FILE** @stderr, align 8, !tbaa !12
-  %27 = call i64 @fwrite(i8* getelementptr inbounds ([37 x i8]* @.str4, i64 0, i64 0), i64 36, i64 1, %struct._IO_FILE* %26)
-  br label %cleanup
-
-if.end46:                                         ; preds = %if.end30
-  call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  %28 = call i8* @llvm_visc_initializeTimerSet()
-  store i8* %28, i8** @viscTimerSet_GenVISC
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  %29 = call i8* @llvm_visc_initializeTimerSet()
-  store i8* %29, i8** @viscTimerSet_NVPTX
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 8)
-  %30 = call i8* @llvm_visc_ocl_initContext(i32 2)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 14)
-  %FilenamePtr = getelementptr [40 x i8]* @Filename, i64 0, i64 0
-  %KernelNamePtr = getelementptr [21 x i8]* @KernelName, i64 0, i64 0
-  %graph_Z11PrescanLeafPjmiS_mPfmS0_m = call i8* @llvm_visc_ocl_launch(i8* %FilenamePtr, i8* %KernelNamePtr)
-  store i8* %graph_Z11PrescanLeafPjmiS_mPfmS0_m, i8** @graph_Z11PrescanLeafPjmiS_mPfmS0_m.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 14)
-  %FilenamePtr2 = getelementptr [40 x i8]* @Filename4, i64 0, i64 0
-  %KernelNamePtr3 = getelementptr [27 x i8]* @KernelName5, i64 0, i64 0
-  %graph_Z17IntermediatesLeafP5uint2mjjjP6uchar4m = call i8* @llvm_visc_ocl_launch(i8* %FilenamePtr2, i8* %KernelNamePtr3)
-  store i8* %graph_Z17IntermediatesLeafP5uint2mjjjP6uchar4m, i8** @graph_Z17IntermediatesLeafP5uint2mjjjP6uchar4m.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 14)
-  %FilenamePtr4 = getelementptr [40 x i8]* @Filename8, i64 0, i64 0
-  %KernelNamePtr5 = getelementptr [18 x i8]* @KernelName9, i64 0, i64 0
-  %graph_Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m = call i8* @llvm_visc_ocl_launch(i8* %FilenamePtr4, i8* %KernelNamePtr5)
-  store i8* %graph_Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8** @graph_Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 14)
-  %FilenamePtr6 = getelementptr [40 x i8]* @Filename11, i64 0, i64 0
-  %KernelNamePtr7 = getelementptr [19 x i8]* @KernelName12, i64 0, i64 0
-  %graph_Z9FinalLeafjjjjPjmS_mS_mS_m = call i8* @llvm_visc_ocl_launch(i8* %FilenamePtr6, i8* %KernelNamePtr7)
-  store i8* %graph_Z9FinalLeafjjjjPjmS_mS_mS_m, i8** @graph_Z9FinalLeafjjjjPjmS_mS_mS_m.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  %31 = call i8* @llvm_visc_initializeTimerSet()
-  store i8* %31, i8** @viscTimerSet_X86
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 0)
-  call void undef()
-  call void @pb_AddSubTimer(%struct.pb_TimerSet* %timers, i8* %5, i32 21) #1
-  call void @pb_AddSubTimer(%struct.pb_TimerSet* %timers, i8* %6, i32 21) #1
-  call void @pb_AddSubTimer(%struct.pb_TimerSet* %timers, i8* %7, i32 21) #1
-  %arraydecay49 = bitcast i64* %memsets to i8*
-  call void @pb_AddSubTimer(%struct.pb_TimerSet* %timers, i8* %arraydecay49, i32 21) #1
-  call void @pb_AddSubTimer(%struct.pb_TimerSet* %timers, i8* %8, i32 21) #1
-  call void @pb_AddSubTimer(%struct.pb_TimerSet* %timers, i8* %9, i32 21) #1
-  call void @pb_AddSubTimer(%struct.pb_TimerSet* %timers, i8* %10, i32 21) #1
-  %add53 = add i32 %23, 1
-  %div = and i32 %add53, -2
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %sub = add i32 %24, 15
-  %div56 = and i32 %sub, -16
-  %mul58 = mul i32 %div56, %div
-  %conv59 = zext i32 %mul58 to i64
-  %mul60 = shl nuw nsw i64 %conv59, 2
-  %mul61 = shl i32 %24, 2
-  %mul62 = mul i32 %mul61, %23
-  %conv63 = zext i32 %mul62 to i64
-  %32 = load i32* %histo_height, align 4, !tbaa !16
-  %mul65 = mul i32 %32, %23
-  %conv66 = zext i32 %mul65 to i64
-  %mul67 = shl nuw nsw i64 %conv66, 2
-  %mul70 = shl nuw nsw i64 %conv66, 1
-  %call77 = call noalias i8* @malloc(i64 %mul60) #1
-  %33 = bitcast i8* %call77 to i32*
-  %call78 = call noalias i8* @malloc(i64 8) #1
-  %34 = bitcast i8* %call78 to i32*
-  %call79 = call noalias i8* @malloc(i64 %conv63) #1
-  %call80 = call noalias i8* @malloc(i64 %mul67) #1
-  %35 = bitcast i8* %call80 to i32*
-  %call81 = call noalias i8* @malloc(i64 %mul70) #1
-  %call82 = call noalias i8* @malloc(i64 %mul67) #1
-  %36 = bitcast i8* %call82 to i32*
-  %call83 = call noalias i8* @malloc(i64 %conv66) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 15) #1
-  call void @llvm_visc_track_mem(i8* %call77, i64 %mul60) #1
-  call void @llvm_visc_track_mem(i8* %call78, i64 8) #1
-  call void @llvm_visc_track_mem(i8* %call79, i64 %conv63) #1
-  call void @llvm_visc_track_mem(i8* %call80, i64 %mul67) #1
-  call void @llvm_visc_track_mem(i8* %call81, i64 %mul70) #1
-  call void @llvm_visc_track_mem(i8* %call82, i64 %mul67) #1
-  call void @llvm_visc_track_mem(i8* %call83, i64 %conv66) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %cmp84253 = icmp eq i32 %24, 0
-  br i1 %cmp84253, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %if.end46
-  %conv90 = zext i32 %23 to i64
-  %mul91 = shl nuw nsw i64 %conv90, 2
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %37 = trunc i64 %indvars.iv to i32
-  %mul85 = mul nsw i32 %37, %div
-  %idxprom = sext i32 %mul85 to i64
-  %arrayidx86 = getelementptr inbounds i32* %33, i64 %idxprom
-  %38 = bitcast i32* %arrayidx86 to i8*
-  %mul87 = mul i32 %23, %37
-  %idxprom88 = zext i32 %mul87 to i64
-  %arrayidx89 = getelementptr inbounds i32* %25, i64 %idxprom88
-  %39 = bitcast i32* %arrayidx89 to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %38, i8* %39, i64 %mul91, i32 4, i1 false)
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %40 = trunc i64 %indvars.iv.next to i32
-  %cmp84 = icmp ult i32 %40, %24
-  br i1 %cmp84, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %for.body
-  %phitmp = add i32 %24, 15
-  %phitmp255 = lshr i32 %phitmp, 4
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %if.end46
-  %41 = phi i32 [ %24, %for.cond.for.end_crit_edge ], [ 0, %if.end46 ]
-  %.lcssa = phi i32 [ %phitmp255, %for.cond.for.end_crit_edge ], [ 0, %if.end46 ]
-  %div93 = lshr i32 %add53, 1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 19) #1
-  %mul98 = mul i32 %23, %41
-  %call99 = call noalias i8* @malloc(i64 44) #1
-  %input1.i = bitcast i8* %call99 to i32**
-  store i32* %33, i32** %input1.i, align 1, !tbaa !12
-  %bytes_input2.i = getelementptr inbounds i8* %call99, i64 8
-  %42 = bitcast i8* %bytes_input2.i to i64*
-  store i64 %mul60, i64* %42, align 1, !tbaa !15
-  %size3.i = getelementptr inbounds i8* %call99, i64 16
-  %43 = bitcast i8* %size3.i to i32*
-  store i32 %mul98, i32* %43, align 1, !tbaa !16
-  %minmax4.i = getelementptr inbounds i8* %call99, i64 20
-  %44 = bitcast i8* %minmax4.i to i32**
-  store i32* %34, i32** %44, align 1, !tbaa !12
-  %bytes_minmax5.i = getelementptr inbounds i8* %call99, i64 28
-  %45 = bitcast i8* %bytes_minmax5.i to i64*
-  store i64 8, i64* %45, align 1, !tbaa !15
-  %block6.i = getelementptr inbounds i8* %call99, i64 36
-  %46 = bitcast i8* %block6.i to i32*
-  store i32 512, i32* %46, align 1, !tbaa !16
-  %grid7.i = getelementptr inbounds i8* %call99, i64 40
-  %47 = bitcast i8* %grid7.i to i32*
-  store i32 64, i32* %47, align 1, !tbaa !16
-  %call102 = call noalias i8* @malloc(i64 52) #1
-  %input1.i249 = bitcast i8* %call102 to i32**
-  store i32* %33, i32** %input1.i249, align 1, !tbaa !12
-  %bytes_input2.i250 = getelementptr inbounds i8* %call102, i64 8
-  %48 = bitcast i8* %bytes_input2.i250 to i64*
-  store i64 %mul60, i64* %48, align 1, !tbaa !15
-  %height3.i = getelementptr inbounds i8* %call102, i64 16
-  %49 = bitcast i8* %height3.i to i32*
-  store i32 %41, i32* %49, align 1, !tbaa !16
-  %width4.i = getelementptr inbounds i8* %call102, i64 20
-  %50 = bitcast i8* %width4.i to i32*
-  store i32 %23, i32* %50, align 1, !tbaa !16
-  %input_pitch5.i = getelementptr inbounds i8* %call102, i64 24
-  %51 = bitcast i8* %input_pitch5.i to i32*
-  store i32 %div93, i32* %51, align 1, !tbaa !16
-  %sm_mappings6.i = getelementptr inbounds i8* %call102, i64 28
-  %52 = bitcast i8* %sm_mappings6.i to i8**
-  store i8* %call79, i8** %52, align 1, !tbaa !12
-  %bytes_sm_mappings7.i = getelementptr inbounds i8* %call102, i64 36
-  %53 = bitcast i8* %bytes_sm_mappings7.i to i64*
-  store i64 %conv63, i64* %53, align 1, !tbaa !15
-  %block8.i = getelementptr inbounds i8* %call102, i64 44
-  %54 = bitcast i8* %block8.i to i32*
-  store i32 %div93, i32* %54, align 1, !tbaa !16
-  %grid9.i = getelementptr inbounds i8* %call102, i64 48
-  %55 = bitcast i8* %grid9.i to i32*
-  store i32 %.lcssa, i32* %55, align 1, !tbaa !16
-  %call103 = call noalias i8* @malloc(i64 100) #1
-  %56 = load i32* %34, align 4, !tbaa !16
-  %arrayidx105 = getelementptr inbounds i8* %call78, i64 4
-  %57 = bitcast i8* %arrayidx105 to i32*
-  %58 = load i32* %57, align 4, !tbaa !16
-  %59 = load i32* %histo_width, align 4, !tbaa !16
-  %60 = bitcast i8* %call81 to i32*
-  %sm_mappings1.i = bitcast i8* %call103 to i8**
-  store i8* %call79, i8** %sm_mappings1.i, align 1, !tbaa !12
-  %bytes_sm_mappings2.i = getelementptr inbounds i8* %call103, i64 8
-  %61 = bitcast i8* %bytes_sm_mappings2.i to i64*
-  store i64 %conv63, i64* %61, align 1, !tbaa !15
-  %num_elements3.i = getelementptr inbounds i8* %call103, i64 16
-  %62 = bitcast i8* %num_elements3.i to i32*
-  store i32 %mul98, i32* %62, align 1, !tbaa !16
-  %sm_range_min4.i = getelementptr inbounds i8* %call103, i64 20
-  %63 = bitcast i8* %sm_range_min4.i to i32*
-  store i32 %56, i32* %63, align 1, !tbaa !16
-  %sm_range_max5.i = getelementptr inbounds i8* %call103, i64 24
-  %64 = bitcast i8* %sm_range_max5.i to i32*
-  store i32 %58, i32* %64, align 1, !tbaa !16
-  %histo_height6.i = getelementptr inbounds i8* %call103, i64 28
-  %65 = bitcast i8* %histo_height6.i to i32*
-  store i32 %32, i32* %65, align 1, !tbaa !16
-  %histo_width7.i = getelementptr inbounds i8* %call103, i64 32
-  %66 = bitcast i8* %histo_width7.i to i32*
-  store i32 %59, i32* %66, align 1, !tbaa !16
-  %global_subhisto8.i = getelementptr inbounds i8* %call103, i64 36
-  %67 = bitcast i8* %global_subhisto8.i to i32**
-  store i32* %35, i32** %67, align 1, !tbaa !12
-  %bytes_global_subhisto9.i = getelementptr inbounds i8* %call103, i64 44
-  %68 = bitcast i8* %bytes_global_subhisto9.i to i64*
-  store i64 %mul67, i64* %68, align 1, !tbaa !15
-  %global_histo10.i = getelementptr inbounds i8* %call103, i64 52
-  %69 = bitcast i8* %global_histo10.i to i32**
-  store i32* %60, i32** %69, align 1, !tbaa !12
-  %bytes_global_histo11.i = getelementptr inbounds i8* %call103, i64 60
-  %70 = bitcast i8* %bytes_global_histo11.i to i64*
-  store i64 %mul70, i64* %70, align 1, !tbaa !15
-  %global_overflow12.i = getelementptr inbounds i8* %call103, i64 68
-  %71 = bitcast i8* %global_overflow12.i to i32**
-  store i32* %36, i32** %71, align 1, !tbaa !12
-  %bytes_global_overflow13.i = getelementptr inbounds i8* %call103, i64 76
-  %72 = bitcast i8* %bytes_global_overflow13.i to i64*
-  store i64 %mul67, i64* %72, align 1, !tbaa !15
-  %blockx14.i = getelementptr inbounds i8* %call103, i64 84
-  %73 = bitcast i8* %blockx14.i to i32*
-  store i32 768, i32* %73, align 1, !tbaa !16
-  %blocky15.i = getelementptr inbounds i8* %call103, i64 88
-  %74 = bitcast i8* %blocky15.i to i32*
-  store i32 1, i32* %74, align 1, !tbaa !16
-  %gridx16.i = getelementptr inbounds i8* %call103, i64 92
-  %75 = bitcast i8* %gridx16.i to i32*
-  store i32 14, i32* %75, align 1, !tbaa !16
-  %call110 = call noalias i8* @malloc(i64 88) #1
-  %76 = load i32* %histo_height, align 4, !tbaa !16
-  %77 = bitcast i8* %call83 to i32*
-  %sm_range_min1.i = bitcast i8* %call110 to i32*
-  store i32 %56, i32* %sm_range_min1.i, align 1, !tbaa !16
-  %sm_range_max2.i = getelementptr inbounds i8* %call110, i64 4
-  %78 = bitcast i8* %sm_range_max2.i to i32*
-  store i32 %58, i32* %78, align 1, !tbaa !16
-  %histo_height3.i = getelementptr inbounds i8* %call110, i64 8
-  %79 = bitcast i8* %histo_height3.i to i32*
-  store i32 %76, i32* %79, align 1, !tbaa !16
-  %histo_width4.i = getelementptr inbounds i8* %call110, i64 12
-  %80 = bitcast i8* %histo_width4.i to i32*
-  store i32 %59, i32* %80, align 1, !tbaa !16
-  %global_subhisto5.i = getelementptr inbounds i8* %call110, i64 16
-  %81 = bitcast i8* %global_subhisto5.i to i32**
-  store i32* %35, i32** %81, align 1, !tbaa !12
-  %bytes_global_subhisto6.i = getelementptr inbounds i8* %call110, i64 24
-  %82 = bitcast i8* %bytes_global_subhisto6.i to i64*
-  store i64 %mul67, i64* %82, align 1, !tbaa !15
-  %global_histo7.i = getelementptr inbounds i8* %call110, i64 32
-  %83 = bitcast i8* %global_histo7.i to i32**
-  store i32* %60, i32** %83, align 1, !tbaa !12
-  %bytes_global_histo8.i = getelementptr inbounds i8* %call110, i64 40
-  %84 = bitcast i8* %bytes_global_histo8.i to i64*
-  store i64 %mul70, i64* %84, align 1, !tbaa !15
-  %global_overflow9.i = getelementptr inbounds i8* %call110, i64 48
-  %85 = bitcast i8* %global_overflow9.i to i32**
-  store i32* %36, i32** %85, align 1, !tbaa !12
-  %bytes_global_overflow10.i = getelementptr inbounds i8* %call110, i64 56
-  %86 = bitcast i8* %bytes_global_overflow10.i to i64*
-  store i64 %mul67, i64* %86, align 1, !tbaa !15
-  %final_histo11.i = getelementptr inbounds i8* %call110, i64 64
-  %87 = bitcast i8* %final_histo11.i to i32**
-  store i32* %77, i32** %87, align 1, !tbaa !12
-  %bytes_final_histo12.i = getelementptr inbounds i8* %call110, i64 72
-  %88 = bitcast i8* %bytes_final_histo12.i to i64*
-  store i64 %conv66, i64* %88, align 1, !tbaa !15
-  %block13.i = getelementptr inbounds i8* %call110, i64 80
-  %89 = bitcast i8* %block13.i to i32*
-  store i32 512, i32* %89, align 1, !tbaa !16
-  %grid14.i = getelementptr inbounds i8* %call110, i64 84
-  %90 = bitcast i8* %grid14.i to i32*
-  store i32 42, i32* %90, align 1, !tbaa !16
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 21) #1
-  %cmp114251 = icmp sgt i32 %call7, 0
-  br i1 %cmp114251, label %for.body115.lr.ph, label %for.end142
-
-for.body115.lr.ph:                                ; preds = %for.end
-  %gridy = getelementptr inbounds i8* %call103, i64 96
-  %91 = bitcast i8* %gridy to i32*
-  br label %for.body115
-
-for.body115:                                      ; preds = %for.body115, %for.body115.lr.ph
-  %iter.0252 = phi i32 [ 0, %for.body115.lr.ph ], [ %inc141, %for.body115 ]
-  call void @pb_SwitchToSubTimer(%struct.pb_TimerSet* %timers, i8* %7, i32 21) #1
-  store i32 -1, i32* %34, align 4, !tbaa !16
-  store i32 0, i32* %57, align 4, !tbaa !16
-  call void @pb_SwitchToSubTimer(%struct.pb_TimerSet* %timers, i8* %6, i32 21) #1
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 18)
-  %graph_Z11PrescanRootPjmiS_mii = call i8* @llvm_visc_x86_launch(i8* (i8*)* @LaunchDataflowGraph, i8* %call99)
-  call void @llvm_visc_x86_wait(i8* %graph_Z11PrescanRootPjmiS_mii)
-  call void @pb_SwitchToSubTimer(%struct.pb_TimerSet* %timers, i8* %7, i32 21) #1
-  call void @llvm_visc_request_mem(i8* %call78, i64 8) #1
-  call void @pb_SwitchToSubTimer(%struct.pb_TimerSet* %timers, i8* %8, i32 21) #1
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 18)
-  %graph_Z17IntermediatesRootP5uint2mjjjP6uchar4mii = call i8* @llvm_visc_x86_launch(i8* (i8*)* @LaunchDataflowGraph13, i8* %call102)
-  call void @llvm_visc_x86_wait(i8* %graph_Z17IntermediatesRootP5uint2mjjjP6uchar4mii)
-  call void @pb_SwitchToSubTimer(%struct.pb_TimerSet* %timers, i8* %5, i32 21) #1
-  %92 = load i32* %57, align 4, !tbaa !16
-  %93 = load i32* %34, align 4, !tbaa !16
-  %sub127 = add i32 %92, 1
-  %add128 = sub i32 %sub127, %93
-  store i32 %add128, i32* %91, align 1, !tbaa !16
-  store i32 %93, i32* %63, align 1, !tbaa !16
-  store i32 %92, i32* %64, align 1, !tbaa !16
-  store i32 %93, i32* %sm_range_min1.i, align 1, !tbaa !16
-  store i32 %92, i32* %78, align 1, !tbaa !16
-  call void @pb_SwitchToSubTimer(%struct.pb_TimerSet* %timers, i8* %9, i32 21) #1
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 18)
-  %graph_Z8MainRootP6uchar4mjjjjjPjmS1_mS1_miiii = call i8* @llvm_visc_x86_launch(i8* (i8*)* @LaunchDataflowGraph14, i8* %call103)
-  call void @llvm_visc_x86_wait(i8* %graph_Z8MainRootP6uchar4mjjjjjPjmS1_mS1_miiii)
-  call void @pb_SwitchToSubTimer(%struct.pb_TimerSet* %timers, i8* %10, i32 21) #1
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 18)
-  %graph_Z9FinalRootjjjjPjmS_mS_mS_mii = call i8* @llvm_visc_x86_launch(i8* (i8*)* @LaunchDataflowGraph15, i8* %call110)
-  call void @llvm_visc_x86_wait(i8* %graph_Z9FinalRootjjjjPjmS_mS_mS_mii)
-  call void @pb_SwitchToSubTimer(%struct.pb_TimerSet* %timers, i8* %5, i32 21) #1
-  %inc141 = add nsw i32 %iter.0252, 1
-  %exitcond = icmp eq i32 %inc141, %call7
-  br i1 %exitcond, label %for.end142, label %for.body115
-
-for.end142:                                       ; preds = %for.body115, %for.end
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1
-  call void @llvm_visc_request_mem(i8* %call83, i64 %conv66) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  call void @llvm_visc_untrack_mem(i8* %call77) #1
-  call void @llvm_visc_untrack_mem(i8* %call78) #1
-  call void @llvm_visc_untrack_mem(i8* %call79) #1
-  call void @llvm_visc_untrack_mem(i8* %call80) #1
-  call void @llvm_visc_untrack_mem(i8* %call81) #1
-  call void @llvm_visc_untrack_mem(i8* %call82) #1
-  call void @llvm_visc_untrack_mem(i8* %call83) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0
-  call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr)
-  %Ptr1 = getelementptr [12 x i8]* @1, i64 0, i64 0
-  call void @llvm_visc_printTimerSet(i8** @viscTimerSet_NVPTX, i8* %Ptr1)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 9)
-  %94 = load i8** @graph_Z11PrescanLeafPjmiS_mPfmS0_m.addr
-  call void @llvm_visc_ocl_clearContext(i8* %94)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 9)
-  %95 = load i8** @graph_Z17IntermediatesLeafP5uint2mjjjP6uchar4m.addr
-  call void @llvm_visc_ocl_clearContext(i8* %95)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 9)
-  %96 = load i8** @graph_Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m.addr
-  call void @llvm_visc_ocl_clearContext(i8* %96)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 9)
-  %97 = load i8** @graph_Z9FinalLeafjjjjPjmS_mS_mS_m.addr
-  call void @llvm_visc_ocl_clearContext(i8* %97)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  %Ptr8 = getelementptr [10 x i8]* @2, i64 0, i64 0
-  call void @llvm_visc_printTimerSet(i8** @viscTimerSet_X86, i8* %Ptr8)
-  call void undef()
-  %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0
-  %98 = load i8** %outFile, align 8, !tbaa !12
-  %tobool143 = icmp eq i8* %98, null
-  br i1 %tobool143, label %if.end146, label %if.then144
-
-if.then144:                                       ; preds = %for.end142
-  call void @_Z14dump_histo_imgPhjjPKc(i8* %call83, i32 %76, i32 %59, i8* %98) #1
-  br label %if.end146
-
-if.end146:                                        ; preds = %if.then144, %for.end142
-  call void @free(i8* %call33) #1
-  call void @free(i8* %call77) #1
-  call void @free(i8* %call78) #1
-  call void @free(i8* %call79) #1
-  call void @free(i8* %call80) #1
-  call void @free(i8* %call81) #1
-  call void @free(i8* %call82) #1
-  call void @free(i8* %call83) #1
-  %putchar = call i32 @putchar(i32 10) #1
-  call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1
-  call void @pb_DestroyTimerSet(%struct.pb_TimerSet* %timers) #1
-  br label %cleanup
-
-cleanup:                                          ; preds = %if.end146, %if.then44, %if.then28, %if.else, %if.then2, %entry
-  %retval.0 = phi i32 [ -1, %if.then28 ], [ -1, %if.then44 ], [ 0, %if.end146 ], [ -1, %if.else ], [ -1, %if.then2 ], [ -1, %entry ]
-  call void @llvm.lifetime.end(i64 800, i8* %0) #1
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-
-declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #2
-
-; Function Attrs: nounwind readonly
-declare i32 @atoi(i8* nocapture) #3
-
-; Function Attrs: nounwind
-declare noalias %struct._IO_FILE* @fopen(i8* nocapture, i8* nocapture) #4
-
-; Function Attrs: nounwind
-declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #4
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #4
-
-; Function Attrs: nounwind
-declare i32 @fclose(%struct._IO_FILE* nocapture) #4
-
-declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #2
-
-declare void @pb_AddSubTimer(%struct.pb_TimerSet*, i8*, i32) #2
-
-declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #2
-
-declare void @llvm_visc_track_mem(i8*, i64) #2
-
-declare void @pb_SwitchToSubTimer(%struct.pb_TimerSet*, i8*, i32) #2
-
-declare void @llvm_visc_request_mem(i8*, i64) #2
-
-declare void @llvm_visc_untrack_mem(i8*) #2
-
-declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #2
-
-declare void @_Z14dump_histo_imgPhjjPKc(i8*, i32, i32, i8*) #2
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #4
-
-declare void @pb_FreeParameters(%struct.pb_Parameters*) #2
-
-declare void @pb_DestroyTimerSet(%struct.pb_TimerSet*) #2
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
-
-; Function Attrs: nounwind
-declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
-
-; Function Attrs: nounwind
-declare i32 @putchar(i32) #1
-
-declare i8* @llvm_visc_initializeTimerSet()
-
-declare void @llvm_visc_switchToTimer(i8**, i32)
-
-declare void @llvm_visc_printTimerSet(i8**, i8*)
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1
-
-; Function Attrs: nounwind readonly
-declare float @llvm.sqrt.f32(float) #5
-
-; Function Attrs: nounwind
-declare void @llvm.visc.barrier() #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.atomic.umin(i8*, i32) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.atomic.umax(i8*, i32) #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.atomic.add(i8*, i32) #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
-
-declare i8* @llvm_visc_ocl_launch(i8*, i8*)
-
-declare void @llvm_visc_ocl_wait(i8*)
-
-declare i8* @llvm_visc_ocl_initContext(i32)
-
-declare void @llvm_visc_ocl_clearContext(i8*)
-
-declare void @llvm_visc_ocl_argument_shared(i8*, i32, i64)
-
-declare void @llvm_visc_ocl_argument_scalar(i8*, i8*, i32, i64)
-
-declare i8* @llvm_visc_ocl_argument_ptr(i8*, i8*, i32, i64, i1, i1)
-
-declare i8* @llvm_visc_ocl_output_ptr(i8*, i32, i64)
-
-declare void @llvm_visc_ocl_free(i8*)
-
-declare i8* @llvm_visc_ocl_getOutput(i8*, i8*, i8*, i64)
-
-declare i8* @llvm_visc_ocl_executeNode(i8*, i32, i64*, i64*)
-
-; Function Attrs: nounwind uwtable
-define %struct.out._Z17PrescanAllocationi @_Z17PrescanAllocationi1(i32 %block) #0 {
-entry:
-  %returnStruct = insertvalue %struct.out._Z17PrescanAllocationi undef, i8* null, 0
-  %returnStruct4 = insertvalue %struct.out._Z17PrescanAllocationi %returnStruct, i64 2048, 1
-  %returnStruct45 = insertvalue %struct.out._Z17PrescanAllocationi %returnStruct4, i8* null, 2
-  %returnStruct456 = insertvalue %struct.out._Z17PrescanAllocationi %returnStruct45, i64 2048, 3
-  ret %struct.out._Z17PrescanAllocationi %returnStruct456
-}
-
-define %emptyStruct.9 @_Z11PrescanRootPjmiS_mii2(i32* %input, i64 %bytes_input, i32 %size, i32* %minmax, i64 %bytes_minmax, i32 %block, i32 %grid) {
-entry:
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 14)
-  %graph._Z11PrescanLeafPjmiS_mPfmS0_m = load i8** @graph_Z11PrescanLeafPjmiS_mPfmS0_m.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %input.i8ptr = bitcast i32* %input to i8*
-  %0 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i8* %input.i8ptr, i32 0, i64 %bytes_input, i1 true, i1 false)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_input.ptr = alloca i64
-  store i64 %bytes_input, i64* %bytes_input.ptr
-  %bytes_input.i8ptr = bitcast i64* %bytes_input.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i8* %bytes_input.i8ptr, i32 1, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %size.ptr = alloca i32
-  store i32 %size, i32* %size.ptr
-  %size.i8ptr = bitcast i32* %size.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i8* %size.i8ptr, i32 2, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %minmax.i8ptr = bitcast i32* %minmax to i8*
-  %1 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i8* %minmax.i8ptr, i32 3, i64 %bytes_minmax, i1 true, i1 true)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_minmax.ptr = alloca i64
-  store i64 %bytes_minmax, i64* %bytes_minmax.ptr
-  %bytes_minmax.i8ptr = bitcast i64* %bytes_minmax.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i8* %bytes_minmax.i8ptr, i32 4, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  call void @llvm_visc_ocl_argument_shared(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i32 5, i64 2048)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %.sharedMem.ptr = alloca i64
-  store i64 2048, i64* %.sharedMem.ptr
-  %.sharedMem.i8ptr = bitcast i64* %.sharedMem.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i8* %.sharedMem.i8ptr, i32 6, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  call void @llvm_visc_ocl_argument_shared(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i32 7, i64 2048)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %.sharedMem.ptr1 = alloca i64
-  store i64 2048, i64* %.sharedMem.ptr1
-  %.sharedMem.i8ptr2 = bitcast i64* %.sharedMem.ptr1 to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i8* %.sharedMem.i8ptr2, i32 8, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 17)
-  %LocalWGSize = alloca [1 x i64]
-  %LocalWGSize.0 = bitcast [1 x i64]* %LocalWGSize to i64*
-  %2 = sext i32 %block to i64
-  store i64 %2, i64* %LocalWGSize.0
-  %3 = mul i32 %grid, %block
-  %GlobalWGSize = alloca [1 x i64]
-  %GlobalWGSize.0 = bitcast [1 x i64]* %GlobalWGSize to i64*
-  %4 = sext i32 %3 to i64
-  store i64 %4, i64* %GlobalWGSize.0
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 2)
-  %event._Z11PrescanLeafPjmiS_mPfmS0_m = call i8* @llvm_visc_ocl_executeNode(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m, i32 1, i64* %LocalWGSize.0, i64* %GlobalWGSize.0)
-  call void @llvm_visc_ocl_wait(i8* %graph._Z11PrescanLeafPjmiS_mPfmS0_m)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 13)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 12)
-  call void @llvm_visc_ocl_free(i8* %0)
-  call void @llvm_visc_ocl_free(i8* %1)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 17)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  ret %emptyStruct.9 undef
-}
-
-define %emptyStruct.10 @_Z17IntermediatesRootP5uint2mjjjP6uchar4mii3(%struct.uint2* %input, i64 %bytes_input, i32 %height, i32 %width, i32 %input_pitch, %struct.uchar4* %sm_mappings, i64 %bytes_sm_mappings, i32 %block, i32 %grid) {
-entry:
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 14)
-  %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m = load i8** @graph_Z17IntermediatesLeafP5uint2mjjjP6uchar4m.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %input.i8ptr = bitcast %struct.uint2* %input to i8*
-  %0 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m, i8* %input.i8ptr, i32 0, i64 %bytes_input, i1 true, i1 false)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_input.ptr = alloca i64
-  store i64 %bytes_input, i64* %bytes_input.ptr
-  %bytes_input.i8ptr = bitcast i64* %bytes_input.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m, i8* %bytes_input.i8ptr, i32 1, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %height.ptr = alloca i32
-  store i32 %height, i32* %height.ptr
-  %height.i8ptr = bitcast i32* %height.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m, i8* %height.i8ptr, i32 2, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %width.ptr = alloca i32
-  store i32 %width, i32* %width.ptr
-  %width.i8ptr = bitcast i32* %width.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m, i8* %width.i8ptr, i32 3, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %input_pitch.ptr = alloca i32
-  store i32 %input_pitch, i32* %input_pitch.ptr
-  %input_pitch.i8ptr = bitcast i32* %input_pitch.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m, i8* %input_pitch.i8ptr, i32 4, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %sm_mappings.i8ptr = bitcast %struct.uchar4* %sm_mappings to i8*
-  %1 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m, i8* %sm_mappings.i8ptr, i32 5, i64 %bytes_sm_mappings, i1 false, i1 true)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_sm_mappings.ptr = alloca i64
-  store i64 %bytes_sm_mappings, i64* %bytes_sm_mappings.ptr
-  %bytes_sm_mappings.i8ptr = bitcast i64* %bytes_sm_mappings.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m, i8* %bytes_sm_mappings.i8ptr, i32 6, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 17)
-  %LocalWGSize = alloca [1 x i64]
-  %LocalWGSize.0 = bitcast [1 x i64]* %LocalWGSize to i64*
-  %2 = sext i32 %block to i64
-  store i64 %2, i64* %LocalWGSize.0
-  %3 = mul i32 %grid, %block
-  %GlobalWGSize = alloca [1 x i64]
-  %GlobalWGSize.0 = bitcast [1 x i64]* %GlobalWGSize to i64*
-  %4 = sext i32 %3 to i64
-  store i64 %4, i64* %GlobalWGSize.0
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 2)
-  %event._Z17IntermediatesLeafP5uint2mjjjP6uchar4m = call i8* @llvm_visc_ocl_executeNode(i8* %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m, i32 1, i64* %LocalWGSize.0, i64* %GlobalWGSize.0)
-  call void @llvm_visc_ocl_wait(i8* %graph._Z17IntermediatesLeafP5uint2mjjjP6uchar4m)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 13)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 12)
-  call void @llvm_visc_ocl_free(i8* %0)
-  call void @llvm_visc_ocl_free(i8* %1)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 17)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  ret %emptyStruct.10 undef
-}
-
-; Function Attrs: nounwind uwtable
-define %struct.out._Z14MainAllocationi @_Z14MainAllocationi6(i32 %block) #0 {
-entry:
-  %returnStruct = insertvalue %struct.out._Z14MainAllocationi undef, i8* null, 0
-  %returnStruct2 = insertvalue %struct.out._Z14MainAllocationi %returnStruct, i64 24576, 1
-  ret %struct.out._Z14MainAllocationi %returnStruct2
-}
-
-define %emptyStruct.11 @_Z8MainRootP6uchar4mjjjjjPjmS1_mS1_miiii7(%struct.uchar4* %sm_mappings, i64 %bytes_sm_mappings, i32 %num_elements, i32 %sm_range_min, i32 %sm_range_max, i32 %histo_height, i32 %histo_width, i32* %global_subhisto, i64 %bytes_global_subhisto, i32* %global_histo, i64 %bytes_global_histo, i32* %global_overflow, i64 %bytes_global_overflow, i32 %blockx, i32 %blocky, i32 %gridx, i32 %gridy) {
-entry:
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 14)
-  %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m = load i8** @graph_Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %sm_mappings.i8ptr = bitcast %struct.uchar4* %sm_mappings to i8*
-  %0 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %sm_mappings.i8ptr, i32 0, i64 %bytes_sm_mappings, i1 true, i1 false)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_sm_mappings.ptr = alloca i64
-  store i64 %bytes_sm_mappings, i64* %bytes_sm_mappings.ptr
-  %bytes_sm_mappings.i8ptr = bitcast i64* %bytes_sm_mappings.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %bytes_sm_mappings.i8ptr, i32 1, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %num_elements.ptr = alloca i32
-  store i32 %num_elements, i32* %num_elements.ptr
-  %num_elements.i8ptr = bitcast i32* %num_elements.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %num_elements.i8ptr, i32 2, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %sm_range_min.ptr = alloca i32
-  store i32 %sm_range_min, i32* %sm_range_min.ptr
-  %sm_range_min.i8ptr = bitcast i32* %sm_range_min.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %sm_range_min.i8ptr, i32 3, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %sm_range_max.ptr = alloca i32
-  store i32 %sm_range_max, i32* %sm_range_max.ptr
-  %sm_range_max.i8ptr = bitcast i32* %sm_range_max.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %sm_range_max.i8ptr, i32 4, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %histo_height.ptr = alloca i32
-  store i32 %histo_height, i32* %histo_height.ptr
-  %histo_height.i8ptr = bitcast i32* %histo_height.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %histo_height.i8ptr, i32 5, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %histo_width.ptr = alloca i32
-  store i32 %histo_width, i32* %histo_width.ptr
-  %histo_width.i8ptr = bitcast i32* %histo_width.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %histo_width.i8ptr, i32 6, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %global_subhisto.i8ptr = bitcast i32* %global_subhisto to i8*
-  %1 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %global_subhisto.i8ptr, i32 7, i64 %bytes_global_subhisto, i1 false, i1 true)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_global_subhisto.ptr = alloca i64
-  store i64 %bytes_global_subhisto, i64* %bytes_global_subhisto.ptr
-  %bytes_global_subhisto.i8ptr = bitcast i64* %bytes_global_subhisto.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %bytes_global_subhisto.i8ptr, i32 8, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %global_histo.i8ptr = bitcast i32* %global_histo to i8*
-  %2 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %global_histo.i8ptr, i32 9, i64 %bytes_global_histo, i1 false, i1 true)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_global_histo.ptr = alloca i64
-  store i64 %bytes_global_histo, i64* %bytes_global_histo.ptr
-  %bytes_global_histo.i8ptr = bitcast i64* %bytes_global_histo.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %bytes_global_histo.i8ptr, i32 10, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %global_overflow.i8ptr = bitcast i32* %global_overflow to i8*
-  %3 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %global_overflow.i8ptr, i32 11, i64 %bytes_global_overflow, i1 false, i1 true)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_global_overflow.ptr = alloca i64
-  store i64 %bytes_global_overflow, i64* %bytes_global_overflow.ptr
-  %bytes_global_overflow.i8ptr = bitcast i64* %bytes_global_overflow.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %bytes_global_overflow.i8ptr, i32 12, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  call void @llvm_visc_ocl_argument_shared(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i32 13, i64 24576)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %.sharedMem.ptr = alloca i64
-  store i64 24576, i64* %.sharedMem.ptr
-  %.sharedMem.i8ptr = bitcast i64* %.sharedMem.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i8* %.sharedMem.i8ptr, i32 14, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 17)
-  %LocalWGSize = alloca [2 x i64]
-  %LocalWGSize.0 = bitcast [2 x i64]* %LocalWGSize to i64*
-  %4 = sext i32 %blockx to i64
-  store i64 %4, i64* %LocalWGSize.0
-  %LocalWGSize.1 = getelementptr i64* %LocalWGSize.0, i64 1
-  %5 = sext i32 %blocky to i64
-  store i64 %5, i64* %LocalWGSize.1
-  %6 = mul i32 %gridx, %blockx
-  %7 = mul i32 %gridy, %blocky
-  %GlobalWGSize = alloca [2 x i64]
-  %GlobalWGSize.0 = bitcast [2 x i64]* %GlobalWGSize to i64*
-  %8 = sext i32 %6 to i64
-  store i64 %8, i64* %GlobalWGSize.0
-  %GlobalWGSize.1 = getelementptr i64* %GlobalWGSize.0, i64 1
-  %9 = sext i32 %7 to i64
-  store i64 %9, i64* %GlobalWGSize.1
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 2)
-  %event._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m = call i8* @llvm_visc_ocl_executeNode(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m, i32 2, i64* %LocalWGSize.0, i64* %GlobalWGSize.0)
-  call void @llvm_visc_ocl_wait(i8* %graph._Z8MainLeafP6uchar4mjjjjjPjmS1_mS1_mS1_m)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 13)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 12)
-  call void @llvm_visc_ocl_free(i8* %0)
-  call void @llvm_visc_ocl_free(i8* %1)
-  call void @llvm_visc_ocl_free(i8* %2)
-  call void @llvm_visc_ocl_free(i8* %3)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 17)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  ret %emptyStruct.11 undef
-}
-
-define %emptyStruct.12 @_Z9FinalRootjjjjPjmS_mS_mS_mii10(i32 %sm_range_min, i32 %sm_range_max, i32 %histo_height, i32 %histo_width, i32* %global_subhisto, i64 %bytes_global_subhisto, i32* %global_histo, i64 %bytes_global_histo, i32* %global_overflow, i64 %bytes_global_overflow, i32* %final_histo, i64 %bytes_final_histo, i32 %block, i32 %grid) {
-entry:
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 14)
-  %graph._Z9FinalLeafjjjjPjmS_mS_mS_m = load i8** @graph_Z9FinalLeafjjjjPjmS_mS_mS_m.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %sm_range_min.ptr = alloca i32
-  store i32 %sm_range_min, i32* %sm_range_min.ptr
-  %sm_range_min.i8ptr = bitcast i32* %sm_range_min.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %sm_range_min.i8ptr, i32 0, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %sm_range_max.ptr = alloca i32
-  store i32 %sm_range_max, i32* %sm_range_max.ptr
-  %sm_range_max.i8ptr = bitcast i32* %sm_range_max.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %sm_range_max.i8ptr, i32 1, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %histo_height.ptr = alloca i32
-  store i32 %histo_height, i32* %histo_height.ptr
-  %histo_height.i8ptr = bitcast i32* %histo_height.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %histo_height.i8ptr, i32 2, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %histo_width.ptr = alloca i32
-  store i32 %histo_width, i32* %histo_width.ptr
-  %histo_width.i8ptr = bitcast i32* %histo_width.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %histo_width.i8ptr, i32 3, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %global_subhisto.i8ptr = bitcast i32* %global_subhisto to i8*
-  %0 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %global_subhisto.i8ptr, i32 4, i64 %bytes_global_subhisto, i1 true, i1 false)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_global_subhisto.ptr = alloca i64
-  store i64 %bytes_global_subhisto, i64* %bytes_global_subhisto.ptr
-  %bytes_global_subhisto.i8ptr = bitcast i64* %bytes_global_subhisto.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %bytes_global_subhisto.i8ptr, i32 5, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %global_histo.i8ptr = bitcast i32* %global_histo to i8*
-  %1 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %global_histo.i8ptr, i32 6, i64 %bytes_global_histo, i1 true, i1 false)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_global_histo.ptr = alloca i64
-  store i64 %bytes_global_histo, i64* %bytes_global_histo.ptr
-  %bytes_global_histo.i8ptr = bitcast i64* %bytes_global_histo.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %bytes_global_histo.i8ptr, i32 7, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %global_overflow.i8ptr = bitcast i32* %global_overflow to i8*
-  %2 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %global_overflow.i8ptr, i32 8, i64 %bytes_global_overflow, i1 true, i1 false)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_global_overflow.ptr = alloca i64
-  store i64 %bytes_global_overflow, i64* %bytes_global_overflow.ptr
-  %bytes_global_overflow.i8ptr = bitcast i64* %bytes_global_overflow.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %bytes_global_overflow.i8ptr, i32 9, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 11)
-  %final_histo.i8ptr = bitcast i32* %final_histo to i8*
-  %3 = call i8* @llvm_visc_ocl_argument_ptr(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %final_histo.i8ptr, i32 10, i64 %bytes_final_histo, i1 false, i1 true)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 10)
-  %bytes_final_histo.ptr = alloca i64
-  store i64 %bytes_final_histo, i64* %bytes_final_histo.ptr
-  %bytes_final_histo.i8ptr = bitcast i64* %bytes_final_histo.ptr to i8*
-  call void @llvm_visc_ocl_argument_scalar(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i8* %bytes_final_histo.i8ptr, i32 11, i64 ptrtoint (i64* getelementptr (i64* null, i32 1) to i64))
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 17)
-  %LocalWGSize = alloca [1 x i64]
-  %LocalWGSize.0 = bitcast [1 x i64]* %LocalWGSize to i64*
-  %4 = sext i32 %block to i64
-  store i64 %4, i64* %LocalWGSize.0
-  %5 = mul i32 %grid, %block
-  %GlobalWGSize = alloca [1 x i64]
-  %GlobalWGSize.0 = bitcast [1 x i64]* %GlobalWGSize to i64*
-  %6 = sext i32 %5 to i64
-  store i64 %6, i64* %GlobalWGSize.0
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 2)
-  %event._Z9FinalLeafjjjjPjmS_mS_mS_m = call i8* @llvm_visc_ocl_executeNode(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m, i32 1, i64* %LocalWGSize.0, i64* %GlobalWGSize.0)
-  call void @llvm_visc_ocl_wait(i8* %graph._Z9FinalLeafjjjjPjmS_mS_mS_m)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 13)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 12)
-  call void @llvm_visc_ocl_free(i8* %0)
-  call void @llvm_visc_ocl_free(i8* %1)
-  call void @llvm_visc_ocl_free(i8* %2)
-  call void @llvm_visc_ocl_free(i8* %3)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 17)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_NVPTX, i32 0)
-  ret %emptyStruct.12 undef
-}
-
-declare i8* @llvm_visc_x86_launch(i8* (i8*)*, i8*)
-
-declare void @llvm_visc_x86_wait(i8*)
-
-declare i8* @llvm_visc_streamLaunch(void (i8*, i8*)*, i8*)
-
-declare void @llvm_visc_streamPush(i8*, i8*)
-
-declare i8* @llvm_visc_streamPop(i8*)
-
-declare void @llvm_visc_streamWait(i8*)
-
-declare i8* @llvm_visc_createBindInBuffer(i8*, i64, i32)
-
-declare i8* @llvm_visc_createBindOutBuffer(i8*, i64)
-
-declare i8* @llvm_visc_createEdgeBuffer(i8*, i64)
-
-declare i8* @llvm_visc_createLastInputBuffer(i8*, i64)
-
-declare void @llvm_visc_createThread(i8*, i8* (i8*)*, i8*)
-
-declare void @llvm_visc_bufferPush(i8*, i64)
-
-declare i64 @llvm_visc_bufferPop(i8*)
-
-declare void @llvm_visc_x86_dstack_push(i32, i32, i32, i32, i32, i32, i32)
-
-declare void @llvm_visc_x86_dstack_pop()
-
-declare i32 @llvm_visc_x86_getDimLimit(i32, i32)
-
-declare i32 @llvm_visc_x86_getDimInstance(i32, i32)
-
-define i8* @LaunchDataflowGraph(i8* %data.addr) {
-entry:
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 20)
-  %input.addr = bitcast i8* %data.addr to i32**
-  %input = load i32** %input.addr
-  %nextArg = getelementptr i32** %input.addr, i64 1
-  %bytes_input.addr = bitcast i32** %nextArg to i64*
-  %bytes_input = load i64* %bytes_input.addr
-  %nextArg1 = getelementptr i64* %bytes_input.addr, i64 1
-  %size.addr = bitcast i64* %nextArg1 to i32*
-  %size = load i32* %size.addr
-  %nextArg2 = getelementptr i32* %size.addr, i64 1
-  %minmax.addr = bitcast i32* %nextArg2 to i32**
-  %minmax = load i32** %minmax.addr
-  %nextArg3 = getelementptr i32** %minmax.addr, i64 1
-  %bytes_minmax.addr = bitcast i32** %nextArg3 to i64*
-  %bytes_minmax = load i64* %bytes_minmax.addr
-  %nextArg4 = getelementptr i64* %bytes_minmax.addr, i64 1
-  %block.addr = bitcast i64* %nextArg4 to i32*
-  %block = load i32* %block.addr
-  %nextArg5 = getelementptr i32* %block.addr, i64 1
-  %grid.addr = bitcast i32* %nextArg5 to i32*
-  %grid = load i32* %grid.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 21)
-  %_Z11PrescanRootPjmiS_mii2.output = call %emptyStruct.9 @_Z11PrescanRootPjmiS_mii2(i32* %input, i64 %bytes_input, i32 %size, i32* %minmax, i64 %bytes_minmax, i32 %block, i32 %grid)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 22)
-  %_Z11PrescanRootPjmiS_mii2.output.addr = bitcast i8* %data.addr to %emptyStruct.9*
-  store %emptyStruct.9 %_Z11PrescanRootPjmiS_mii2.output, %emptyStruct.9* %_Z11PrescanRootPjmiS_mii2.output.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 0)
-  ret i8* null
-}
-
-define i8* @LaunchDataflowGraph13(i8* %data.addr) {
-entry:
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 20)
-  %input.addr = bitcast i8* %data.addr to %struct.uint2**
-  %input = load %struct.uint2** %input.addr
-  %nextArg = getelementptr %struct.uint2** %input.addr, i64 1
-  %bytes_input.addr = bitcast %struct.uint2** %nextArg to i64*
-  %bytes_input = load i64* %bytes_input.addr
-  %nextArg1 = getelementptr i64* %bytes_input.addr, i64 1
-  %height.addr = bitcast i64* %nextArg1 to i32*
-  %height = load i32* %height.addr
-  %nextArg2 = getelementptr i32* %height.addr, i64 1
-  %width.addr = bitcast i32* %nextArg2 to i32*
-  %width = load i32* %width.addr
-  %nextArg3 = getelementptr i32* %width.addr, i64 1
-  %input_pitch.addr = bitcast i32* %nextArg3 to i32*
-  %input_pitch = load i32* %input_pitch.addr
-  %nextArg4 = getelementptr i32* %input_pitch.addr, i64 1
-  %sm_mappings.addr = bitcast i32* %nextArg4 to %struct.uchar4**
-  %sm_mappings = load %struct.uchar4** %sm_mappings.addr
-  %nextArg5 = getelementptr %struct.uchar4** %sm_mappings.addr, i64 1
-  %bytes_sm_mappings.addr = bitcast %struct.uchar4** %nextArg5 to i64*
-  %bytes_sm_mappings = load i64* %bytes_sm_mappings.addr
-  %nextArg6 = getelementptr i64* %bytes_sm_mappings.addr, i64 1
-  %block.addr = bitcast i64* %nextArg6 to i32*
-  %block = load i32* %block.addr
-  %nextArg7 = getelementptr i32* %block.addr, i64 1
-  %grid.addr = bitcast i32* %nextArg7 to i32*
-  %grid = load i32* %grid.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 21)
-  %_Z17IntermediatesRootP5uint2mjjjP6uchar4mii3.output = call %emptyStruct.10 @_Z17IntermediatesRootP5uint2mjjjP6uchar4mii3(%struct.uint2* %input, i64 %bytes_input, i32 %height, i32 %width, i32 %input_pitch, %struct.uchar4* %sm_mappings, i64 %bytes_sm_mappings, i32 %block, i32 %grid)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 22)
-  %_Z17IntermediatesRootP5uint2mjjjP6uchar4mii3.output.addr = bitcast i8* %data.addr to %emptyStruct.10*
-  store %emptyStruct.10 %_Z17IntermediatesRootP5uint2mjjjP6uchar4mii3.output, %emptyStruct.10* %_Z17IntermediatesRootP5uint2mjjjP6uchar4mii3.output.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 0)
-  ret i8* null
-}
-
-define i8* @LaunchDataflowGraph14(i8* %data.addr) {
-entry:
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 20)
-  %sm_mappings.addr = bitcast i8* %data.addr to %struct.uchar4**
-  %sm_mappings = load %struct.uchar4** %sm_mappings.addr
-  %nextArg = getelementptr %struct.uchar4** %sm_mappings.addr, i64 1
-  %bytes_sm_mappings.addr = bitcast %struct.uchar4** %nextArg to i64*
-  %bytes_sm_mappings = load i64* %bytes_sm_mappings.addr
-  %nextArg1 = getelementptr i64* %bytes_sm_mappings.addr, i64 1
-  %num_elements.addr = bitcast i64* %nextArg1 to i32*
-  %num_elements = load i32* %num_elements.addr
-  %nextArg2 = getelementptr i32* %num_elements.addr, i64 1
-  %sm_range_min.addr = bitcast i32* %nextArg2 to i32*
-  %sm_range_min = load i32* %sm_range_min.addr
-  %nextArg3 = getelementptr i32* %sm_range_min.addr, i64 1
-  %sm_range_max.addr = bitcast i32* %nextArg3 to i32*
-  %sm_range_max = load i32* %sm_range_max.addr
-  %nextArg4 = getelementptr i32* %sm_range_max.addr, i64 1
-  %histo_height.addr = bitcast i32* %nextArg4 to i32*
-  %histo_height = load i32* %histo_height.addr
-  %nextArg5 = getelementptr i32* %histo_height.addr, i64 1
-  %histo_width.addr = bitcast i32* %nextArg5 to i32*
-  %histo_width = load i32* %histo_width.addr
-  %nextArg6 = getelementptr i32* %histo_width.addr, i64 1
-  %global_subhisto.addr = bitcast i32* %nextArg6 to i32**
-  %global_subhisto = load i32** %global_subhisto.addr
-  %nextArg7 = getelementptr i32** %global_subhisto.addr, i64 1
-  %bytes_global_subhisto.addr = bitcast i32** %nextArg7 to i64*
-  %bytes_global_subhisto = load i64* %bytes_global_subhisto.addr
-  %nextArg8 = getelementptr i64* %bytes_global_subhisto.addr, i64 1
-  %global_histo.addr = bitcast i64* %nextArg8 to i32**
-  %global_histo = load i32** %global_histo.addr
-  %nextArg9 = getelementptr i32** %global_histo.addr, i64 1
-  %bytes_global_histo.addr = bitcast i32** %nextArg9 to i64*
-  %bytes_global_histo = load i64* %bytes_global_histo.addr
-  %nextArg10 = getelementptr i64* %bytes_global_histo.addr, i64 1
-  %global_overflow.addr = bitcast i64* %nextArg10 to i32**
-  %global_overflow = load i32** %global_overflow.addr
-  %nextArg11 = getelementptr i32** %global_overflow.addr, i64 1
-  %bytes_global_overflow.addr = bitcast i32** %nextArg11 to i64*
-  %bytes_global_overflow = load i64* %bytes_global_overflow.addr
-  %nextArg12 = getelementptr i64* %bytes_global_overflow.addr, i64 1
-  %blockx.addr = bitcast i64* %nextArg12 to i32*
-  %blockx = load i32* %blockx.addr
-  %nextArg13 = getelementptr i32* %blockx.addr, i64 1
-  %blocky.addr = bitcast i32* %nextArg13 to i32*
-  %blocky = load i32* %blocky.addr
-  %nextArg14 = getelementptr i32* %blocky.addr, i64 1
-  %gridx.addr = bitcast i32* %nextArg14 to i32*
-  %gridx = load i32* %gridx.addr
-  %nextArg15 = getelementptr i32* %gridx.addr, i64 1
-  %gridy.addr = bitcast i32* %nextArg15 to i32*
-  %gridy = load i32* %gridy.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 21)
-  %_Z8MainRootP6uchar4mjjjjjPjmS1_mS1_miiii7.output = call %emptyStruct.11 @_Z8MainRootP6uchar4mjjjjjPjmS1_mS1_miiii7(%struct.uchar4* %sm_mappings, i64 %bytes_sm_mappings, i32 %num_elements, i32 %sm_range_min, i32 %sm_range_max, i32 %histo_height, i32 %histo_width, i32* %global_subhisto, i64 %bytes_global_subhisto, i32* %global_histo, i64 %bytes_global_histo, i32* %global_overflow, i64 %bytes_global_overflow, i32 %blockx, i32 %blocky, i32 %gridx, i32 %gridy)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 22)
-  %_Z8MainRootP6uchar4mjjjjjPjmS1_mS1_miiii7.output.addr = bitcast i8* %data.addr to %emptyStruct.11*
-  store %emptyStruct.11 %_Z8MainRootP6uchar4mjjjjjPjmS1_mS1_miiii7.output, %emptyStruct.11* %_Z8MainRootP6uchar4mjjjjjPjmS1_mS1_miiii7.output.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 0)
-  ret i8* null
-}
-
-define i8* @LaunchDataflowGraph15(i8* %data.addr) {
-entry:
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 20)
-  %sm_range_min.addr = bitcast i8* %data.addr to i32*
-  %sm_range_min = load i32* %sm_range_min.addr
-  %nextArg = getelementptr i32* %sm_range_min.addr, i64 1
-  %sm_range_max.addr = bitcast i32* %nextArg to i32*
-  %sm_range_max = load i32* %sm_range_max.addr
-  %nextArg1 = getelementptr i32* %sm_range_max.addr, i64 1
-  %histo_height.addr = bitcast i32* %nextArg1 to i32*
-  %histo_height = load i32* %histo_height.addr
-  %nextArg2 = getelementptr i32* %histo_height.addr, i64 1
-  %histo_width.addr = bitcast i32* %nextArg2 to i32*
-  %histo_width = load i32* %histo_width.addr
-  %nextArg3 = getelementptr i32* %histo_width.addr, i64 1
-  %global_subhisto.addr = bitcast i32* %nextArg3 to i32**
-  %global_subhisto = load i32** %global_subhisto.addr
-  %nextArg4 = getelementptr i32** %global_subhisto.addr, i64 1
-  %bytes_global_subhisto.addr = bitcast i32** %nextArg4 to i64*
-  %bytes_global_subhisto = load i64* %bytes_global_subhisto.addr
-  %nextArg5 = getelementptr i64* %bytes_global_subhisto.addr, i64 1
-  %global_histo.addr = bitcast i64* %nextArg5 to i32**
-  %global_histo = load i32** %global_histo.addr
-  %nextArg6 = getelementptr i32** %global_histo.addr, i64 1
-  %bytes_global_histo.addr = bitcast i32** %nextArg6 to i64*
-  %bytes_global_histo = load i64* %bytes_global_histo.addr
-  %nextArg7 = getelementptr i64* %bytes_global_histo.addr, i64 1
-  %global_overflow.addr = bitcast i64* %nextArg7 to i32**
-  %global_overflow = load i32** %global_overflow.addr
-  %nextArg8 = getelementptr i32** %global_overflow.addr, i64 1
-  %bytes_global_overflow.addr = bitcast i32** %nextArg8 to i64*
-  %bytes_global_overflow = load i64* %bytes_global_overflow.addr
-  %nextArg9 = getelementptr i64* %bytes_global_overflow.addr, i64 1
-  %final_histo.addr = bitcast i64* %nextArg9 to i32**
-  %final_histo = load i32** %final_histo.addr
-  %nextArg10 = getelementptr i32** %final_histo.addr, i64 1
-  %bytes_final_histo.addr = bitcast i32** %nextArg10 to i64*
-  %bytes_final_histo = load i64* %bytes_final_histo.addr
-  %nextArg11 = getelementptr i64* %bytes_final_histo.addr, i64 1
-  %block.addr = bitcast i64* %nextArg11 to i32*
-  %block = load i32* %block.addr
-  %nextArg12 = getelementptr i32* %block.addr, i64 1
-  %grid.addr = bitcast i32* %nextArg12 to i32*
-  %grid = load i32* %grid.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 21)
-  %_Z9FinalRootjjjjPjmS_mS_mS_mii10.output = call %emptyStruct.12 @_Z9FinalRootjjjjPjmS_mS_mS_mii10(i32 %sm_range_min, i32 %sm_range_max, i32 %histo_height, i32 %histo_width, i32* %global_subhisto, i64 %bytes_global_subhisto, i32* %global_histo, i64 %bytes_global_histo, i32* %global_overflow, i64 %bytes_global_overflow, i32* %final_histo, i64 %bytes_final_histo, i32 %block, i32 %grid)
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 22)
-  %_Z9FinalRootjjjjPjmS_mS_mS_mii10.output.addr = bitcast i8* %data.addr to %emptyStruct.12*
-  store %emptyStruct.12 %_Z9FinalRootjjjjPjmS_mS_mS_mii10.output, %emptyStruct.12* %_Z9FinalRootjjjjPjmS_mS_mS_mii10.output.addr
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_X86, i32 0)
-  ret i8* null
-}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind readonly }
-
-!visc_hint_gpu = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9}
-!visc_hint_spir = !{!10, !11}
-
-!0 = metadata !{%emptyStruct (i32*, i64, i32, i32*, i64, float*, i64, float*, i64)* undef}
-!1 = metadata !{%emptyStruct.2 (i32*, i64, i32, i32*, i64, i32)* undef}
-!2 = metadata !{%emptyStruct.3 (%struct.uint2*, i64, i32, i32, i32, %struct.uchar4*, i64)* undef}
-!3 = metadata !{%emptyStruct.4 (%struct.uint2*, i64, i32, i32, i32, %struct.uchar4*, i64, i32)* undef}
-!4 = metadata !{%emptyStruct.5 (%struct.uchar4*, i64, i32, i32, i32, i32, i32, i32*, i64, i32*, i64, i32*, i64, i32*, i64)* undef}
-!5 = metadata !{%emptyStruct.6 (%struct.uchar4*, i64, i32, i32, i32, i32, i32, i32*, i64, i32*, i64, i32*, i64, i32, i32, i32, i32)* undef}
-!6 = metadata !{%emptyStruct.11 (%struct.uchar4*, i64, i32, i32, i32, i32, i32, i32*, i64, i32*, i64, i32*, i64, i32, i32, i32, i32)* undef}
-!7 = metadata !{%emptyStruct.7 (i32, i32, i32, i32, i32*, i64, i32*, i64, i32*, i64, i32*, i64)* undef}
-!8 = metadata !{%emptyStruct.8 (i32, i32, i32, i32, i32*, i64, i32*, i64, i32*, i64, i32*, i64, i32)* undef}
-!9 = metadata !{%emptyStruct.12 (i32, i32, i32, i32, i32*, i64, i32*, i64, i32*, i64, i32*, i64, i32, i32)* undef}
-!10 = metadata !{%emptyStruct.9 (i32*, i64, i32, i32*, i64, i32, i32)* undef}
-!11 = metadata !{%emptyStruct.10 (%struct.uint2*, i64, i32, i32, i32, %struct.uchar4*, i64, i32, i32)* undef}
-!12 = metadata !{metadata !"any pointer", metadata !13}
-!13 = metadata !{metadata !"omnipotent char", metadata !14}
-!14 = metadata !{metadata !"Simple C/C++ TBAA"}
-!15 = metadata !{metadata !"long", metadata !13}
-!16 = metadata !{metadata !"int", metadata !13}
diff --git a/hpvm/test/parboil/benchmarks/histo/histo.nvptx.s b/hpvm/test/parboil/benchmarks/histo/histo.nvptx.s
deleted file mode 100644
index 798e6d2bca..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/histo.nvptx.s
+++ /dev/null
@@ -1,1846 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Wed Feb 12 23:02:37 2014 (1392267757)
-// Driver 331.49
-//
-
-.version 3.0
-.target sm_30, texmode_independent
-.address_size 32
-
-
-.entry histo_prescan_kernel(
-	.param .u32 .ptr .global .align 4 histo_prescan_kernel_param_0,
-	.param .u64 histo_prescan_kernel_param_1,
-	.param .u32 histo_prescan_kernel_param_2,
-	.param .u32 .ptr .global .align 4 histo_prescan_kernel_param_3,
-	.param .u64 histo_prescan_kernel_param_4,
-	.param .u32 .ptr .shared .align 4 histo_prescan_kernel_param_5,
-	.param .u64 histo_prescan_kernel_param_6,
-	.param .u32 .ptr .shared .align 4 histo_prescan_kernel_param_7,
-	.param .u64 histo_prescan_kernel_param_8
-)
-{
-	.reg .f32 	%f<83>;
-	.reg .pred 	%p<13>;
-	.reg .s32 	%r<58>;
-
-
-	ld.param.u32 	%r22, [histo_prescan_kernel_param_2];
-	// inline asm
-	mov.u32 	%r17, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r18, %ntid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r19, %envreg0;
-	// inline asm
-	// inline asm
-	mov.u32 	%r20, %ctaid.x;
-	// inline asm
-	add.s32 	%r23, %r20, %r19;
-	// inline asm
-	mov.u32 	%r21, %envreg6;
-	// inline asm
-	div.u32 	%r24, %r22, %r21;
-	mad.lo.s32 	%r56, %r24, %r23, %r17;
-	shr.s32 	%r25, %r24, 31;
-	shr.u32 	%r26, %r25, 29;
-	add.s32 	%r27, %r24, %r26;
-	shr.s32 	%r28, %r27, 3;
-	mad.lo.s32 	%r8, %r24, %r23, %r28;
-	setp.lt.s32 	%p1, %r56, %r8;
-	@%p1 bra 	BB0_2;
-
-	mov.f32 	%f81, 0f00000000;
-	mov.f32 	%f80, %f81;
-	bra.uni 	BB0_5;
-
-BB0_2:
-	mov.f32 	%f81, 0f00000000;
-	mov.u32 	%r54, 0;
-	mov.u32 	%r57, %r56;
-
-BB0_3:
-	shl.b32 	%r30, %r57, 2;
-	ld.param.u32 	%r48, [histo_prescan_kernel_param_0];
-	add.s32 	%r31, %r48, %r30;
-	ld.global.u32 	%r32, [%r31];
-	cvt.rn.f32.u32 	%f13, %r32;
-	add.f32 	%f81, %f81, %f13;
-	add.s32 	%r54, %r54, 1;
-	add.s32 	%r57, %r57, %r18;
-	setp.lt.s32 	%p2, %r57, %r8;
-	@%p2 bra 	BB0_3;
-
-	cvt.rn.f32.u32 	%f80, %r54;
-
-BB0_5:
-	shl.b32 	%r33, %r17, 2;
-	ld.param.u32 	%r51, [histo_prescan_kernel_param_5];
-	add.s32 	%r13, %r51, %r33;
-	div.full.f32 	%f6, %f81, %f80;
-	st.shared.f32 	[%r13], %f6;
-	@%p1 bra 	BB0_7;
-
-	mov.f32 	%f82, 0f00000000;
-	bra.uni 	BB0_9;
-
-BB0_7:
-	mov.f32 	%f82, 0f00000000;
-
-BB0_8:
-	shl.b32 	%r34, %r56, 2;
-	ld.param.u32 	%r47, [histo_prescan_kernel_param_0];
-	add.s32 	%r35, %r47, %r34;
-	ld.global.u32 	%r36, [%r35];
-	cvt.rn.f32.u32 	%f16, %r36;
-	sub.f32 	%f17, %f16, %f6;
-	fma.rn.f32 	%f82, %f17, %f17, %f82;
-	add.s32 	%r56, %r56, %r18;
-	setp.lt.s32 	%p3, %r56, %r8;
-	@%p3 bra 	BB0_8;
-
-BB0_9:
-	div.full.f32 	%f19, %f82, %f80;
-	// inline asm
-	sqrt.approx.f32 	%f18, %f19;
-	// inline asm
-	ld.param.u32 	%r53, [histo_prescan_kernel_param_7];
-	add.s32 	%r16, %r53, %r33;
-	st.shared.f32 	[%r16], %f18;
-	bar.sync 	0;
-	setp.gt.s32 	%p4, %r17, 255;
-	@%p4 bra 	BB0_11;
-
-	ld.shared.f32 	%f20, [%r13];
-	ld.shared.f32 	%f21, [%r13+1024];
-	add.f32 	%f22, %f20, %f21;
-	st.shared.f32 	[%r13], %f22;
-	ld.shared.f32 	%f23, [%r16];
-	ld.shared.f32 	%f24, [%r16+1024];
-	add.f32 	%f25, %f23, %f24;
-	st.shared.f32 	[%r16], %f25;
-
-BB0_11:
-	bar.sync 	0;
-	setp.gt.s32 	%p5, %r17, 127;
-	@%p5 bra 	BB0_13;
-
-	ld.shared.f32 	%f26, [%r13];
-	ld.shared.f32 	%f27, [%r13+512];
-	add.f32 	%f28, %f26, %f27;
-	st.shared.f32 	[%r13], %f28;
-	ld.shared.f32 	%f29, [%r16];
-	ld.shared.f32 	%f30, [%r16+512];
-	add.f32 	%f31, %f29, %f30;
-	st.shared.f32 	[%r16], %f31;
-
-BB0_13:
-	bar.sync 	0;
-	setp.gt.s32 	%p6, %r17, 63;
-	@%p6 bra 	BB0_15;
-
-	ld.shared.f32 	%f32, [%r13];
-	ld.shared.f32 	%f33, [%r13+256];
-	add.f32 	%f34, %f32, %f33;
-	st.shared.f32 	[%r13], %f34;
-	ld.shared.f32 	%f35, [%r16];
-	ld.shared.f32 	%f36, [%r16+256];
-	add.f32 	%f37, %f35, %f36;
-	st.shared.f32 	[%r16], %f37;
-
-BB0_15:
-	bar.sync 	0;
-	setp.gt.s32 	%p7, %r17, 31;
-	@%p7 bra 	BB0_17;
-
-	ld.shared.f32 	%f38, [%r13];
-	ld.shared.f32 	%f39, [%r13+128];
-	add.f32 	%f40, %f38, %f39;
-	st.shared.f32 	[%r13], %f40;
-	ld.shared.f32 	%f41, [%r16];
-	ld.shared.f32 	%f42, [%r16+128];
-	add.f32 	%f43, %f41, %f42;
-	st.shared.f32 	[%r16], %f43;
-
-BB0_17:
-	setp.gt.s32 	%p8, %r17, 15;
-	@%p8 bra 	BB0_19;
-
-	ld.shared.f32 	%f44, [%r13];
-	ld.shared.f32 	%f45, [%r13+64];
-	add.f32 	%f46, %f44, %f45;
-	st.shared.f32 	[%r13], %f46;
-	ld.shared.f32 	%f47, [%r16];
-	ld.shared.f32 	%f48, [%r16+64];
-	add.f32 	%f49, %f47, %f48;
-	st.shared.f32 	[%r16], %f49;
-
-BB0_19:
-	setp.gt.s32 	%p9, %r17, 7;
-	@%p9 bra 	BB0_21;
-
-	ld.shared.f32 	%f50, [%r13];
-	ld.shared.f32 	%f51, [%r13+32];
-	add.f32 	%f52, %f50, %f51;
-	st.shared.f32 	[%r13], %f52;
-	ld.shared.f32 	%f53, [%r16];
-	ld.shared.f32 	%f54, [%r16+32];
-	add.f32 	%f55, %f53, %f54;
-	st.shared.f32 	[%r16], %f55;
-
-BB0_21:
-	setp.gt.s32 	%p10, %r17, 3;
-	@%p10 bra 	BB0_23;
-
-	ld.shared.f32 	%f56, [%r13];
-	ld.shared.f32 	%f57, [%r13+16];
-	add.f32 	%f58, %f56, %f57;
-	st.shared.f32 	[%r13], %f58;
-	ld.shared.f32 	%f59, [%r16];
-	ld.shared.f32 	%f60, [%r16+16];
-	add.f32 	%f61, %f59, %f60;
-	st.shared.f32 	[%r16], %f61;
-
-BB0_23:
-	setp.gt.s32 	%p11, %r17, 1;
-	@%p11 bra 	BB0_25;
-
-	ld.shared.f32 	%f62, [%r13];
-	ld.shared.f32 	%f63, [%r13+8];
-	add.f32 	%f64, %f62, %f63;
-	st.shared.f32 	[%r13], %f64;
-	ld.shared.f32 	%f65, [%r16];
-	ld.shared.f32 	%f66, [%r16+8];
-	add.f32 	%f67, %f65, %f66;
-	st.shared.f32 	[%r16], %f67;
-
-BB0_25:
-	setp.eq.s32 	%p12, %r17, 0;
-	@%p12 bra 	BB0_27;
-
-	ret;
-
-BB0_27:
-	ld.param.u32 	%r50, [histo_prescan_kernel_param_5];
-	ld.shared.f32 	%f68, [%r50];
-	ld.shared.f32 	%f69, [%r50+4];
-	add.f32 	%f70, %f68, %f69;
-	div.full.f32 	%f71, %f70, 0f44000000;
-	ld.param.u32 	%r52, [histo_prescan_kernel_param_7];
-	ld.shared.f32 	%f72, [%r52+4];
-	ld.shared.f32 	%f73, [%r52];
-	add.f32 	%f74, %f73, %f72;
-	div.full.f32 	%f75, %f74, 0f44000000;
-	mov.f32 	%f76, 0f41200000;
-	neg.f32 	%f77, %f75;
-	fma.rn.f32 	%f78, %f77, %f76, %f71;
-	cvt.rzi.u32.f32 	%r38, %f78;
-	mul.hi.u32 	%r39, %r38, -1431655765;
-	shr.u32 	%r40, %r39, 14;
-	ld.param.u32 	%r49, [histo_prescan_kernel_param_3];
-	atom.global.min.u32 	%r41, [%r49], %r40;
-	fma.rn.f32 	%f79, %f75, 0f41200000, %f71;
-	cvt.rzi.u32.f32 	%r42, %f79;
-	mul.hi.u32 	%r43, %r42, -1431655765;
-	shr.u32 	%r44, %r43, 14;
-	add.s32 	%r45, %r49, 4;
-	atom.global.max.u32 	%r46, [%r45], %r44;
-	ret;
-}
-
-.entry calculateBin(
-	.param .u32 calculateBin_param_0,
-	.param .u32 .ptr .global .align 4 calculateBin_param_1
-)
-{
-	.reg .s32 	%r<13>;
-	.reg .s16 	%rc<9>;
-
-
-	ld.param.u32 	%r1, [calculateBin_param_0];
-	ld.param.u32 	%r2, [calculateBin_param_1];
-	shr.u32 	%r3, %r1, 2;
-	cvt.u8.u32 	%rc1, %r3;
-	shr.u32 	%r4, %r1, 10;
-	mul.hi.u32 	%r5, %r4, -1431655765;
-	shr.u32 	%r6, %r5, 4;
-	mul.lo.s32 	%r7, %r6, 24;
-	sub.s32 	%r8, %r4, %r7;
-	cvt.u8.u32 	%rc2, %r8;
-	mul.hi.u32 	%r9, %r1, -1431655765;
-	shr.u32 	%r10, %r9, 14;
-	cvt.u8.u32 	%rc3, %r10;
-	shl.b32 	%r11, %r1, 3;
-	and.b32  	%r12, %r11, 24;
-	cvt.u8.u32 	%rc4, %r12;
-	st.global.v4.u8 	[%r2], {%rc3, %rc2, %rc1, %rc4};
-	ret;
-}
-
-.entry histo_intermediates_kernel(
-	.param .u32 .ptr .global .align 8 histo_intermediates_kernel_param_0,
-	.param .u64 histo_intermediates_kernel_param_1,
-	.param .u32 histo_intermediates_kernel_param_2,
-	.param .u32 histo_intermediates_kernel_param_3,
-	.param .u32 histo_intermediates_kernel_param_4,
-	.param .u32 .ptr .global .align 4 histo_intermediates_kernel_param_5,
-	.param .u64 histo_intermediates_kernel_param_6
-)
-{
-	.reg .pred 	%p<4>;
-	.reg .s32 	%r<520>;
-	.reg .s16 	%rc<257>;
-
-
-	ld.param.u32 	%r57, [histo_intermediates_kernel_param_0];
-	ld.param.u32 	%r1, [histo_intermediates_kernel_param_3];
-	ld.param.u32 	%r2, [histo_intermediates_kernel_param_4];
-	ld.param.u32 	%r3, [histo_intermediates_kernel_param_5];
-	// inline asm
-	mov.u32 	%r53, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r54, %ntid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r55, %envreg0;
-	// inline asm
-	// inline asm
-	mov.u32 	%r56, %ctaid.x;
-	// inline asm
-	add.s32 	%r58, %r54, -1;
-	setp.eq.s32 	%p2, %r53, %r58;
-	{
-	.reg .b32 temp;
-	and.b32	 temp, %r1, 1;
-	setp.b32.eq 	 %p3, temp, 1;
-	}
-	and.pred  	%p1, %p2, %p3;
-	add.s32 	%r59, %r56, %r55;
-	mul.lo.s32 	%r60, %r59, %r1;
-	shl.b32 	%r61, %r60, 4;
-	add.s32 	%r62, %r53, %r61;
-	add.s32 	%r63, %r53, %r54;
-	add.s32 	%r4, %r63, %r61;
-	mul.lo.s32 	%r64, %r59, %r2;
-	shl.b32 	%r65, %r64, 4;
-	add.s32 	%r66, %r53, %r65;
-	shl.b32 	%r67, %r66, 3;
-	add.s32 	%r5, %r57, %r67;
-	shl.b32 	%r68, %r4, 2;
-	add.s32 	%r6, %r3, %r68;
-	shl.b32 	%r69, %r62, 2;
-	add.s32 	%r7, %r3, %r69;
-	ld.global.v2.u32 	{%r487, %r488}, [%r5];
-	shr.u32 	%r71, %r487, 2;
-	cvt.u8.u32 	%rc1, %r71;
-	shr.u32 	%r72, %r487, 10;
-	mul.hi.u32 	%r73, %r72, -1431655765;
-	shr.u32 	%r74, %r73, 4;
-	mul.lo.s32 	%r75, %r74, 24;
-	sub.s32 	%r76, %r72, %r75;
-	cvt.u8.u32 	%rc2, %r76;
-	mul.hi.u32 	%r77, %r487, -1431655765;
-	shr.u32 	%r78, %r77, 14;
-	cvt.u8.u32 	%rc3, %r78;
-	shl.b32 	%r79, %r487, 3;
-	and.b32  	%r80, %r79, 24;
-	cvt.u8.u32 	%rc4, %r80;
-	st.global.v4.u8 	[%r7], {%rc3, %rc2, %rc1, %rc4};
-	@%p1 bra 	BB2_2;
-
-	shr.u32 	%r82, %r488, 2;
-	cvt.u8.u32 	%rc5, %r82;
-	shr.u32 	%r83, %r488, 10;
-	mul.hi.u32 	%r84, %r83, -1431655765;
-	shr.u32 	%r85, %r84, 4;
-	mul.lo.s32 	%r86, %r85, 24;
-	sub.s32 	%r87, %r83, %r86;
-	cvt.u8.u32 	%rc6, %r87;
-	mul.hi.u32 	%r88, %r488, -1431655765;
-	shr.u32 	%r89, %r88, 14;
-	cvt.u8.u32 	%rc7, %r89;
-	shl.b32 	%r90, %r488, 3;
-	and.b32  	%r91, %r90, 24;
-	cvt.u8.u32 	%rc8, %r91;
-	st.global.v4.u8 	[%r6], {%rc7, %rc6, %rc5, %rc8};
-
-BB2_2:
-	ld.param.u32 	%r503, [histo_intermediates_kernel_param_3];
-	add.s32 	%r92, %r4, %r503;
-	shl.b32 	%r93, %r92, 2;
-	ld.param.u32 	%r519, [histo_intermediates_kernel_param_5];
-	add.s32 	%r8, %r519, %r93;
-	ld.param.u32 	%r504, [histo_intermediates_kernel_param_4];
-	shl.b32 	%r9, %r504, 3;
-	add.s32 	%r10, %r5, %r9;
-	ld.global.v2.u32 	{%r485, %r486}, [%r10];
-	shr.u32 	%r95, %r485, 2;
-	cvt.u8.u32 	%rc9, %r95;
-	shr.u32 	%r96, %r485, 10;
-	mul.hi.u32 	%r97, %r96, -1431655765;
-	shr.u32 	%r98, %r97, 4;
-	mul.lo.s32 	%r99, %r98, 24;
-	sub.s32 	%r100, %r96, %r99;
-	cvt.u8.u32 	%rc10, %r100;
-	mul.hi.u32 	%r101, %r485, -1431655765;
-	shr.u32 	%r102, %r101, 14;
-	cvt.u8.u32 	%rc11, %r102;
-	shl.b32 	%r103, %r485, 3;
-	and.b32  	%r104, %r103, 24;
-	cvt.u8.u32 	%rc12, %r104;
-	shl.b32 	%r11, %r503, 2;
-	add.s32 	%r12, %r7, %r11;
-	st.global.v4.u8 	[%r12], {%rc11, %rc10, %rc9, %rc12};
-	@%p1 bra 	BB2_4;
-
-	shr.u32 	%r106, %r486, 2;
-	cvt.u8.u32 	%rc13, %r106;
-	shr.u32 	%r107, %r486, 10;
-	mul.hi.u32 	%r108, %r107, -1431655765;
-	shr.u32 	%r109, %r108, 4;
-	mul.lo.s32 	%r110, %r109, 24;
-	sub.s32 	%r111, %r107, %r110;
-	cvt.u8.u32 	%rc14, %r111;
-	mul.hi.u32 	%r112, %r486, -1431655765;
-	shr.u32 	%r113, %r112, 14;
-	cvt.u8.u32 	%rc15, %r113;
-	shl.b32 	%r114, %r486, 3;
-	and.b32  	%r115, %r114, 24;
-	cvt.u8.u32 	%rc16, %r115;
-	st.global.v4.u8 	[%r8], {%rc15, %rc14, %rc13, %rc16};
-
-BB2_4:
-	ld.param.u32 	%r502, [histo_intermediates_kernel_param_3];
-	shl.b32 	%r116, %r502, 1;
-	add.s32 	%r117, %r4, %r116;
-	shl.b32 	%r118, %r117, 2;
-	ld.param.u32 	%r518, [histo_intermediates_kernel_param_5];
-	add.s32 	%r13, %r518, %r118;
-	add.s32 	%r14, %r10, %r9;
-	ld.global.v2.u32 	{%r483, %r484}, [%r14];
-	shr.u32 	%r120, %r483, 2;
-	cvt.u8.u32 	%rc17, %r120;
-	shr.u32 	%r121, %r483, 10;
-	mul.hi.u32 	%r122, %r121, -1431655765;
-	shr.u32 	%r123, %r122, 4;
-	mul.lo.s32 	%r124, %r123, 24;
-	sub.s32 	%r125, %r121, %r124;
-	cvt.u8.u32 	%rc18, %r125;
-	mul.hi.u32 	%r126, %r483, -1431655765;
-	shr.u32 	%r127, %r126, 14;
-	cvt.u8.u32 	%rc19, %r127;
-	shl.b32 	%r128, %r483, 3;
-	and.b32  	%r129, %r128, 24;
-	cvt.u8.u32 	%rc20, %r129;
-	add.s32 	%r15, %r12, %r11;
-	st.global.v4.u8 	[%r15], {%rc19, %rc18, %rc17, %rc20};
-	@%p1 bra 	BB2_6;
-
-	shr.u32 	%r131, %r484, 2;
-	cvt.u8.u32 	%rc21, %r131;
-	shr.u32 	%r132, %r484, 10;
-	mul.hi.u32 	%r133, %r132, -1431655765;
-	shr.u32 	%r134, %r133, 4;
-	mul.lo.s32 	%r135, %r134, 24;
-	sub.s32 	%r136, %r132, %r135;
-	cvt.u8.u32 	%rc22, %r136;
-	mul.hi.u32 	%r137, %r484, -1431655765;
-	shr.u32 	%r138, %r137, 14;
-	cvt.u8.u32 	%rc23, %r138;
-	shl.b32 	%r139, %r484, 3;
-	and.b32  	%r140, %r139, 24;
-	cvt.u8.u32 	%rc24, %r140;
-	st.global.v4.u8 	[%r13], {%rc23, %rc22, %rc21, %rc24};
-
-BB2_6:
-	ld.param.u32 	%r501, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r141, %r501, 3, %r4;
-	shl.b32 	%r142, %r141, 2;
-	ld.param.u32 	%r517, [histo_intermediates_kernel_param_5];
-	add.s32 	%r16, %r517, %r142;
-	add.s32 	%r17, %r14, %r9;
-	ld.global.v2.u32 	{%r481, %r482}, [%r17];
-	shr.u32 	%r144, %r481, 2;
-	cvt.u8.u32 	%rc25, %r144;
-	shr.u32 	%r145, %r481, 10;
-	mul.hi.u32 	%r146, %r145, -1431655765;
-	shr.u32 	%r147, %r146, 4;
-	mul.lo.s32 	%r148, %r147, 24;
-	sub.s32 	%r149, %r145, %r148;
-	cvt.u8.u32 	%rc26, %r149;
-	mul.hi.u32 	%r150, %r481, -1431655765;
-	shr.u32 	%r151, %r150, 14;
-	cvt.u8.u32 	%rc27, %r151;
-	shl.b32 	%r152, %r481, 3;
-	and.b32  	%r153, %r152, 24;
-	cvt.u8.u32 	%rc28, %r153;
-	add.s32 	%r18, %r15, %r11;
-	st.global.v4.u8 	[%r18], {%rc27, %rc26, %rc25, %rc28};
-	@%p1 bra 	BB2_8;
-
-	shr.u32 	%r155, %r482, 2;
-	cvt.u8.u32 	%rc29, %r155;
-	shr.u32 	%r156, %r482, 10;
-	mul.hi.u32 	%r157, %r156, -1431655765;
-	shr.u32 	%r158, %r157, 4;
-	mul.lo.s32 	%r159, %r158, 24;
-	sub.s32 	%r160, %r156, %r159;
-	cvt.u8.u32 	%rc30, %r160;
-	mul.hi.u32 	%r161, %r482, -1431655765;
-	shr.u32 	%r162, %r161, 14;
-	cvt.u8.u32 	%rc31, %r162;
-	shl.b32 	%r163, %r482, 3;
-	and.b32  	%r164, %r163, 24;
-	cvt.u8.u32 	%rc32, %r164;
-	st.global.v4.u8 	[%r16], {%rc31, %rc30, %rc29, %rc32};
-
-BB2_8:
-	ld.param.u32 	%r500, [histo_intermediates_kernel_param_3];
-	shl.b32 	%r165, %r500, 2;
-	add.s32 	%r166, %r4, %r165;
-	shl.b32 	%r167, %r166, 2;
-	ld.param.u32 	%r516, [histo_intermediates_kernel_param_5];
-	add.s32 	%r19, %r516, %r167;
-	add.s32 	%r20, %r17, %r9;
-	ld.global.v2.u32 	{%r479, %r480}, [%r20];
-	shr.u32 	%r169, %r479, 2;
-	cvt.u8.u32 	%rc33, %r169;
-	shr.u32 	%r170, %r479, 10;
-	mul.hi.u32 	%r171, %r170, -1431655765;
-	shr.u32 	%r172, %r171, 4;
-	mul.lo.s32 	%r173, %r172, 24;
-	sub.s32 	%r174, %r170, %r173;
-	cvt.u8.u32 	%rc34, %r174;
-	mul.hi.u32 	%r175, %r479, -1431655765;
-	shr.u32 	%r176, %r175, 14;
-	cvt.u8.u32 	%rc35, %r176;
-	shl.b32 	%r177, %r479, 3;
-	and.b32  	%r178, %r177, 24;
-	cvt.u8.u32 	%rc36, %r178;
-	add.s32 	%r21, %r18, %r11;
-	st.global.v4.u8 	[%r21], {%rc35, %rc34, %rc33, %rc36};
-	@%p1 bra 	BB2_10;
-
-	shr.u32 	%r180, %r480, 2;
-	cvt.u8.u32 	%rc37, %r180;
-	shr.u32 	%r181, %r480, 10;
-	mul.hi.u32 	%r182, %r181, -1431655765;
-	shr.u32 	%r183, %r182, 4;
-	mul.lo.s32 	%r184, %r183, 24;
-	sub.s32 	%r185, %r181, %r184;
-	cvt.u8.u32 	%rc38, %r185;
-	mul.hi.u32 	%r186, %r480, -1431655765;
-	shr.u32 	%r187, %r186, 14;
-	cvt.u8.u32 	%rc39, %r187;
-	shl.b32 	%r188, %r480, 3;
-	and.b32  	%r189, %r188, 24;
-	cvt.u8.u32 	%rc40, %r189;
-	st.global.v4.u8 	[%r19], {%rc39, %rc38, %rc37, %rc40};
-
-BB2_10:
-	ld.param.u32 	%r499, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r190, %r499, 5, %r4;
-	shl.b32 	%r191, %r190, 2;
-	ld.param.u32 	%r515, [histo_intermediates_kernel_param_5];
-	add.s32 	%r22, %r515, %r191;
-	add.s32 	%r23, %r20, %r9;
-	ld.global.v2.u32 	{%r477, %r478}, [%r23];
-	shr.u32 	%r193, %r477, 2;
-	cvt.u8.u32 	%rc41, %r193;
-	shr.u32 	%r194, %r477, 10;
-	mul.hi.u32 	%r195, %r194, -1431655765;
-	shr.u32 	%r196, %r195, 4;
-	mul.lo.s32 	%r197, %r196, 24;
-	sub.s32 	%r198, %r194, %r197;
-	cvt.u8.u32 	%rc42, %r198;
-	mul.hi.u32 	%r199, %r477, -1431655765;
-	shr.u32 	%r200, %r199, 14;
-	cvt.u8.u32 	%rc43, %r200;
-	shl.b32 	%r201, %r477, 3;
-	and.b32  	%r202, %r201, 24;
-	cvt.u8.u32 	%rc44, %r202;
-	add.s32 	%r24, %r21, %r11;
-	st.global.v4.u8 	[%r24], {%rc43, %rc42, %rc41, %rc44};
-	@%p1 bra 	BB2_12;
-
-	shr.u32 	%r204, %r478, 2;
-	cvt.u8.u32 	%rc45, %r204;
-	shr.u32 	%r205, %r478, 10;
-	mul.hi.u32 	%r206, %r205, -1431655765;
-	shr.u32 	%r207, %r206, 4;
-	mul.lo.s32 	%r208, %r207, 24;
-	sub.s32 	%r209, %r205, %r208;
-	cvt.u8.u32 	%rc46, %r209;
-	mul.hi.u32 	%r210, %r478, -1431655765;
-	shr.u32 	%r211, %r210, 14;
-	cvt.u8.u32 	%rc47, %r211;
-	shl.b32 	%r212, %r478, 3;
-	and.b32  	%r213, %r212, 24;
-	cvt.u8.u32 	%rc48, %r213;
-	st.global.v4.u8 	[%r22], {%rc47, %rc46, %rc45, %rc48};
-
-BB2_12:
-	ld.param.u32 	%r498, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r214, %r498, 6, %r4;
-	shl.b32 	%r215, %r214, 2;
-	ld.param.u32 	%r514, [histo_intermediates_kernel_param_5];
-	add.s32 	%r25, %r514, %r215;
-	add.s32 	%r26, %r23, %r9;
-	ld.global.v2.u32 	{%r475, %r476}, [%r26];
-	shr.u32 	%r217, %r475, 2;
-	cvt.u8.u32 	%rc49, %r217;
-	shr.u32 	%r218, %r475, 10;
-	mul.hi.u32 	%r219, %r218, -1431655765;
-	shr.u32 	%r220, %r219, 4;
-	mul.lo.s32 	%r221, %r220, 24;
-	sub.s32 	%r222, %r218, %r221;
-	cvt.u8.u32 	%rc50, %r222;
-	mul.hi.u32 	%r223, %r475, -1431655765;
-	shr.u32 	%r224, %r223, 14;
-	cvt.u8.u32 	%rc51, %r224;
-	shl.b32 	%r225, %r475, 3;
-	and.b32  	%r226, %r225, 24;
-	cvt.u8.u32 	%rc52, %r226;
-	add.s32 	%r27, %r24, %r11;
-	st.global.v4.u8 	[%r27], {%rc51, %rc50, %rc49, %rc52};
-	@%p1 bra 	BB2_14;
-
-	shr.u32 	%r228, %r476, 2;
-	cvt.u8.u32 	%rc53, %r228;
-	shr.u32 	%r229, %r476, 10;
-	mul.hi.u32 	%r230, %r229, -1431655765;
-	shr.u32 	%r231, %r230, 4;
-	mul.lo.s32 	%r232, %r231, 24;
-	sub.s32 	%r233, %r229, %r232;
-	cvt.u8.u32 	%rc54, %r233;
-	mul.hi.u32 	%r234, %r476, -1431655765;
-	shr.u32 	%r235, %r234, 14;
-	cvt.u8.u32 	%rc55, %r235;
-	shl.b32 	%r236, %r476, 3;
-	and.b32  	%r237, %r236, 24;
-	cvt.u8.u32 	%rc56, %r237;
-	st.global.v4.u8 	[%r25], {%rc55, %rc54, %rc53, %rc56};
-
-BB2_14:
-	ld.param.u32 	%r497, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r238, %r497, 7, %r4;
-	shl.b32 	%r239, %r238, 2;
-	ld.param.u32 	%r513, [histo_intermediates_kernel_param_5];
-	add.s32 	%r28, %r513, %r239;
-	add.s32 	%r29, %r26, %r9;
-	ld.global.v2.u32 	{%r473, %r474}, [%r29];
-	shr.u32 	%r241, %r473, 2;
-	cvt.u8.u32 	%rc57, %r241;
-	shr.u32 	%r242, %r473, 10;
-	mul.hi.u32 	%r243, %r242, -1431655765;
-	shr.u32 	%r244, %r243, 4;
-	mul.lo.s32 	%r245, %r244, 24;
-	sub.s32 	%r246, %r242, %r245;
-	cvt.u8.u32 	%rc58, %r246;
-	mul.hi.u32 	%r247, %r473, -1431655765;
-	shr.u32 	%r248, %r247, 14;
-	cvt.u8.u32 	%rc59, %r248;
-	shl.b32 	%r249, %r473, 3;
-	and.b32  	%r250, %r249, 24;
-	cvt.u8.u32 	%rc60, %r250;
-	add.s32 	%r30, %r27, %r11;
-	st.global.v4.u8 	[%r30], {%rc59, %rc58, %rc57, %rc60};
-	@%p1 bra 	BB2_16;
-
-	shr.u32 	%r252, %r474, 2;
-	cvt.u8.u32 	%rc61, %r252;
-	shr.u32 	%r253, %r474, 10;
-	mul.hi.u32 	%r254, %r253, -1431655765;
-	shr.u32 	%r255, %r254, 4;
-	mul.lo.s32 	%r256, %r255, 24;
-	sub.s32 	%r257, %r253, %r256;
-	cvt.u8.u32 	%rc62, %r257;
-	mul.hi.u32 	%r258, %r474, -1431655765;
-	shr.u32 	%r259, %r258, 14;
-	cvt.u8.u32 	%rc63, %r259;
-	shl.b32 	%r260, %r474, 3;
-	and.b32  	%r261, %r260, 24;
-	cvt.u8.u32 	%rc64, %r261;
-	st.global.v4.u8 	[%r28], {%rc63, %rc62, %rc61, %rc64};
-
-BB2_16:
-	ld.param.u32 	%r496, [histo_intermediates_kernel_param_3];
-	shl.b32 	%r262, %r496, 3;
-	add.s32 	%r263, %r4, %r262;
-	shl.b32 	%r264, %r263, 2;
-	ld.param.u32 	%r512, [histo_intermediates_kernel_param_5];
-	add.s32 	%r31, %r512, %r264;
-	add.s32 	%r32, %r29, %r9;
-	ld.global.v2.u32 	{%r471, %r472}, [%r32];
-	shr.u32 	%r266, %r471, 2;
-	cvt.u8.u32 	%rc65, %r266;
-	shr.u32 	%r267, %r471, 10;
-	mul.hi.u32 	%r268, %r267, -1431655765;
-	shr.u32 	%r269, %r268, 4;
-	mul.lo.s32 	%r270, %r269, 24;
-	sub.s32 	%r271, %r267, %r270;
-	cvt.u8.u32 	%rc66, %r271;
-	mul.hi.u32 	%r272, %r471, -1431655765;
-	shr.u32 	%r273, %r272, 14;
-	cvt.u8.u32 	%rc67, %r273;
-	shl.b32 	%r274, %r471, 3;
-	and.b32  	%r275, %r274, 24;
-	cvt.u8.u32 	%rc68, %r275;
-	add.s32 	%r33, %r30, %r11;
-	st.global.v4.u8 	[%r33], {%rc67, %rc66, %rc65, %rc68};
-	@%p1 bra 	BB2_18;
-
-	shr.u32 	%r277, %r472, 2;
-	cvt.u8.u32 	%rc69, %r277;
-	shr.u32 	%r278, %r472, 10;
-	mul.hi.u32 	%r279, %r278, -1431655765;
-	shr.u32 	%r280, %r279, 4;
-	mul.lo.s32 	%r281, %r280, 24;
-	sub.s32 	%r282, %r278, %r281;
-	cvt.u8.u32 	%rc70, %r282;
-	mul.hi.u32 	%r283, %r472, -1431655765;
-	shr.u32 	%r284, %r283, 14;
-	cvt.u8.u32 	%rc71, %r284;
-	shl.b32 	%r285, %r472, 3;
-	and.b32  	%r286, %r285, 24;
-	cvt.u8.u32 	%rc72, %r286;
-	st.global.v4.u8 	[%r31], {%rc71, %rc70, %rc69, %rc72};
-
-BB2_18:
-	ld.param.u32 	%r495, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r287, %r495, 9, %r4;
-	shl.b32 	%r288, %r287, 2;
-	ld.param.u32 	%r511, [histo_intermediates_kernel_param_5];
-	add.s32 	%r34, %r511, %r288;
-	add.s32 	%r35, %r32, %r9;
-	ld.global.v2.u32 	{%r469, %r470}, [%r35];
-	shr.u32 	%r290, %r469, 2;
-	cvt.u8.u32 	%rc73, %r290;
-	shr.u32 	%r291, %r469, 10;
-	mul.hi.u32 	%r292, %r291, -1431655765;
-	shr.u32 	%r293, %r292, 4;
-	mul.lo.s32 	%r294, %r293, 24;
-	sub.s32 	%r295, %r291, %r294;
-	cvt.u8.u32 	%rc74, %r295;
-	mul.hi.u32 	%r296, %r469, -1431655765;
-	shr.u32 	%r297, %r296, 14;
-	cvt.u8.u32 	%rc75, %r297;
-	shl.b32 	%r298, %r469, 3;
-	and.b32  	%r299, %r298, 24;
-	cvt.u8.u32 	%rc76, %r299;
-	add.s32 	%r36, %r33, %r11;
-	st.global.v4.u8 	[%r36], {%rc75, %rc74, %rc73, %rc76};
-	@%p1 bra 	BB2_20;
-
-	shr.u32 	%r301, %r470, 2;
-	cvt.u8.u32 	%rc77, %r301;
-	shr.u32 	%r302, %r470, 10;
-	mul.hi.u32 	%r303, %r302, -1431655765;
-	shr.u32 	%r304, %r303, 4;
-	mul.lo.s32 	%r305, %r304, 24;
-	sub.s32 	%r306, %r302, %r305;
-	cvt.u8.u32 	%rc78, %r306;
-	mul.hi.u32 	%r307, %r470, -1431655765;
-	shr.u32 	%r308, %r307, 14;
-	cvt.u8.u32 	%rc79, %r308;
-	shl.b32 	%r309, %r470, 3;
-	and.b32  	%r310, %r309, 24;
-	cvt.u8.u32 	%rc80, %r310;
-	st.global.v4.u8 	[%r34], {%rc79, %rc78, %rc77, %rc80};
-
-BB2_20:
-	ld.param.u32 	%r494, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r311, %r494, 10, %r4;
-	shl.b32 	%r312, %r311, 2;
-	ld.param.u32 	%r510, [histo_intermediates_kernel_param_5];
-	add.s32 	%r37, %r510, %r312;
-	add.s32 	%r38, %r35, %r9;
-	ld.global.v2.u32 	{%r467, %r468}, [%r38];
-	shr.u32 	%r314, %r467, 2;
-	cvt.u8.u32 	%rc81, %r314;
-	shr.u32 	%r315, %r467, 10;
-	mul.hi.u32 	%r316, %r315, -1431655765;
-	shr.u32 	%r317, %r316, 4;
-	mul.lo.s32 	%r318, %r317, 24;
-	sub.s32 	%r319, %r315, %r318;
-	cvt.u8.u32 	%rc82, %r319;
-	mul.hi.u32 	%r320, %r467, -1431655765;
-	shr.u32 	%r321, %r320, 14;
-	cvt.u8.u32 	%rc83, %r321;
-	shl.b32 	%r322, %r467, 3;
-	and.b32  	%r323, %r322, 24;
-	cvt.u8.u32 	%rc84, %r323;
-	add.s32 	%r39, %r36, %r11;
-	st.global.v4.u8 	[%r39], {%rc83, %rc82, %rc81, %rc84};
-	@%p1 bra 	BB2_22;
-
-	shr.u32 	%r325, %r468, 2;
-	cvt.u8.u32 	%rc85, %r325;
-	shr.u32 	%r326, %r468, 10;
-	mul.hi.u32 	%r327, %r326, -1431655765;
-	shr.u32 	%r328, %r327, 4;
-	mul.lo.s32 	%r329, %r328, 24;
-	sub.s32 	%r330, %r326, %r329;
-	cvt.u8.u32 	%rc86, %r330;
-	mul.hi.u32 	%r331, %r468, -1431655765;
-	shr.u32 	%r332, %r331, 14;
-	cvt.u8.u32 	%rc87, %r332;
-	shl.b32 	%r333, %r468, 3;
-	and.b32  	%r334, %r333, 24;
-	cvt.u8.u32 	%rc88, %r334;
-	st.global.v4.u8 	[%r37], {%rc87, %rc86, %rc85, %rc88};
-
-BB2_22:
-	ld.param.u32 	%r493, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r335, %r493, 11, %r4;
-	shl.b32 	%r336, %r335, 2;
-	ld.param.u32 	%r509, [histo_intermediates_kernel_param_5];
-	add.s32 	%r40, %r509, %r336;
-	add.s32 	%r41, %r38, %r9;
-	ld.global.v2.u32 	{%r465, %r466}, [%r41];
-	shr.u32 	%r338, %r465, 2;
-	cvt.u8.u32 	%rc89, %r338;
-	shr.u32 	%r339, %r465, 10;
-	mul.hi.u32 	%r340, %r339, -1431655765;
-	shr.u32 	%r341, %r340, 4;
-	mul.lo.s32 	%r342, %r341, 24;
-	sub.s32 	%r343, %r339, %r342;
-	cvt.u8.u32 	%rc90, %r343;
-	mul.hi.u32 	%r344, %r465, -1431655765;
-	shr.u32 	%r345, %r344, 14;
-	cvt.u8.u32 	%rc91, %r345;
-	shl.b32 	%r346, %r465, 3;
-	and.b32  	%r347, %r346, 24;
-	cvt.u8.u32 	%rc92, %r347;
-	add.s32 	%r42, %r39, %r11;
-	st.global.v4.u8 	[%r42], {%rc91, %rc90, %rc89, %rc92};
-	@%p1 bra 	BB2_24;
-
-	shr.u32 	%r349, %r466, 2;
-	cvt.u8.u32 	%rc93, %r349;
-	shr.u32 	%r350, %r466, 10;
-	mul.hi.u32 	%r351, %r350, -1431655765;
-	shr.u32 	%r352, %r351, 4;
-	mul.lo.s32 	%r353, %r352, 24;
-	sub.s32 	%r354, %r350, %r353;
-	cvt.u8.u32 	%rc94, %r354;
-	mul.hi.u32 	%r355, %r466, -1431655765;
-	shr.u32 	%r356, %r355, 14;
-	cvt.u8.u32 	%rc95, %r356;
-	shl.b32 	%r357, %r466, 3;
-	and.b32  	%r358, %r357, 24;
-	cvt.u8.u32 	%rc96, %r358;
-	st.global.v4.u8 	[%r40], {%rc95, %rc94, %rc93, %rc96};
-
-BB2_24:
-	ld.param.u32 	%r492, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r359, %r492, 12, %r4;
-	shl.b32 	%r360, %r359, 2;
-	ld.param.u32 	%r508, [histo_intermediates_kernel_param_5];
-	add.s32 	%r43, %r508, %r360;
-	add.s32 	%r44, %r41, %r9;
-	ld.global.v2.u32 	{%r463, %r464}, [%r44];
-	shr.u32 	%r362, %r463, 2;
-	cvt.u8.u32 	%rc97, %r362;
-	shr.u32 	%r363, %r463, 10;
-	mul.hi.u32 	%r364, %r363, -1431655765;
-	shr.u32 	%r365, %r364, 4;
-	mul.lo.s32 	%r366, %r365, 24;
-	sub.s32 	%r367, %r363, %r366;
-	cvt.u8.u32 	%rc98, %r367;
-	mul.hi.u32 	%r368, %r463, -1431655765;
-	shr.u32 	%r369, %r368, 14;
-	cvt.u8.u32 	%rc99, %r369;
-	shl.b32 	%r370, %r463, 3;
-	and.b32  	%r371, %r370, 24;
-	cvt.u8.u32 	%rc100, %r371;
-	add.s32 	%r45, %r42, %r11;
-	st.global.v4.u8 	[%r45], {%rc99, %rc98, %rc97, %rc100};
-	@%p1 bra 	BB2_26;
-
-	shr.u32 	%r373, %r464, 2;
-	cvt.u8.u32 	%rc101, %r373;
-	shr.u32 	%r374, %r464, 10;
-	mul.hi.u32 	%r375, %r374, -1431655765;
-	shr.u32 	%r376, %r375, 4;
-	mul.lo.s32 	%r377, %r376, 24;
-	sub.s32 	%r378, %r374, %r377;
-	cvt.u8.u32 	%rc102, %r378;
-	mul.hi.u32 	%r379, %r464, -1431655765;
-	shr.u32 	%r380, %r379, 14;
-	cvt.u8.u32 	%rc103, %r380;
-	shl.b32 	%r381, %r464, 3;
-	and.b32  	%r382, %r381, 24;
-	cvt.u8.u32 	%rc104, %r382;
-	st.global.v4.u8 	[%r43], {%rc103, %rc102, %rc101, %rc104};
-
-BB2_26:
-	ld.param.u32 	%r491, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r383, %r491, 13, %r4;
-	shl.b32 	%r384, %r383, 2;
-	ld.param.u32 	%r507, [histo_intermediates_kernel_param_5];
-	add.s32 	%r46, %r507, %r384;
-	add.s32 	%r47, %r44, %r9;
-	ld.global.v2.u32 	{%r461, %r462}, [%r47];
-	shr.u32 	%r386, %r461, 2;
-	cvt.u8.u32 	%rc105, %r386;
-	shr.u32 	%r387, %r461, 10;
-	mul.hi.u32 	%r388, %r387, -1431655765;
-	shr.u32 	%r389, %r388, 4;
-	mul.lo.s32 	%r390, %r389, 24;
-	sub.s32 	%r391, %r387, %r390;
-	cvt.u8.u32 	%rc106, %r391;
-	mul.hi.u32 	%r392, %r461, -1431655765;
-	shr.u32 	%r393, %r392, 14;
-	cvt.u8.u32 	%rc107, %r393;
-	shl.b32 	%r394, %r461, 3;
-	and.b32  	%r395, %r394, 24;
-	cvt.u8.u32 	%rc108, %r395;
-	add.s32 	%r48, %r45, %r11;
-	st.global.v4.u8 	[%r48], {%rc107, %rc106, %rc105, %rc108};
-	@%p1 bra 	BB2_28;
-
-	shr.u32 	%r397, %r462, 2;
-	cvt.u8.u32 	%rc109, %r397;
-	shr.u32 	%r398, %r462, 10;
-	mul.hi.u32 	%r399, %r398, -1431655765;
-	shr.u32 	%r400, %r399, 4;
-	mul.lo.s32 	%r401, %r400, 24;
-	sub.s32 	%r402, %r398, %r401;
-	cvt.u8.u32 	%rc110, %r402;
-	mul.hi.u32 	%r403, %r462, -1431655765;
-	shr.u32 	%r404, %r403, 14;
-	cvt.u8.u32 	%rc111, %r404;
-	shl.b32 	%r405, %r462, 3;
-	and.b32  	%r406, %r405, 24;
-	cvt.u8.u32 	%rc112, %r406;
-	st.global.v4.u8 	[%r46], {%rc111, %rc110, %rc109, %rc112};
-
-BB2_28:
-	ld.param.u32 	%r490, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r407, %r490, 14, %r4;
-	shl.b32 	%r408, %r407, 2;
-	ld.param.u32 	%r506, [histo_intermediates_kernel_param_5];
-	add.s32 	%r49, %r506, %r408;
-	add.s32 	%r50, %r47, %r9;
-	ld.global.v2.u32 	{%r459, %r460}, [%r50];
-	shr.u32 	%r410, %r459, 2;
-	cvt.u8.u32 	%rc113, %r410;
-	shr.u32 	%r411, %r459, 10;
-	mul.hi.u32 	%r412, %r411, -1431655765;
-	shr.u32 	%r413, %r412, 4;
-	mul.lo.s32 	%r414, %r413, 24;
-	sub.s32 	%r415, %r411, %r414;
-	cvt.u8.u32 	%rc114, %r415;
-	mul.hi.u32 	%r416, %r459, -1431655765;
-	shr.u32 	%r417, %r416, 14;
-	cvt.u8.u32 	%rc115, %r417;
-	shl.b32 	%r418, %r459, 3;
-	and.b32  	%r419, %r418, 24;
-	cvt.u8.u32 	%rc116, %r419;
-	add.s32 	%r51, %r48, %r11;
-	st.global.v4.u8 	[%r51], {%rc115, %rc114, %rc113, %rc116};
-	@%p1 bra 	BB2_30;
-
-	shr.u32 	%r421, %r460, 2;
-	cvt.u8.u32 	%rc117, %r421;
-	shr.u32 	%r422, %r460, 10;
-	mul.hi.u32 	%r423, %r422, -1431655765;
-	shr.u32 	%r424, %r423, 4;
-	mul.lo.s32 	%r425, %r424, 24;
-	sub.s32 	%r426, %r422, %r425;
-	cvt.u8.u32 	%rc118, %r426;
-	mul.hi.u32 	%r427, %r460, -1431655765;
-	shr.u32 	%r428, %r427, 14;
-	cvt.u8.u32 	%rc119, %r428;
-	shl.b32 	%r429, %r460, 3;
-	and.b32  	%r430, %r429, 24;
-	cvt.u8.u32 	%rc120, %r430;
-	st.global.v4.u8 	[%r49], {%rc119, %rc118, %rc117, %rc120};
-
-BB2_30:
-	ld.param.u32 	%r489, [histo_intermediates_kernel_param_3];
-	mad.lo.s32 	%r431, %r489, 15, %r4;
-	shl.b32 	%r432, %r431, 2;
-	ld.param.u32 	%r505, [histo_intermediates_kernel_param_5];
-	add.s32 	%r52, %r505, %r432;
-	add.s32 	%r433, %r50, %r9;
-	ld.global.v2.u32 	{%r457, %r458}, [%r433];
-	shr.u32 	%r435, %r457, 2;
-	cvt.u8.u32 	%rc121, %r435;
-	shr.u32 	%r436, %r457, 10;
-	mul.hi.u32 	%r437, %r436, -1431655765;
-	shr.u32 	%r438, %r437, 4;
-	mul.lo.s32 	%r439, %r438, 24;
-	sub.s32 	%r440, %r436, %r439;
-	cvt.u8.u32 	%rc122, %r440;
-	mul.hi.u32 	%r441, %r457, -1431655765;
-	shr.u32 	%r442, %r441, 14;
-	cvt.u8.u32 	%rc123, %r442;
-	shl.b32 	%r443, %r457, 3;
-	and.b32  	%r444, %r443, 24;
-	cvt.u8.u32 	%rc124, %r444;
-	add.s32 	%r445, %r51, %r11;
-	st.global.v4.u8 	[%r445], {%rc123, %rc122, %rc121, %rc124};
-	@%p1 bra 	BB2_32;
-
-	shr.u32 	%r447, %r458, 2;
-	cvt.u8.u32 	%rc125, %r447;
-	shr.u32 	%r448, %r458, 10;
-	mul.hi.u32 	%r449, %r448, -1431655765;
-	shr.u32 	%r450, %r449, 4;
-	mul.lo.s32 	%r451, %r450, 24;
-	sub.s32 	%r452, %r448, %r451;
-	cvt.u8.u32 	%rc126, %r452;
-	mul.hi.u32 	%r453, %r458, -1431655765;
-	shr.u32 	%r454, %r453, 14;
-	cvt.u8.u32 	%rc127, %r454;
-	shl.b32 	%r455, %r458, 3;
-	and.b32  	%r456, %r455, 24;
-	cvt.u8.u32 	%rc128, %r456;
-	st.global.v4.u8 	[%r52], {%rc127, %rc126, %rc125, %rc128};
-
-BB2_32:
-	ret;
-}
-
-.entry histo_main_kernel(
-	.param .u32 .ptr .global .align 4 histo_main_kernel_param_0,
-	.param .u64 histo_main_kernel_param_1,
-	.param .u32 histo_main_kernel_param_2,
-	.param .u32 histo_main_kernel_param_3,
-	.param .u32 histo_main_kernel_param_4,
-	.param .u32 histo_main_kernel_param_5,
-	.param .u32 histo_main_kernel_param_6,
-	.param .u32 .ptr .global .align 4 histo_main_kernel_param_7,
-	.param .u64 histo_main_kernel_param_8,
-	.param .u32 .ptr .global .align 4 histo_main_kernel_param_9,
-	.param .u64 histo_main_kernel_param_10,
-	.param .u32 .ptr .global .align 4 histo_main_kernel_param_11,
-	.param .u64 histo_main_kernel_param_12,
-	.param .u32 .ptr .shared .align 4 histo_main_kernel_param_13,
-	.param .u64 histo_main_kernel_param_14
-)
-{
-	.reg .pred 	%p<34>;
-	.reg .s32 	%r<247>;
-	.reg .s16 	%rc<17>;
-
-
-	ld.param.u32 	%r3, [histo_main_kernel_param_3];
-	// inline asm
-	mov.u32 	%r72, %ntid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r73, %envreg6;
-	// inline asm
-	// inline asm
-	mov.u32 	%r74, %envreg1;
-	// inline asm
-	// inline asm
-	mov.u32 	%r75, %ctaid.y;
-	// inline asm
-	add.s32 	%r81, %r74, %r3;
-	add.s32 	%r13, %r81, %r75;
-	// inline asm
-	mov.u32 	%r76, %envreg0;
-	// inline asm
-	// inline asm
-	mov.u32 	%r77, %ctaid.x;
-	// inline asm
-	add.s32 	%r82, %r77, %r76;
-	// inline asm
-	mov.u32 	%r78, %tid.x;
-	// inline asm
-	mad.lo.s32 	%r241, %r82, %r72, %r78;
-	// inline asm
-	mov.u32 	%r79, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r80, %ntid.x;
-	// inline asm
-	setp.gt.s32 	%p12, %r79, 6143;
-	mov.u32 	%r236, %r79;
-	@%p12 bra 	BB3_2;
-
-BB3_1:
-	shl.b32 	%r83, %r236, 2;
-	ld.param.u32 	%r235, [histo_main_kernel_param_13];
-	add.s32 	%r84, %r235, %r83;
-	mov.u32 	%r85, 0;
-	st.shared.u32 	[%r84], %r85;
-	add.s32 	%r236, %r236, %r80;
-	setp.lt.s32 	%p13, %r236, 6144;
-	@%p13 bra 	BB3_1;
-
-BB3_2:
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r86, %envreg1;
-	// inline asm
-	// inline asm
-	mov.u32 	%r87, %ctaid.y;
-	// inline asm
-	neg.s32 	%r88, %r87;
-	setp.eq.s32 	%p14, %r86, %r88;
-	ld.param.u32 	%r217, [histo_main_kernel_param_2];
-	setp.lt.u32 	%p1, %r241, %r217;
-	@%p14 bra 	BB3_20;
-
-	@!%p1 bra 	BB3_40;
-
-	mul.lo.s32 	%r19, %r73, %r72;
-	mov.u32 	%r237, %r92;
-	mov.u32 	%r238, %r93;
-	mov.u32 	%r239, %r94;
-	mov.u32 	%r242, %r241;
-
-BB3_5:
-	mov.u32 	%r20, %r242;
-	add.s32 	%r24, %r20, %r19;
-	shl.b32 	%r95, %r20, 2;
-	ld.param.u32 	%r214, [histo_main_kernel_param_0];
-	add.s32 	%r96, %r214, %r95;
-	ld.global.v4.u8 	{%rc13, %rc14, %rc15, %rc16}, [%r96];
-	cvt.u32.u8 	%r25, %rc13;
-	cvt.u32.u8 	%r26, %rc14;
-	cvt.u32.u8 	%r27, %rc15;
-	cvt.u32.u8 	%r28, %rc16;
-	setp.ne.s32 	%p15, %r25, %r13;
-	@%p15 bra 	BB3_19;
-
-	and.b32  	%r97, %r28, 31;
-	mov.u32 	%r98, 1;
-	shl.b32 	%r99, %r98, %r97;
-	shl.b32 	%r100, %r26, 10;
-	ld.param.u32 	%r234, [histo_main_kernel_param_13];
-	add.s32 	%r101, %r234, %r100;
-	shl.b32 	%r102, %r27, 2;
-	add.s32 	%r103, %r101, %r102;
-	atom.shared.add.u32 	%r29, [%r103], %r99;
-	shr.u32 	%r104, %r29, %r97;
-	and.b32  	%r105, %r104, 255;
-	setp.eq.s32 	%p16, %r105, 255;
-	@%p16 bra 	BB3_7;
-	bra.uni 	BB3_19;
-
-BB3_7:
-	mad.lo.s32 	%r107, %r25, 24576, %r100;
-	add.s32 	%r109, %r107, %r102;
-	shr.u32 	%r110, %r28, 3;
-	add.s32 	%r30, %r109, %r110;
-	add.s32 	%r111, %r28, 8;
-	and.b32  	%r112, %r111, 31;
-	shr.u32 	%r113, %r29, %r112;
-	and.b32  	%r114, %r113, 255;
-	add.s32 	%r115, %r28, 16;
-	and.b32  	%r116, %r115, 31;
-	shr.u32 	%r117, %r29, %r116;
-	and.b32  	%r118, %r117, 255;
-	add.s32 	%r119, %r28, 24;
-	and.b32  	%r120, %r119, 31;
-	shr.u32 	%r121, %r29, %r120;
-	and.b32  	%r31, %r121, 255;
-	setp.eq.s32 	%p2, %r114, 255;
-	{
-	.reg .u16 	%temp1;
-	.reg .u16 	%temp2;
-	cvt.u16.u8 	%temp1, %rc16;
-	mov.b16 	%temp2, 16;
-	cvt.u16.u8 	%temp2, %temp2;
-	setp.lt.u16 	%p17, %temp1, %temp2;
-	}
-	and.pred  	%p3, %p17, %p2;
-	setp.eq.s32 	%p4, %r118, 255;
-	{
-	.reg .u16 	%temp1;
-	.reg .u16 	%temp2;
-	cvt.u16.u8 	%temp1, %rc16;
-	mov.b16 	%temp2, 8;
-	cvt.u16.u8 	%temp2, %temp2;
-	setp.lt.u16 	%p18, %temp1, %temp2;
-	}
-	and.pred  	%p5, %p18, %p4;
-	{
-	.reg .u16 	%temp1;
-	.reg .u16 	%temp2;
-	cvt.u16.u8 	%temp1, %rc16;
-	mov.b16 	%temp2, 24;
-	cvt.u16.u8 	%temp2, %temp2;
-	setp.lt.u16 	%p6, %temp1, %temp2;
-	}
-	@%p6 bra 	BB3_8;
-	bra.uni 	BB3_9;
-
-BB3_8:
-	selp.b32 	%r237, 255, -1, %p2;
-
-BB3_9:
-	@%p3 bra 	BB3_10;
-	bra.uni 	BB3_11;
-
-BB3_10:
-	selp.b32 	%r238, 255, -1, %p4;
-
-BB3_11:
-	@%p5 bra 	BB3_12;
-	bra.uni 	BB3_13;
-
-BB3_12:
-	setp.eq.s32 	%p19, %r31, 255;
-	selp.b32 	%r239, 255, -1, %p19;
-
-BB3_13:
-	shl.b32 	%r122, %r30, 2;
-	ld.param.u32 	%r231, [histo_main_kernel_param_11];
-	add.s32 	%r123, %r231, %r122;
-	atom.global.add.u32 	%r124, [%r123], 256;
-	@!%p6 bra 	BB3_15;
-
-	ld.param.u32 	%r230, [histo_main_kernel_param_11];
-	add.s32 	%r126, %r122, %r230;
-	add.s32 	%r127, %r126, 4;
-	atom.global.add.u32 	%r128, [%r127], %r237;
-
-BB3_15:
-	@!%p3 bra 	BB3_17;
-
-	ld.param.u32 	%r229, [histo_main_kernel_param_11];
-	add.s32 	%r130, %r122, %r229;
-	add.s32 	%r131, %r130, 8;
-	atom.global.add.u32 	%r132, [%r131], %r238;
-
-BB3_17:
-	@%p5 bra 	BB3_18;
-	bra.uni 	BB3_19;
-
-BB3_18:
-	ld.param.u32 	%r228, [histo_main_kernel_param_11];
-	add.s32 	%r134, %r122, %r228;
-	add.s32 	%r135, %r134, 12;
-	atom.global.add.u32 	%r136, [%r135], %r239;
-
-BB3_19:
-	ld.param.u32 	%r216, [histo_main_kernel_param_2];
-	setp.lt.u32 	%p20, %r24, %r216;
-	mov.u32 	%r242, %r24;
-	@%p20 bra 	BB3_5;
-	bra.uni 	BB3_40;
-
-BB3_20:
-	@!%p1 bra 	BB3_40;
-
-	mul.lo.s32 	%r41, %r73, %r72;
-	mov.u32 	%r243, %r140;
-	mov.u32 	%r244, %r141;
-	mov.u32 	%r245, %r142;
-
-BB3_22:
-	mov.u32 	%r42, %r241;
-	add.s32 	%r46, %r42, %r41;
-	shl.b32 	%r143, %r42, 2;
-	ld.param.u32 	%r213, [histo_main_kernel_param_0];
-	add.s32 	%r144, %r213, %r143;
-	ld.global.v4.u8 	{%rc9, %rc10, %rc11, %rc12}, [%r144];
-	cvt.u32.u8 	%r47, %rc9;
-	cvt.u32.u8 	%r48, %rc10;
-	cvt.u32.u8 	%r49, %rc11;
-	cvt.u32.u8 	%r50, %rc12;
-	setp.ne.s32 	%p21, %r47, %r13;
-	@%p21 bra 	BB3_36;
-
-	and.b32  	%r145, %r50, 31;
-	mov.u32 	%r146, 1;
-	shl.b32 	%r147, %r146, %r145;
-	shl.b32 	%r148, %r48, 10;
-	ld.param.u32 	%r233, [histo_main_kernel_param_13];
-	add.s32 	%r149, %r233, %r148;
-	shl.b32 	%r150, %r49, 2;
-	add.s32 	%r151, %r149, %r150;
-	atom.shared.add.u32 	%r51, [%r151], %r147;
-	shr.u32 	%r152, %r51, %r145;
-	and.b32  	%r153, %r152, 255;
-	setp.eq.s32 	%p22, %r153, 255;
-	@%p22 bra 	BB3_24;
-	bra.uni 	BB3_36;
-
-BB3_24:
-	mad.lo.s32 	%r155, %r47, 24576, %r148;
-	add.s32 	%r157, %r155, %r150;
-	shr.u32 	%r158, %r50, 3;
-	add.s32 	%r52, %r157, %r158;
-	add.s32 	%r159, %r50, 8;
-	and.b32  	%r160, %r159, 31;
-	shr.u32 	%r161, %r51, %r160;
-	and.b32  	%r162, %r161, 255;
-	add.s32 	%r163, %r50, 16;
-	and.b32  	%r164, %r163, 31;
-	shr.u32 	%r165, %r51, %r164;
-	and.b32  	%r166, %r165, 255;
-	add.s32 	%r167, %r50, 24;
-	and.b32  	%r168, %r167, 31;
-	shr.u32 	%r169, %r51, %r168;
-	and.b32  	%r53, %r169, 255;
-	setp.eq.s32 	%p7, %r162, 255;
-	{
-	.reg .u16 	%temp1;
-	.reg .u16 	%temp2;
-	cvt.u16.u8 	%temp1, %rc12;
-	mov.b16 	%temp2, 16;
-	cvt.u16.u8 	%temp2, %temp2;
-	setp.lt.u16 	%p23, %temp1, %temp2;
-	}
-	and.pred  	%p8, %p23, %p7;
-	setp.eq.s32 	%p9, %r166, 255;
-	{
-	.reg .u16 	%temp1;
-	.reg .u16 	%temp2;
-	cvt.u16.u8 	%temp1, %rc12;
-	mov.b16 	%temp2, 8;
-	cvt.u16.u8 	%temp2, %temp2;
-	setp.lt.u16 	%p24, %temp1, %temp2;
-	}
-	and.pred  	%p10, %p24, %p9;
-	{
-	.reg .u16 	%temp1;
-	.reg .u16 	%temp2;
-	cvt.u16.u8 	%temp1, %rc12;
-	mov.b16 	%temp2, 24;
-	cvt.u16.u8 	%temp2, %temp2;
-	setp.lt.u16 	%p11, %temp1, %temp2;
-	}
-	@%p11 bra 	BB3_25;
-	bra.uni 	BB3_26;
-
-BB3_25:
-	selp.b32 	%r243, 255, -1, %p7;
-
-BB3_26:
-	@%p8 bra 	BB3_27;
-	bra.uni 	BB3_28;
-
-BB3_27:
-	selp.b32 	%r244, 255, -1, %p9;
-
-BB3_28:
-	@%p10 bra 	BB3_29;
-	bra.uni 	BB3_30;
-
-BB3_29:
-	setp.eq.s32 	%p25, %r53, 255;
-	selp.b32 	%r245, 255, -1, %p25;
-
-BB3_30:
-	shl.b32 	%r170, %r52, 2;
-	ld.param.u32 	%r227, [histo_main_kernel_param_11];
-	add.s32 	%r171, %r227, %r170;
-	atom.global.add.u32 	%r172, [%r171], 256;
-	@!%p11 bra 	BB3_32;
-
-	ld.param.u32 	%r226, [histo_main_kernel_param_11];
-	add.s32 	%r174, %r170, %r226;
-	add.s32 	%r175, %r174, 4;
-	atom.global.add.u32 	%r176, [%r175], %r243;
-
-BB3_32:
-	@!%p8 bra 	BB3_34;
-
-	ld.param.u32 	%r225, [histo_main_kernel_param_11];
-	add.s32 	%r178, %r170, %r225;
-	add.s32 	%r179, %r178, 8;
-	atom.global.add.u32 	%r180, [%r179], %r244;
-
-BB3_34:
-	@%p10 bra 	BB3_35;
-	bra.uni 	BB3_36;
-
-BB3_35:
-	ld.param.u32 	%r224, [histo_main_kernel_param_11];
-	add.s32 	%r182, %r170, %r224;
-	add.s32 	%r183, %r182, 12;
-	atom.global.add.u32 	%r184, [%r183], %r245;
-
-BB3_36:
-	ld.param.u32 	%r219, [histo_main_kernel_param_4];
-	setp.gt.u32 	%p26, %r47, %r219;
-	ld.param.u32 	%r218, [histo_main_kernel_param_3];
-	setp.lt.u32 	%p27, %r47, %r218;
-	or.pred  	%p28, %p27, %p26;
-	@!%p28 bra 	BB3_39;
-
-	shl.b32 	%r185, %r48, 10;
-	mad.lo.s32 	%r186, %r47, 24576, %r185;
-	shl.b32 	%r187, %r49, 2;
-	add.s32 	%r188, %r186, %r187;
-	shr.u32 	%r189, %r50, 3;
-	add.s32 	%r190, %r188, %r189;
-	{
-	.reg .b32 temp;
-	and.b32	 temp, %r190, 1;
-	setp.b32.eq 	 %p29, temp, 1;
-	}
-	and.b32  	%r191, %r190, 2147483646;
-	shl.b32 	%r192, %r191, 1;
-	ld.param.u32 	%r223, [histo_main_kernel_param_9];
-	add.s32 	%r63, %r223, %r192;
-	selp.b32 	%r64, 16, 0, %p29;
-	ld.global.u32 	%r193, [%r63];
-	shr.u32 	%r194, %r193, %r64;
-	and.b32  	%r195, %r194, 65535;
-	setp.gt.u32 	%p30, %r195, 254;
-	@%p30 bra 	BB3_39;
-
-	mov.u32 	%r196, 1;
-	shl.b32 	%r197, %r196, %r64;
-	atom.global.add.u32 	%r198, [%r63], %r197;
-
-BB3_39:
-	ld.param.u32 	%r215, [histo_main_kernel_param_2];
-	setp.lt.u32 	%p31, %r46, %r215;
-	mov.u32 	%r241, %r46;
-	@%p31 bra 	BB3_22;
-
-BB3_40:
-	// inline asm
-	mov.u32 	%r199, %envreg0;
-	// inline asm
-	// inline asm
-	mov.u32 	%r200, %ctaid.x;
-	// inline asm
-	add.s32 	%r201, %r200, %r199;
-	ld.param.u32 	%r220, [histo_main_kernel_param_5];
-	ld.param.u32 	%r221, [histo_main_kernel_param_6];
-	mul.lo.s32 	%r202, %r221, %r220;
-	shr.u32 	%r203, %r202, 2;
-	mul.lo.s32 	%r65, %r201, %r203;
-	mul.lo.s32 	%r204, %r13, 24576;
-	shr.u32 	%r66, %r204, 2;
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r205, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r206, %ntid.x;
-	// inline asm
-	setp.gt.s32 	%p32, %r205, 6143;
-	mov.u32 	%r246, %r205;
-	@%p32 bra 	BB3_43;
-
-	add.s32 	%r69, %r65, %r66;
-
-BB3_42:
-	add.s32 	%r207, %r69, %r246;
-	shl.b32 	%r208, %r207, 2;
-	ld.param.u32 	%r222, [histo_main_kernel_param_7];
-	add.s32 	%r209, %r222, %r208;
-	shl.b32 	%r210, %r246, 2;
-	ld.param.u32 	%r232, [histo_main_kernel_param_13];
-	add.s32 	%r211, %r232, %r210;
-	ld.shared.u32 	%r212, [%r211];
-	st.global.u32 	[%r209], %r212;
-	add.s32 	%r246, %r246, %r206;
-	setp.lt.s32 	%p33, %r246, 6144;
-	@%p33 bra 	BB3_42;
-
-BB3_43:
-	ret;
-}
-
-.entry histo_final_kernel(
-	.param .u32 histo_final_kernel_param_0,
-	.param .u32 histo_final_kernel_param_1,
-	.param .u32 histo_final_kernel_param_2,
-	.param .u32 histo_final_kernel_param_3,
-	.param .u32 .ptr .global .align 4 histo_final_kernel_param_4,
-	.param .u64 histo_final_kernel_param_5,
-	.param .u32 .ptr .global .align 4 histo_final_kernel_param_6,
-	.param .u64 histo_final_kernel_param_7,
-	.param .u32 .ptr .global .align 4 histo_final_kernel_param_8,
-	.param .u64 histo_final_kernel_param_9,
-	.param .u32 .ptr .global .align 4 histo_final_kernel_param_10,
-	.param .u64 histo_final_kernel_param_11
-)
-{
-	.reg .s16 	%rs<43>;
-	.reg .pred 	%p<7>;
-	.reg .s32 	%r<302>;
-	.reg .s16 	%rc<25>;
-
-
-	ld.param.u32 	%r36, [histo_final_kernel_param_0];
-	ld.param.u32 	%r37, [histo_final_kernel_param_1];
-	// inline asm
-	mov.u32 	%r31, %ntid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r32, %envreg6;
-	// inline asm
-	// inline asm
-	mov.u32 	%r33, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r34, %envreg0;
-	// inline asm
-	// inline asm
-	mov.u32 	%r35, %ctaid.x;
-	// inline asm
-	add.s32 	%r38, %r35, %r34;
-	mad.lo.s32 	%r9, %r38, %r31, %r33;
-	sub.s32 	%r39, %r37, %r36;
-	mad.lo.s32 	%r10, %r39, 24576, 24576;
-	mul.lo.s32 	%r11, %r36, 24576;
-	shr.u32 	%r12, %r11, 2;
-	setp.ge.u32 	%p1, %r9, %r12;
-	@%p1 bra 	BB4_3;
-
-	mul.lo.s32 	%r13, %r32, %r31;
-	mov.u32 	%r299, %r9;
-
-BB4_2:
-	mov.u32 	%r14, %r299;
-	shl.b32 	%r40, %r14, 3;
-	ld.param.u32 	%r294, [histo_final_kernel_param_6];
-	add.s32 	%r41, %r294, %r40;
-	ld.global.v4.u16 	{%rs35, %rs36, %rs37, %rs38}, [%r41];
-	mov.u16 	%rs13, 0;
-	st.global.v4.u16 	[%r41], {%rs13, %rs13, %rs13, %rs13};
-	mov.u16 	%rs12, 255;
-	// inline asm
-	min.u16 	%rs1, %rs35, %rs12;
-	// inline asm
-	// inline asm
-	min.u16 	%rs4, %rs36, %rs12;
-	// inline asm
-	// inline asm
-	min.u16 	%rs7, %rs37, %rs12;
-	// inline asm
-	// inline asm
-	min.u16 	%rs10, %rs38, %rs12;
-	// inline asm
-	cvt.u8.u16 	%rc1, %rs1;
-	cvt.u8.u16 	%rc2, %rs4;
-	cvt.u8.u16 	%rc3, %rs7;
-	cvt.u8.u16 	%rc4, %rs10;
-	shl.b32 	%r42, %r14, 2;
-	ld.param.u32 	%r298, [histo_final_kernel_param_10];
-	add.s32 	%r43, %r298, %r42;
-	st.global.v4.u8 	[%r43], {%rc1, %rc2, %rc3, %rc4};
-	add.s32 	%r15, %r14, %r13;
-	setp.lt.u32 	%p2, %r15, %r12;
-	mov.u32 	%r299, %r15;
-	@%p2 bra 	BB4_2;
-
-BB4_3:
-	add.s32 	%r300, %r9, %r12;
-	add.s32 	%r44, %r10, %r11;
-	shr.u32 	%r17, %r44, 2;
-	setp.ge.u32 	%p3, %r300, %r17;
-	@%p3 bra 	BB4_6;
-
-	ld.param.u32 	%r288, [histo_final_kernel_param_2];
-	ld.param.u32 	%r291, [histo_final_kernel_param_3];
-	mul.lo.s32 	%r45, %r291, %r288;
-	mul.lo.s32 	%r46, %r45, 3;
-	shr.u32 	%r18, %r46, 2;
-	mul.lo.s32 	%r47, %r45, 9;
-	shr.u32 	%r19, %r47, 2;
-	mul.lo.s32 	%r48, %r45, 10;
-	shr.u32 	%r20, %r48, 2;
-	mul.lo.s32 	%r49, %r45, 11;
-	shr.u32 	%r21, %r49, 2;
-	mul.lo.s32 	%r50, %r45, 12;
-	shr.u32 	%r22, %r50, 2;
-	mul.lo.s32 	%r51, %r45, 13;
-	shr.u32 	%r23, %r51, 2;
-
-BB4_5:
-	shl.b32 	%r64, %r300, 4;
-	ld.param.u32 	%r295, [histo_final_kernel_param_8];
-	add.s32 	%r65, %r295, %r64;
-	ld.global.v4.u32 	{%r278, %r279, %r280, %r281}, [%r65];
-	mov.u32 	%r66, 0;
-	st.global.v4.u32 	[%r65], {%r66, %r66, %r66, %r66};
-	shl.b32 	%r67, %r300, 2;
-	ld.param.u32 	%r292, [histo_final_kernel_param_4];
-	add.s32 	%r68, %r292, %r67;
-	ld.global.u32 	%r69, [%r68];
-	and.b32  	%r70, %r69, 255;
-	mov.u32 	%r63, 255;
-	add.s32 	%r72, %r70, %r278;
-	shr.u32 	%r73, %r69, 8;
-	and.b32  	%r74, %r73, 255;
-	add.s32 	%r76, %r74, %r279;
-	shr.u32 	%r77, %r69, 16;
-	and.b32  	%r78, %r77, 255;
-	add.s32 	%r80, %r78, %r280;
-	shr.u32 	%r81, %r69, 24;
-	add.s32 	%r83, %r81, %r281;
-	ld.param.u32 	%r287, [histo_final_kernel_param_2];
-	ld.param.u32 	%r290, [histo_final_kernel_param_3];
-	mul.lo.s32 	%r84, %r290, %r287;
-	shr.u32 	%r85, %r84, 2;
-	add.s32 	%r86, %r85, %r300;
-	shl.b32 	%r87, %r86, 2;
-	add.s32 	%r88, %r292, %r87;
-	ld.global.u32 	%r89, [%r88];
-	and.b32  	%r90, %r89, 255;
-	add.s32 	%r91, %r90, %r72;
-	shr.u32 	%r92, %r89, 8;
-	and.b32  	%r93, %r92, 255;
-	add.s32 	%r94, %r93, %r76;
-	shr.u32 	%r95, %r89, 16;
-	and.b32  	%r96, %r95, 255;
-	add.s32 	%r97, %r96, %r80;
-	shr.u32 	%r98, %r89, 24;
-	add.s32 	%r99, %r98, %r83;
-	shr.u32 	%r100, %r84, 1;
-	add.s32 	%r101, %r100, %r300;
-	shl.b32 	%r102, %r101, 2;
-	add.s32 	%r103, %r292, %r102;
-	ld.global.u32 	%r104, [%r103];
-	and.b32  	%r105, %r104, 255;
-	add.s32 	%r106, %r105, %r91;
-	shr.u32 	%r107, %r104, 8;
-	and.b32  	%r108, %r107, 255;
-	add.s32 	%r109, %r108, %r94;
-	shr.u32 	%r110, %r104, 16;
-	and.b32  	%r111, %r110, 255;
-	add.s32 	%r112, %r111, %r97;
-	shr.u32 	%r113, %r104, 24;
-	add.s32 	%r114, %r113, %r99;
-	add.s32 	%r115, %r18, %r300;
-	shl.b32 	%r116, %r115, 2;
-	add.s32 	%r117, %r292, %r116;
-	ld.global.u32 	%r118, [%r117];
-	and.b32  	%r119, %r118, 255;
-	add.s32 	%r120, %r119, %r106;
-	shr.u32 	%r121, %r118, 8;
-	and.b32  	%r122, %r121, 255;
-	add.s32 	%r123, %r122, %r109;
-	shr.u32 	%r124, %r118, 16;
-	and.b32  	%r125, %r124, 255;
-	add.s32 	%r126, %r125, %r112;
-	shr.u32 	%r127, %r118, 24;
-	add.s32 	%r128, %r127, %r114;
-	mad.lo.s32 	%r129, %r290, %r287, %r300;
-	shl.b32 	%r130, %r129, 2;
-	add.s32 	%r131, %r292, %r130;
-	ld.global.u32 	%r132, [%r131];
-	and.b32  	%r133, %r132, 255;
-	add.s32 	%r134, %r133, %r120;
-	shr.u32 	%r135, %r132, 8;
-	and.b32  	%r136, %r135, 255;
-	add.s32 	%r137, %r136, %r123;
-	shr.u32 	%r138, %r132, 16;
-	and.b32  	%r139, %r138, 255;
-	add.s32 	%r140, %r139, %r126;
-	shr.u32 	%r141, %r132, 24;
-	add.s32 	%r142, %r141, %r128;
-	mul.lo.s32 	%r143, %r84, 5;
-	shr.u32 	%r144, %r143, 2;
-	add.s32 	%r145, %r144, %r300;
-	shl.b32 	%r146, %r145, 2;
-	add.s32 	%r147, %r292, %r146;
-	ld.global.u32 	%r148, [%r147];
-	and.b32  	%r149, %r148, 255;
-	add.s32 	%r150, %r149, %r134;
-	shr.u32 	%r151, %r148, 8;
-	and.b32  	%r152, %r151, 255;
-	add.s32 	%r153, %r152, %r137;
-	shr.u32 	%r154, %r148, 16;
-	and.b32  	%r155, %r154, 255;
-	add.s32 	%r156, %r155, %r140;
-	shr.u32 	%r157, %r148, 24;
-	add.s32 	%r158, %r157, %r142;
-	mul.lo.s32 	%r159, %r84, 6;
-	shr.u32 	%r160, %r159, 2;
-	add.s32 	%r161, %r160, %r300;
-	shl.b32 	%r162, %r161, 2;
-	add.s32 	%r163, %r292, %r162;
-	ld.global.u32 	%r164, [%r163];
-	and.b32  	%r165, %r164, 255;
-	add.s32 	%r166, %r165, %r150;
-	shr.u32 	%r167, %r164, 8;
-	and.b32  	%r168, %r167, 255;
-	add.s32 	%r169, %r168, %r153;
-	shr.u32 	%r170, %r164, 16;
-	and.b32  	%r171, %r170, 255;
-	add.s32 	%r172, %r171, %r156;
-	shr.u32 	%r173, %r164, 24;
-	add.s32 	%r174, %r173, %r158;
-	mul.lo.s32 	%r175, %r84, 7;
-	shr.u32 	%r176, %r175, 2;
-	add.s32 	%r177, %r176, %r300;
-	shl.b32 	%r178, %r177, 2;
-	add.s32 	%r179, %r292, %r178;
-	ld.global.u32 	%r180, [%r179];
-	and.b32  	%r181, %r180, 255;
-	add.s32 	%r182, %r181, %r166;
-	shr.u32 	%r183, %r180, 8;
-	and.b32  	%r184, %r183, 255;
-	add.s32 	%r185, %r184, %r169;
-	shr.u32 	%r186, %r180, 16;
-	and.b32  	%r187, %r186, 255;
-	add.s32 	%r188, %r187, %r172;
-	shr.u32 	%r189, %r180, 24;
-	add.s32 	%r190, %r189, %r174;
-	shl.b32 	%r191, %r84, 1;
-	add.s32 	%r192, %r191, %r300;
-	shl.b32 	%r193, %r192, 2;
-	add.s32 	%r194, %r292, %r193;
-	ld.global.u32 	%r195, [%r194];
-	and.b32  	%r196, %r195, 255;
-	add.s32 	%r197, %r196, %r182;
-	shr.u32 	%r198, %r195, 8;
-	and.b32  	%r199, %r198, 255;
-	add.s32 	%r200, %r199, %r185;
-	shr.u32 	%r201, %r195, 16;
-	and.b32  	%r202, %r201, 255;
-	add.s32 	%r203, %r202, %r188;
-	shr.u32 	%r204, %r195, 24;
-	add.s32 	%r205, %r204, %r190;
-	add.s32 	%r206, %r19, %r300;
-	shl.b32 	%r207, %r206, 2;
-	add.s32 	%r208, %r292, %r207;
-	ld.global.u32 	%r209, [%r208];
-	and.b32  	%r210, %r209, 255;
-	add.s32 	%r211, %r210, %r197;
-	shr.u32 	%r212, %r209, 8;
-	and.b32  	%r213, %r212, 255;
-	add.s32 	%r214, %r213, %r200;
-	shr.u32 	%r215, %r209, 16;
-	and.b32  	%r216, %r215, 255;
-	add.s32 	%r217, %r216, %r203;
-	shr.u32 	%r218, %r209, 24;
-	add.s32 	%r219, %r218, %r205;
-	add.s32 	%r220, %r20, %r300;
-	shl.b32 	%r221, %r220, 2;
-	add.s32 	%r222, %r292, %r221;
-	ld.global.u32 	%r223, [%r222];
-	and.b32  	%r224, %r223, 255;
-	add.s32 	%r225, %r224, %r211;
-	shr.u32 	%r226, %r223, 8;
-	and.b32  	%r227, %r226, 255;
-	add.s32 	%r228, %r227, %r214;
-	shr.u32 	%r229, %r223, 16;
-	and.b32  	%r230, %r229, 255;
-	add.s32 	%r231, %r230, %r217;
-	shr.u32 	%r232, %r223, 24;
-	add.s32 	%r233, %r232, %r219;
-	add.s32 	%r234, %r21, %r300;
-	shl.b32 	%r235, %r234, 2;
-	add.s32 	%r236, %r292, %r235;
-	ld.global.u32 	%r237, [%r236];
-	and.b32  	%r238, %r237, 255;
-	add.s32 	%r239, %r238, %r225;
-	shr.u32 	%r240, %r237, 8;
-	and.b32  	%r241, %r240, 255;
-	add.s32 	%r242, %r241, %r228;
-	shr.u32 	%r243, %r237, 16;
-	and.b32  	%r244, %r243, 255;
-	add.s32 	%r245, %r244, %r231;
-	shr.u32 	%r246, %r237, 24;
-	add.s32 	%r247, %r246, %r233;
-	add.s32 	%r248, %r22, %r300;
-	shl.b32 	%r249, %r248, 2;
-	add.s32 	%r250, %r292, %r249;
-	ld.global.u32 	%r251, [%r250];
-	and.b32  	%r252, %r251, 255;
-	add.s32 	%r253, %r252, %r239;
-	shr.u32 	%r254, %r251, 8;
-	and.b32  	%r255, %r254, 255;
-	add.s32 	%r256, %r255, %r242;
-	shr.u32 	%r257, %r251, 16;
-	and.b32  	%r258, %r257, 255;
-	add.s32 	%r259, %r258, %r245;
-	shr.u32 	%r260, %r251, 24;
-	add.s32 	%r261, %r260, %r247;
-	add.s32 	%r262, %r23, %r300;
-	shl.b32 	%r263, %r262, 2;
-	add.s32 	%r264, %r292, %r263;
-	ld.global.u32 	%r265, [%r264];
-	and.b32  	%r266, %r265, 255;
-	add.s32 	%r53, %r266, %r253;
-	shr.u32 	%r267, %r265, 8;
-	and.b32  	%r268, %r267, 255;
-	add.s32 	%r56, %r268, %r256;
-	shr.u32 	%r269, %r265, 16;
-	and.b32  	%r270, %r269, 255;
-	add.s32 	%r59, %r270, %r259;
-	shr.u32 	%r271, %r265, 24;
-	add.s32 	%r62, %r271, %r261;
-	// inline asm
-	min.u32 	%r52, %r53, %r63;
-	// inline asm
-	// inline asm
-	min.u32 	%r55, %r56, %r63;
-	// inline asm
-	// inline asm
-	min.u32 	%r58, %r59, %r63;
-	// inline asm
-	// inline asm
-	min.u32 	%r61, %r62, %r63;
-	// inline asm
-	cvt.u8.u32 	%rc5, %r52;
-	cvt.u8.u32 	%rc6, %r55;
-	cvt.u8.u32 	%rc7, %r58;
-	cvt.u8.u32 	%rc8, %r61;
-	ld.param.u32 	%r297, [histo_final_kernel_param_10];
-	add.s32 	%r272, %r297, %r67;
-	st.global.v4.u8 	[%r272], {%rc5, %rc6, %rc7, %rc8};
-	mad.lo.s32 	%r300, %r32, %r31, %r300;
-	setp.lt.u32 	%p4, %r300, %r17;
-	@%p4 bra 	BB4_5;
-
-BB4_6:
-	add.s32 	%r301, %r9, %r17;
-	ld.param.u32 	%r286, [histo_final_kernel_param_2];
-	ld.param.u32 	%r289, [histo_final_kernel_param_3];
-	mul.lo.s32 	%r273, %r289, %r286;
-	shr.u32 	%r27, %r273, 2;
-	setp.ge.u32 	%p5, %r301, %r27;
-	@%p5 bra 	BB4_9;
-
-	mul.lo.s32 	%r28, %r32, %r31;
-
-BB4_8:
-	shl.b32 	%r274, %r301, 3;
-	ld.param.u32 	%r293, [histo_final_kernel_param_6];
-	add.s32 	%r275, %r293, %r274;
-	ld.global.v4.u16 	{%rs27, %rs28, %rs29, %rs30}, [%r275];
-	mov.u16 	%rs26, 0;
-	st.global.v4.u16 	[%r275], {%rs26, %rs26, %rs26, %rs26};
-	mov.u16 	%rs25, 255;
-	// inline asm
-	min.u16 	%rs14, %rs27, %rs25;
-	// inline asm
-	// inline asm
-	min.u16 	%rs17, %rs28, %rs25;
-	// inline asm
-	// inline asm
-	min.u16 	%rs20, %rs29, %rs25;
-	// inline asm
-	// inline asm
-	min.u16 	%rs23, %rs30, %rs25;
-	// inline asm
-	cvt.u8.u16 	%rc9, %rs14;
-	cvt.u8.u16 	%rc10, %rs17;
-	cvt.u8.u16 	%rc11, %rs20;
-	cvt.u8.u16 	%rc12, %rs23;
-	shl.b32 	%r276, %r301, 2;
-	ld.param.u32 	%r296, [histo_final_kernel_param_10];
-	add.s32 	%r277, %r296, %r276;
-	st.global.v4.u8 	[%r277], {%rc9, %rc10, %rc11, %rc12};
-	add.s32 	%r301, %r301, %r28;
-	setp.lt.u32 	%p6, %r301, %r27;
-	@%p6 bra 	BB4_8;
-
-BB4_9:
-	ret;
-}
-
-
diff --git a/hpvm/test/parboil/benchmarks/histo/src/base/Makefile b/hpvm/test/parboil/benchmarks/histo/src/base/Makefile
deleted file mode 100644
index 4039f8d90e..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/base/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-LANGUAGE=c
-SRCDIR_OBJS=main.o util.o
diff --git a/hpvm/test/parboil/benchmarks/histo/src/base/bmp.h b/hpvm/test/parboil/benchmarks/histo/src/base/bmp.h
deleted file mode 100644
index d1b7c1b562..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/base/bmp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "stdlib.h"
-
-typedef struct{
-  unsigned char B;
-  unsigned char G;
-  unsigned char R;
-} RGB;
-
-typedef struct {
-  unsigned int filesz;
-  unsigned short creator1;
-  unsigned short creator2;
-  unsigned int bmp_offset;
-} bmpfile_header_t;
-
-typedef struct {
-  unsigned int header_sz;
-  unsigned int width;
-  unsigned int height;
-  unsigned short nplanes;
-  unsigned short bitspp;
-  unsigned int compress_type;
-  unsigned int bmp_bytesz;
-  unsigned int hres;
-  unsigned int vres;
-  unsigned int ncolors;
-  unsigned int nimpcolors;
-} bmp_dib_header_t;
-
-typedef enum {
-  BI_RGB = 0,
-  BI_RLE8,
-  BI_RLE4,
-  BI_BITFIELDS,
-  BI_JPEG,
-  BI_PNG,
-} bmp_compression_method_t;
-
-typedef struct{
-  unsigned char magic[2];
-  bmpfile_header_t file_header;
-  bmp_dib_header_t dib_header;
-  unsigned int* palette;
-  void* pixel_map;
-} bmp_image;
-
-void create_bmp(RGB* bitmap, int height, int width, const char* filename){
-    bmp_image image;
-
-    int padded_width = 4*(((width*24)+31)/32);
-    padded_width -= width*sizeof(RGB);
-
-    char* pad = (char*) calloc (padded_width, sizeof(char));
-
-    image.magic[0]='B';
-    image.magic[1]='M';
-
-    image.file_header.filesz = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t) + height*width*sizeof(RGB);
-    image.file_header.creator1 = image.file_header.creator2 = 0;
-    image.file_header.bmp_offset = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t);
-
-    image.dib_header.header_sz = 40;//sizeof(bmp_dib_header_t);
-    image.dib_header.width = width;
-    image.dib_header.height = height;
-    image.dib_header.nplanes = 1;
-    image.dib_header.bitspp = 24;
-    image.dib_header.compress_type = 0;
-    image.dib_header.bmp_bytesz = width*height*sizeof(RGB);
-    image.dib_header.hres = 0;
-    image.dib_header.vres = 0;
-    image.dib_header.ncolors = 0;
-    image.dib_header.nimpcolors = 0;
-
-    FILE* out_file = fopen(filename,"wb");
-
-    fwrite(image.magic,sizeof(char),2,out_file);
-    fwrite(&(image.file_header),sizeof(char),sizeof(bmpfile_header_t),out_file);
-    fwrite(&(image.dib_header),sizeof(char),sizeof(bmp_dib_header_t),out_file);
-
-    int h;
-    for (h = height-1; h >= 0; h--){
-      fwrite(&bitmap[h*width],sizeof(RGB),width,out_file);
-      fwrite(pad,sizeof(char),padded_width,out_file);
-    }
-
-    fclose(out_file);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/base/main.c b/hpvm/test/parboil/benchmarks/histo/src/base/main.c
deleted file mode 100644
index 3ab5596cd7..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/base/main.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "util.h"
-
-#define UINT8_MAX 255
-
-/******************************************************************************
-* Implementation: Reference
-* Details:
-* This implementations is a scalar, minimally optimized version. The only 
-* optimization, which reduces the number of pointer chasing operations is the 
-* use of a temporary pointer for each row.
-******************************************************************************/
-
-int main(int argc, char* argv[]) {
-  struct pb_TimerSet timers;
-  struct pb_Parameters *parameters;
-
-  printf("Base implementation of histogramming.\n");
-  printf("Maintained by Nady Obeid <obeid1@ece.uiuc.edu>\n");
-
-  parameters = pb_ReadParameters(&argc, argv);
-  if (!parameters)
-    return -1;
-
-  if(!parameters->inpFiles[0]){
-    fputs("Input file expected\n", stderr);
-    return -1;
-  }
-
-  int numIterations;
-  if (argc >= 2){
-    numIterations = atoi(argv[1]);
-  } else {
-    fputs("Expected at least one command line argument\n", stderr);
-    return -1;
-  }
-
-  pb_InitializeTimerSet(&timers);
-  
-  char *inputStr = "Input";
-  char *outputStr = "Output";
-  
-  pb_AddSubTimer(&timers, inputStr, pb_TimerID_IO);
-  pb_AddSubTimer(&timers, outputStr, pb_TimerID_IO);
-  
-  pb_SwitchToSubTimer(&timers, inputStr, pb_TimerID_IO);  
-
-  unsigned int img_width, img_height;
-  unsigned int histo_width, histo_height;
-
-  FILE* f = fopen(parameters->inpFiles[0],"rb");
-  int result = 0;
-
-  result += fread(&img_width,    sizeof(unsigned int), 1, f);
-  result += fread(&img_height,   sizeof(unsigned int), 1, f);
-  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-  result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-  if (result != 4){
-    fputs("Error reading input and output dimensions from file\n", stderr);
-    return -1;
-  }
-
-  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-  
-  pb_SwitchToSubTimer(&timers, "Input", pb_TimerID_IO);
-
-  result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-  fclose(f);
-
-  if (result != img_width*img_height){
-    fputs("Error reading input array from file\n", stderr);
-    return -1;
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  int iter;
-  for (iter = 0; iter < numIterations; iter++){
-    memset(histo,0,histo_height*histo_width*sizeof(unsigned char));
-    unsigned int i;
-    for (i = 0; i < img_width*img_height; ++i) {
-      const unsigned int value = img[i];
-      if (histo[value] < UINT8_MAX) {
-        ++histo[value];
-      }
-    }
-  }
-
-//  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  pb_SwitchToSubTimer(&timers, outputStr, pb_TimerID_IO);
-
-  if (parameters->outFile) {
-    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  free(img);
-  free(histo);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(parameters);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/base/util.c b/hpvm/test/parboil/benchmarks/histo/src/base/util.c
deleted file mode 100644
index 400f1c4a29..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/base/util.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#include "bmp.h"
-
-// This function takes an HSV value and converts it to BMP.
-// We use this function to generate colored images with
-// Smooth spectrum traversal for the input and output images.
-RGB HSVtoRGB( float h, float s, float v )
-{
-    int i;
-    float f, p, q, t;
-    float r, g, b;
-    RGB value={0,0,0};
-
-    if( s == 0 ) {
-        r = g = b = v;
-        return value;
-    }
-    h /= 60;
-    i = floor( h );
-    f = h - i;
-    p = v * ( 1 - s );
-    q = v * ( 1 - s * f );
-    t = v * ( 1 - s * ( 1 - f ) );
-    switch( i ) {
-        case 0:
-            r = v; g = t; b = p;
-            break;
-        case 1:
-            r = q; g = v; b = p;
-            break;
-        case 2:
-            r = p; g = v; b = t;
-            break;
-        case 3:
-            r = p; g = q; b = v;
-            break;
-        case 4:
-            r = t; g = p; b = v;
-            break;
-        default:
-            r = v; g = p; b = q;
-            break;
-    }
-
-    unsigned int temp = r*255;
-    value.R = temp;
-    temp = g*255;
-    value.G = temp;
-    temp = b*255;
-    value.B = temp;
-
-    return value;
-}
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename)
-{
-    RGB* pixel_map = (RGB*) malloc (height*width*sizeof(RGB));
-
-    size_t y, x;
-    for (y = 0; y < height; ++y)
-    {
-        for (x = 0; x < width; ++x)
-        {
-            unsigned char value = histo[y * width + x];
-
-            if (value == 0){
-                pixel_map[y*width+x].R = 0;
-                pixel_map[y*width+x].G = 0;
-                pixel_map[y*width+x].B = 0;
-            } else {
-                pixel_map[y*width+x] = HSVtoRGB(0.0,1.0,cbrt(1+ 63.0*((float)value)/((float)UINT8_MAX))/4);
-            }
-        }
-    }
-    create_bmp(pixel_map, height, width, filename);
-    free(pixel_map);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/base/util.h b/hpvm/test/parboil/benchmarks/histo/src/base/util.h
deleted file mode 100644
index 9827acb018..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/base/util.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename);
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda/Makefile b/hpvm/test/parboil/benchmarks/histo/src/cuda/Makefile
deleted file mode 100644
index 9e337126a5..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=cuda
-SRCDIR_OBJS=main.o histo_final.o histo_intermediates.o histo_main.o histo_prescan.o util.o
-APP_CUDACFLAGS=-arch compute_20
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda/bmp.h b/hpvm/test/parboil/benchmarks/histo/src/cuda/bmp.h
deleted file mode 100644
index d1b7c1b562..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda/bmp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "stdlib.h"
-
-typedef struct{
-  unsigned char B;
-  unsigned char G;
-  unsigned char R;
-} RGB;
-
-typedef struct {
-  unsigned int filesz;
-  unsigned short creator1;
-  unsigned short creator2;
-  unsigned int bmp_offset;
-} bmpfile_header_t;
-
-typedef struct {
-  unsigned int header_sz;
-  unsigned int width;
-  unsigned int height;
-  unsigned short nplanes;
-  unsigned short bitspp;
-  unsigned int compress_type;
-  unsigned int bmp_bytesz;
-  unsigned int hres;
-  unsigned int vres;
-  unsigned int ncolors;
-  unsigned int nimpcolors;
-} bmp_dib_header_t;
-
-typedef enum {
-  BI_RGB = 0,
-  BI_RLE8,
-  BI_RLE4,
-  BI_BITFIELDS,
-  BI_JPEG,
-  BI_PNG,
-} bmp_compression_method_t;
-
-typedef struct{
-  unsigned char magic[2];
-  bmpfile_header_t file_header;
-  bmp_dib_header_t dib_header;
-  unsigned int* palette;
-  void* pixel_map;
-} bmp_image;
-
-void create_bmp(RGB* bitmap, int height, int width, const char* filename){
-    bmp_image image;
-
-    int padded_width = 4*(((width*24)+31)/32);
-    padded_width -= width*sizeof(RGB);
-
-    char* pad = (char*) calloc (padded_width, sizeof(char));
-
-    image.magic[0]='B';
-    image.magic[1]='M';
-
-    image.file_header.filesz = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t) + height*width*sizeof(RGB);
-    image.file_header.creator1 = image.file_header.creator2 = 0;
-    image.file_header.bmp_offset = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t);
-
-    image.dib_header.header_sz = 40;//sizeof(bmp_dib_header_t);
-    image.dib_header.width = width;
-    image.dib_header.height = height;
-    image.dib_header.nplanes = 1;
-    image.dib_header.bitspp = 24;
-    image.dib_header.compress_type = 0;
-    image.dib_header.bmp_bytesz = width*height*sizeof(RGB);
-    image.dib_header.hres = 0;
-    image.dib_header.vres = 0;
-    image.dib_header.ncolors = 0;
-    image.dib_header.nimpcolors = 0;
-
-    FILE* out_file = fopen(filename,"wb");
-
-    fwrite(image.magic,sizeof(char),2,out_file);
-    fwrite(&(image.file_header),sizeof(char),sizeof(bmpfile_header_t),out_file);
-    fwrite(&(image.dib_header),sizeof(char),sizeof(bmp_dib_header_t),out_file);
-
-    int h;
-    for (h = height-1; h >= 0; h--){
-      fwrite(&bitmap[h*width],sizeof(RGB),width,out_file);
-      fwrite(pad,sizeof(char),padded_width,out_file);
-    }
-
-    fclose(out_file);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_final.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_final.cu
deleted file mode 100644
index 515193d858..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_final.cu
+++ /dev/null
@@ -1,108 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "util.h"
-
-/* Combine all the sub-histogram results into one final histogram */
-__global__ void histo_final_kernel (
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto,
-    unsigned int *global_histo,
-    unsigned int *global_overflow,
-    unsigned int *final_histo) //final output
-{
-    unsigned int start_offset = threadIdx.x + blockIdx.x * blockDim.x;
-    const ushort4 zero_short  = {0, 0, 0, 0};
-    const uint4 zero_int      = {0, 0, 0, 0};
-
-    unsigned int size_low_histo = sm_range_min * BINS_PER_BLOCK;
-    unsigned int size_mid_histo = (sm_range_max - sm_range_min +1) * BINS_PER_BLOCK;
-
-    /* Clear lower region of global histogram */
-    for (unsigned int i = start_offset; i < size_low_histo/4; i += gridDim.x * blockDim.x)
-    {
-        ushort4 global_histo_data = ((ushort4*)global_histo)[i];
-        ((ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, 255);
-        global_histo_data.y = min (global_histo_data.y, 255);
-        global_histo_data.z = min (global_histo_data.z, 255);
-        global_histo_data.w = min (global_histo_data.w, 255);
-
-        uchar4 final_histo_data = {
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the middle region of the overflow buffer */
-    for (unsigned int i = (size_low_histo/4) + start_offset; i < (size_low_histo+size_mid_histo)/4; i += gridDim.x * blockDim.x)
-    {
-        uint4 global_histo_data = ((uint4*)global_overflow)[i];
-        ((uint4*)global_overflow)[i] = zero_int;
-
-        uint4 internal_histo_data = {
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        };
-
-        #pragma unroll
-        for (int j = 0; j < BLOCK_X; j++)
-        {
-            unsigned int bin4in = ((unsigned int*)global_subhisto)[i + j * histo_height * histo_width / 4];
-            internal_histo_data.x += (bin4in >>  0) & 0xFF;
-            internal_histo_data.y += (bin4in >>  8) & 0xFF;
-            internal_histo_data.z += (bin4in >> 16) & 0xFF;
-            internal_histo_data.w += (bin4in >> 24) & 0xFF;
-        }
-
-        internal_histo_data.x = min (internal_histo_data.x, 255);
-        internal_histo_data.y = min (internal_histo_data.y, 255);
-        internal_histo_data.z = min (internal_histo_data.z, 255);
-        internal_histo_data.w = min (internal_histo_data.w, 255);
-
-        uchar4 final_histo_data = {
-            internal_histo_data.x,
-            internal_histo_data.y,
-            internal_histo_data.z,
-            internal_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the upper region of global histogram */
-    for (unsigned int i = ((size_low_histo+size_mid_histo)/4) + start_offset; i < (histo_height*histo_width)/4; i += gridDim.x * blockDim.x)
-    {
-        ushort4 global_histo_data = ((ushort4*)global_histo)[i];
-        ((ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, 255);
-        global_histo_data.y = min (global_histo_data.y, 255);
-        global_histo_data.z = min (global_histo_data.z, 255);
-        global_histo_data.w = min (global_histo_data.w, 255);
-
-        uchar4 final_histo_data = {
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_intermediates.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_intermediates.cu
deleted file mode 100644
index a82e4c60b4..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_intermediates.cu
+++ /dev/null
@@ -1,66 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <cuda.h>
-
-#include "util.h"
-
-__device__ void calculateBin (
-        const unsigned int bin,
-        uchar4 *sm_mapping)
-{
-        unsigned char offset  =  bin        %   4;
-        unsigned char indexlo = (bin >>  2) % 256;
-        unsigned char indexhi = (bin >> 10) %  KB;
-        unsigned char block   =  bin / BINS_PER_BLOCK;
-
-        offset *= 8;
-
-        uchar4 sm;
-        sm.x = block;
-        sm.y = indexhi;
-        sm.z = indexlo;
-        sm.w = offset;
-
-        *sm_mapping = sm;
-}
-
-__global__ void histo_intermediates_kernel (
-        uint2 *input,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        uchar4 *sm_mappings)
-{
-        unsigned int line = UNROLL * blockIdx.x;// 16 is the unroll factor;
-
-        uint2 *load_bin = input + line * input_pitch + threadIdx.x;
-
-        unsigned int store = line * width + threadIdx.x;
-        bool skip = (width % 2) && (threadIdx.x == (blockDim.x - 1));
-
-        #pragma unroll
-        for (int i = 0; i < UNROLL; i++)
-        {
-                uint2 bin_value = *load_bin;
-
-                calculateBin (
-                        bin_value.x,
-                        &sm_mappings[store]
-                );
-
-                if (!skip) calculateBin (
-                        bin_value.y,
-                        &sm_mappings[store + blockDim.x]
-                );
-
-                load_bin += input_pitch;
-                store += width;
-        }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_main.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_main.cu
deleted file mode 100644
index 584863589c..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_main.cu
+++ /dev/null
@@ -1,189 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <cuda.h>
-
-#include "util.h"
-
-__device__ void testIncrementGlobal (
-        unsigned int *global_histo,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are outside the central region of histogram */
-        if (range < sm_range_min || range > sm_range_max)
-        {
-                const unsigned int bin = range * BINS_PER_BLOCK + offset / 8 + (indexlo << 2) + (indexhi << 10);
-                const unsigned int bin_div2 = bin / 2;
-                const unsigned int bin_offset = (bin % 2 == 1) ? 16 : 0;
-
-                unsigned int old_val = global_histo[bin_div2];
-                unsigned short old_bin = (old_val >> bin_offset) & 0xFFFF;
-
-                if (old_bin < 255)
-                {
-                        atomicAdd (&global_histo[bin_div2], 1 << bin_offset);
-                }
-        }
-}
-
-__device__ void testIncrementLocal (
-        unsigned int *global_overflow,
-        unsigned int smem[KB][256],
-        const unsigned int myRange,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are inside the central region of histogram */
-        if (range == myRange)
-        {
-                /* Atomically increment shared memory */
-                unsigned int add = (unsigned int)(1 << offset);
-                unsigned int prev = atomicAdd (&smem[indexhi][indexlo], add);
-
-                /* Check if current bin overflowed */
-                unsigned int prev_bin_val = (prev >> offset) & 0x000000FF;
-
-                /* If there was an overflow, record it and record if it cascaded into other bins */
-                if (prev_bin_val == 0x000000FF)
-                {
-                        const unsigned int bin =
-                                range * BINS_PER_BLOCK +
-                                offset / 8 + (indexlo << 2) + (indexhi << 10);
-
-                        bool can_overflow_to_bin_plus_1 = (offset < 24) ? true : false;
-                        bool can_overflow_to_bin_plus_2 = (offset < 16) ? true : false;
-                        bool can_overflow_to_bin_plus_3 = (offset <  8) ? true : false;
-
-                        bool overflow_into_bin_plus_1 = false;
-                        bool overflow_into_bin_plus_2 = false;
-                        bool overflow_into_bin_plus_3 = false;
-
-                        unsigned int prev_bin_plus_1_val = (prev >> (offset +  8)) & 0x000000FF;
-                        unsigned int prev_bin_plus_2_val = (prev >> (offset + 16)) & 0x000000FF;
-                        unsigned int prev_bin_plus_3_val = (prev >> (offset + 24)) & 0x000000FF;
-
-                        if (can_overflow_to_bin_plus_1 &&        prev_bin_val == 0x000000FF) overflow_into_bin_plus_1 = true;
-                        if (can_overflow_to_bin_plus_2 && prev_bin_plus_1_val == 0x000000FF) overflow_into_bin_plus_2 = true;
-                        if (can_overflow_to_bin_plus_3 && prev_bin_plus_2_val == 0x000000FF) overflow_into_bin_plus_3 = true;
-
-                        unsigned int bin_plus_1_add;
-                        unsigned int bin_plus_2_add;
-                        unsigned int bin_plus_3_add;
-
-                        if (overflow_into_bin_plus_1) bin_plus_1_add = (prev_bin_plus_1_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_2) bin_plus_2_add = (prev_bin_plus_2_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_3) bin_plus_3_add = (prev_bin_plus_3_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-
-                                                      atomicAdd (&global_overflow[bin],   256);
-                        if (overflow_into_bin_plus_1) atomicAdd (&global_overflow[bin+1], bin_plus_1_add);
-                        if (overflow_into_bin_plus_2) atomicAdd (&global_overflow[bin+2], bin_plus_2_add);
-                        if (overflow_into_bin_plus_3) atomicAdd (&global_overflow[bin+3], bin_plus_3_add);
-                }
-        }
-}
-
-__device__ void clearMemory (unsigned int smem[KB][256])
-{
-        for (int i = threadIdx.x; i < BINS_PER_BLOCK / 4; i += blockDim.x)
-        {
-                ((unsigned int*)smem)[i] = 0;
-        }
-}
-
-__device__ void copyMemory (unsigned int *dst, unsigned int src[KB][256])
-{
-        for (int i = threadIdx.x; i < BINS_PER_BLOCK / 4; i += blockDim.x)
-        {
-                dst[i] = ((unsigned int*)src)[i];
-        }
-}
-
-__global__ void histo_main_kernel (
-        uchar4 *sm_mappings,
-        unsigned int num_elements,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        unsigned int *global_subhisto,
-        unsigned int *global_histo,
-        unsigned int *global_overflow)
-{
-        /* Most optimal solution uses 24 * 1024 bins per threadblock */
-        __shared__ unsigned int sub_histo[KB][256];
-
-        /* Each threadblock contributes to a specific 24KB range of histogram,
-         * and also scans every N-th line for interesting data.  N = gridDim.x
-         */
-        unsigned int local_scan_range = sm_range_min + blockIdx.y;
-        unsigned int local_scan_load = blockIdx.x * blockDim.x + threadIdx.x;
-
-        clearMemory (sub_histo);
-        __syncthreads();
-
-        if (blockIdx.y == 0)
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDim.x * gridDim.x;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                        testIncrementGlobal (
-                                global_histo,
-                                sm_range_min,
-                                sm_range_max,
-                                sm
-                        );
-                }
-        }
-        else
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDim.x * gridDim.x;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                }
-        }
-
-        /* Store sub histogram to global memory */
-        unsigned int store_index = blockIdx.x * (histo_height * histo_width / 4) + (local_scan_range * BINS_PER_BLOCK / 4);
-
-        __syncthreads();
-        copyMemory (&(global_subhisto[store_index]), sub_histo);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_prescan.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_prescan.cu
deleted file mode 100644
index 5d9eb854eb..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda/histo_prescan.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdint.h>
-#include <cuda.h>
-
-#include "util.h"
-
-__global__ void histo_prescan_kernel (unsigned int* input, int size, unsigned int* minmax)
-{
-    __shared__ float Avg[PRESCAN_THREADS];
-    __shared__ float StdDev[PRESCAN_THREADS];
-
-    int stride = size/gridDim.x;
-    int addr = blockIdx.x*stride+threadIdx.x;
-    int end = blockIdx.x*stride + stride/8; // Only sample 1/8th of the input data
-
-    // Compute the average per thread
-    float avg = 0.0;
-    unsigned int count = 0;
-    while (addr < end){
-        avg += input[addr];
-        count++;
-	addr += blockDim.x;
-    }
-    avg /= count;
-    Avg[threadIdx.x] = avg;
-
-    // Compute the standard deviation per thread
-    int addr2 = blockIdx.x*stride+threadIdx.x;
-    float stddev = 0;
-    while (addr2 < end){
-        stddev += (input[addr2]-avg)*(input[addr2]-avg);
-        addr2 += blockDim.x;
-    }
-    stddev /= count;
-    StdDev[threadIdx.x] = sqrtf(stddev);
-
-#define SUM(stride__)\
-if(threadIdx.x < stride__){\
-    Avg[threadIdx.x] += Avg[threadIdx.x+stride__];\
-    StdDev[threadIdx.x] += StdDev[threadIdx.x+stride__];\
-}
-
-    // Add all the averages and standard deviations from all the threads
-    // and take their arithmetic average (as a simplified approximation of the
-    // real average and standard deviation.
-#if (PRESCAN_THREADS >= 32)    
-    for (int stride = PRESCAN_THREADS/2; stride >= 32; stride = stride >> 1){
-	__syncthreads();
-	SUM(stride);
-    }
-#endif
-#if (PRESCAN_THREADS >= 16)
-    SUM(16);
-#endif
-#if (PRESCAN_THREADS >= 8)
-    SUM(8);
-#endif
-#if (PRESCAN_THREADS >= 4)
-    SUM(4);
-#endif
-#if (PRESCAN_THREADS >= 2)
-    SUM(2);
-#endif
-
-    if (threadIdx.x == 0){
-        float avg = Avg[0]+Avg[1];
-	avg /= PRESCAN_THREADS;
-	float stddev = StdDev[0]+StdDev[1];
-	stddev /= PRESCAN_THREADS;
-
-        // Take the maximum and minimum range from all the blocks. This will
-        // be the final answer. The standard deviation is taken out to 10 sigma
-        // away from the average. The value 10 was obtained empirically.
-	    atomicMin(minmax,((unsigned int)(avg-10*stddev))/(KB*1024));
-        atomicMax(minmax+1,((unsigned int)(avg+10*stddev))/(KB*1024));
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda/main.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda/main.cu
deleted file mode 100644
index f244131537..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda/main.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <cuda.h>
-
-#include "util.h"
-
-__global__ void histo_prescan_kernel (
-        unsigned int* input,
-        int size,
-        unsigned int* minmax);
-
-__global__ void histo_main_kernel (
-        uchar4 *sm_mappings,
-        unsigned int num_elements,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        unsigned int *global_subhisto,
-        unsigned int *global_histo,
-        unsigned int *global_overflow);
-
-__global__ void histo_intermediates_kernel (
-        uint2 *input,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        uchar4 *sm_mappings);
-
-__global__ void histo_final_kernel (
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        unsigned int *global_subhisto,
-        unsigned int *global_histo,
-        unsigned int *global_overflow,
-        unsigned int *final_histo);
-
-/******************************************************************************
-* Implementation: GPU
-* Details:
-* in the GPU implementation of histogram, we begin by computing the span of the
-* input values into the histogram. Then the histogramming computation is carried
-* out by a (BLOCK_X, BLOCK_Y) sized grid, where every group of Y (same X)
-* computes its own partial histogram for a part of the input, and every Y in the
-* group exclusively writes to a portion of the span computed in the beginning.
-* Finally, a reduction is performed to combine all the partial histograms into
-* the final result.
-******************************************************************************/
-
-int main(int argc, char* argv[]) {
-  struct pb_TimerSet timers;
-  struct pb_Parameters *parameters;
-
-  parameters = pb_ReadParameters(&argc, argv);
-  if (!parameters)
-    return -1;
-
-  if(!parameters->inpFiles[0]){
-    fputs("Input file expected\n", stderr);
-    return -1;
-  }
-  
-  char *prescans = "PreScanKernel";
-  char *postpremems = "PostPreMems";
-  char *intermediates = "IntermediatesKernel";
-  char *mains = "MainKernel";
-  char *finals = "FinalKernel";
-
-  pb_InitializeTimerSet(&timers);
-  
-  pb_AddSubTimer(&timers, prescans, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, postpremems, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, mains, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);
-  
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  int numIterations;
-  if (argc >= 2){
-    numIterations = atoi(argv[1]);
-  } else {
-    fputs("Expected at least one command line argument\n", stderr);
-    return -1;
-  }
-
-  unsigned int img_width, img_height;
-  unsigned int histo_width, histo_height;
-
-  FILE* f = fopen(parameters->inpFiles[0],"rb");
-  int result = 0;
-
-  result += fread(&img_width,    sizeof(unsigned int), 1, f);
-  result += fread(&img_height,   sizeof(unsigned int), 1, f);
-  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-  result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-  if (result != 4){
-    fputs("Error reading input and output dimensions from file\n", stderr);
-    return -1;
-  }
-
-  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-
-  result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-  fclose(f);
-
-  if (result != img_width*img_height){
-    fputs("Error reading input array from file\n", stderr);
-    return -1;
-  }
-
-  int even_width = ((img_width+1)/2)*2;
-  unsigned int* input;
-  unsigned int* ranges;
-  uchar4* sm_mappings;
-  unsigned int* global_subhisto;
-  unsigned short* global_histo;
-  unsigned int* global_overflow;
-  unsigned char* final_histo;
-
-  cudaMalloc((void**)&input           , even_width*(((img_height+UNROLL-1)/UNROLL)*UNROLL)*sizeof(unsigned int));
-  cudaMalloc((void**)&ranges          , 2*sizeof(unsigned int));
-  cudaMalloc((void**)&sm_mappings     , img_width*img_height*sizeof(uchar4));
-  cudaMalloc((void**)&global_subhisto , BLOCK_X*img_width*histo_height*sizeof(unsigned int));
-  cudaMalloc((void**)&global_histo    , img_width*histo_height*sizeof(unsigned short));
-  cudaMalloc((void**)&global_overflow , img_width*histo_height*sizeof(unsigned int));
-  cudaMalloc((void**)&final_histo     , img_width*histo_height*sizeof(unsigned char));
-
-  cudaMemset(final_histo , 0 , img_width*histo_height*sizeof(unsigned char));
-
-  for (int y=0; y < img_height; y++){
-    cudaMemcpy(&(((unsigned int*)input)[y*even_width]),&img[y*img_width],img_width*sizeof(unsigned int), cudaMemcpyHostToDevice);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-  for (int iter = 0; iter < numIterations; iter++) {
-    unsigned int ranges_h[2] = {UINT32_MAX, 0};
-
-    cudaMemcpy(ranges,ranges_h, 2*sizeof(unsigned int), cudaMemcpyHostToDevice);
-    
-    pb_SwitchToSubTimer(&timers, prescans , pb_TimerID_KERNEL);
-
-    histo_prescan_kernel<<<dim3(PRESCAN_BLOCKS_X),dim3(PRESCAN_THREADS)>>>((unsigned int*)input, img_height*img_width, ranges);
-    
-    pb_SwitchToSubTimer(&timers, postpremems , pb_TimerID_KERNEL);
-
-    cudaMemcpy(ranges_h,ranges, 2*sizeof(unsigned int), cudaMemcpyDeviceToHost);
-
-    cudaMemset(global_subhisto,0,img_width*histo_height*sizeof(unsigned int));
-    
-    pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
-
-    histo_intermediates_kernel<<<dim3((img_height + UNROLL-1)/UNROLL), dim3((img_width+1)/2)>>>(
-                (uint2*)(input),
-                (unsigned int)img_height,
-                (unsigned int)img_width,
-                (img_width+1)/2,
-                (uchar4*)(sm_mappings)
-    );
-    
-    pb_SwitchToSubTimer(&timers, mains, pb_TimerID_KERNEL);
-    
-    
-    histo_main_kernel<<<dim3(BLOCK_X, ranges_h[1]-ranges_h[0]+1), dim3(THREADS)>>>(
-                (uchar4*)(sm_mappings),
-                img_height*img_width,
-                ranges_h[0], ranges_h[1],
-                histo_height, histo_width,
-                (unsigned int*)(global_subhisto),
-                (unsigned int*)(global_histo),
-                (unsigned int*)(global_overflow)    
-    );
-    
-    pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);
-    
-    histo_final_kernel<<<dim3(BLOCK_X*3), dim3(512)>>>(
-                ranges_h[0], ranges_h[1],
-                histo_height, histo_width,
-                (unsigned int*)(global_subhisto),
-                (unsigned int*)(global_histo),
-                (unsigned int*)(global_overflow),
-                (unsigned int*)(final_histo)
-    );
-  }
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  cudaMemcpy(histo,final_histo, histo_height*histo_width*sizeof(unsigned char), cudaMemcpyDeviceToHost);
-
-  cudaFree(input);
-  cudaFree(ranges);
-  cudaFree(sm_mappings);
-  cudaFree(global_subhisto);
-  cudaFree(global_histo);
-  cudaFree(global_overflow);
-  cudaFree(final_histo);
-
-  if (parameters->outFile) {
-    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  free(img);
-  free(histo);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(parameters);
-  
-  pb_DestroyTimerSet(&timers);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda/util.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda/util.cu
deleted file mode 100644
index 266462c936..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda/util.cu
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#include "util.h"
-#include "bmp.h"
-
-// This function takes an HSV value and converts it to BMP.
-// We use this function to generate colored images with
-// Smooth spectrum traversal for the input and output images.
-RGB HSVtoRGB( float h, float s, float v )
-{
-    int i;
-    float f, p, q, t;
-    float r, g, b;
-    RGB value={0,0,0};
-
-    if( s == 0 ) {
-        r = g = b = v;
-        return value;
-    }
-    h /= 60;
-    i = floor( h );
-    f = h - i;
-    p = v * ( 1 - s );
-    q = v * ( 1 - s * f );
-    t = v * ( 1 - s * ( 1 - f ) );
-    switch( i ) {
-        case 0:
-            r = v; g = t; b = p;
-            break;
-        case 1:
-            r = q; g = v; b = p;
-            break;
-        case 2:
-            r = p; g = v; b = t;
-            break;
-        case 3:
-            r = p; g = q; b = v;
-            break;
-        case 4:
-            r = t; g = p; b = v;
-            break;
-        default:
-            r = v; g = p; b = q;
-            break;
-    }
-
-    unsigned int temp = r*255;
-    value.R = temp;
-    temp = g*255;
-    value.G = temp;
-    temp = b*255;
-    value.B = temp;
-
-    return value;
-}
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename)
-{
-    RGB* pixel_map = (RGB*) malloc (height*width*sizeof(RGB));
-
-    for (size_t y = 0; y < height; ++y)
-    {
-        for (size_t x = 0; x < width; ++x)
-        {
-            unsigned char value = histo[y * width + x];
-
-            if (value == 0){
-                pixel_map[y*width+x].R = 0;
-                pixel_map[y*width+x].G = 0;
-                pixel_map[y*width+x].B = 0;
-            } else {
-                pixel_map[y*width+x] = HSVtoRGB(0.0,1.0,cbrt(1+ 63.0*((float)value)/((float)UINT8_MAX))/4);
-            }
-        }
-    }
-    create_bmp(pixel_map, height, width, filename);
-    free(pixel_map);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda/util.h b/hpvm/test/parboil/benchmarks/histo/src/cuda/util.h
deleted file mode 100644
index 8673d2bca0..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda/util.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#define KB                      24
-#define BINS_PER_BLOCK          (KB * 1024)
-#define BLOCK_X         14
-
-#define PRESCAN_THREADS     512
-#define PRESCAN_BLOCKS_X    64
-
-#if KB == 24
-        #define THREADS         768
-#elif KB == 48
-        #define THREADS         1024
-#else //KB == 16 or other
-        #define THREADS         512
-#endif
-
-#define UNROLL 16
-#define UINT8_MAX 255
-#define UINT32_MAX 4294967295
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename);
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/Makefile b/hpvm/test/parboil/benchmarks/histo/src/cuda_base/Makefile
deleted file mode 100644
index 9e337126a5..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=cuda
-SRCDIR_OBJS=main.o histo_final.o histo_intermediates.o histo_main.o histo_prescan.o util.o
-APP_CUDACFLAGS=-arch compute_20
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/bmp.h b/hpvm/test/parboil/benchmarks/histo/src/cuda_base/bmp.h
deleted file mode 100644
index 278db4a0f0..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/bmp.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#include "stdio.h"
-#include "stdlib.h"
-
-typedef struct{
-  unsigned char B;
-  unsigned char G;
-  unsigned char R;
-} RGB;
-
-typedef struct {
-  unsigned int filesz;
-  unsigned short creator1;
-  unsigned short creator2;
-  unsigned int bmp_offset;
-} bmpfile_header_t;
-
-typedef struct {
-  unsigned int header_sz;
-  unsigned int width;
-  unsigned int height;
-  unsigned short nplanes;
-  unsigned short bitspp;
-  unsigned int compress_type;
-  unsigned int bmp_bytesz;
-  unsigned int hres;
-  unsigned int vres;
-  unsigned int ncolors;
-  unsigned int nimpcolors;
-} bmp_dib_header_t;
-
-typedef enum {
-  BI_RGB = 0,
-  BI_RLE8,
-  BI_RLE4,
-  BI_BITFIELDS,
-  BI_JPEG,
-  BI_PNG,
-} bmp_compression_method_t;
-
-typedef struct{
-  unsigned char magic[2];
-  bmpfile_header_t file_header;
-  bmp_dib_header_t dib_header;
-  unsigned int* palette;
-  void* pixel_map;
-} bmp_image;
-
-void create_bmp(RGB* bitmap, int height, int width, const char* filename){
-    bmp_image image;
-
-    int padded_width = 4*(((width*24)+31)/32);
-    padded_width -= width*sizeof(RGB);
-
-    char* pad = (char*) calloc (padded_width, sizeof(char));
-
-    image.magic[0]='B';
-    image.magic[1]='M';
-
-    image.file_header.filesz = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t) + height*width*sizeof(RGB);
-    image.file_header.creator1 = image.file_header.creator2 = 0;
-    image.file_header.bmp_offset = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t);
-
-    image.dib_header.header_sz = 40;//sizeof(bmp_dib_header_t);
-    image.dib_header.width = width;
-    image.dib_header.height = height;
-    image.dib_header.nplanes = 1;
-    image.dib_header.bitspp = 24;
-    image.dib_header.compress_type = 0;
-    image.dib_header.bmp_bytesz = width*height*sizeof(RGB);
-    image.dib_header.hres = 0;
-    image.dib_header.vres = 0;
-    image.dib_header.ncolors = 0;
-    image.dib_header.nimpcolors = 0;
-
-    FILE* out_file = fopen(filename,"wb");
-
-    fwrite(image.magic,sizeof(char),2,out_file);
-    fwrite(&(image.file_header),sizeof(char),sizeof(bmpfile_header_t),out_file);
-    fwrite(&(image.dib_header),sizeof(char),sizeof(bmp_dib_header_t),out_file);
-
-    int h;
-    for (h = height-1; h >= 0; h--){
-      fwrite(&bitmap[h*width],sizeof(RGB),width,out_file);
-      fwrite(pad,sizeof(char),padded_width,out_file);
-    }
-
-    fclose(out_file);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_final.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_final.cu
deleted file mode 100644
index f7563bbd50..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_final.cu
+++ /dev/null
@@ -1,95 +0,0 @@
-#include "util.h"
-
-/* Combine all the sub-histogram results into one final histogram */
-__global__ void histo_final_kernel (
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto,
-    unsigned int *global_histo,
-    unsigned int *global_overflow,
-    unsigned int *final_histo) //final output
-{
-    unsigned int start_offset = threadIdx.x + blockIdx.x * blockDim.x;
-    const ushort4 zero_short  = {0, 0, 0, 0};
-    const uint4 zero_int      = {0, 0, 0, 0};
-
-    unsigned int size_low_histo = sm_range_min * BINS_PER_BLOCK;
-    unsigned int size_mid_histo = (sm_range_max - sm_range_min +1) * BINS_PER_BLOCK;
-
-    /* Clear lower region of global histogram */
-    for (unsigned int i = start_offset; i < size_low_histo/4; i += gridDim.x * blockDim.x)
-    {
-        ushort4 global_histo_data = ((ushort4*)global_histo)[i];
-        ((ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, 255);
-        global_histo_data.y = min (global_histo_data.y, 255);
-        global_histo_data.z = min (global_histo_data.z, 255);
-        global_histo_data.w = min (global_histo_data.w, 255);
-
-        uchar4 final_histo_data = {
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the middle region of the overflow buffer */
-    for (unsigned int i = (size_low_histo/4) + start_offset; i < (size_low_histo+size_mid_histo)/4; i += gridDim.x * blockDim.x)
-    {
-        uint4 global_histo_data = ((uint4*)global_overflow)[i];
-        ((uint4*)global_overflow)[i] = zero_int;
-
-        uint4 internal_histo_data = {
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        };
-
-        unsigned int bin4in0 = ((unsigned int*)global_subhisto)[i*4];
-        unsigned int bin4in1 = ((unsigned int*)global_subhisto)[i*4+1];
-        unsigned int bin4in2 = ((unsigned int*)global_subhisto)[i*4+2];
-        unsigned int bin4in3 = ((unsigned int*)global_subhisto)[i*4+3];
-
-        internal_histo_data.x = min (bin4in0, 255);
-        internal_histo_data.y = min (bin4in1, 255);
-        internal_histo_data.z = min (bin4in2, 255);
-        internal_histo_data.w = min (bin4in3, 255);
-
-        uchar4 final_histo_data = {
-            internal_histo_data.x,
-            internal_histo_data.y,
-            internal_histo_data.z,
-            internal_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the upper region of global histogram */
-    for (unsigned int i = ((size_low_histo+size_mid_histo)/4) + start_offset; i < (histo_height*histo_width)/4; i += gridDim.x * blockDim.x)
-    {
-        ushort4 global_histo_data = ((ushort4*)global_histo)[i];
-        ((ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, 255);
-        global_histo_data.y = min (global_histo_data.y, 255);
-        global_histo_data.z = min (global_histo_data.z, 255);
-        global_histo_data.w = min (global_histo_data.w, 255);
-
-        uchar4 final_histo_data = {
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_intermediates.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_intermediates.cu
deleted file mode 100644
index e99338394c..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_intermediates.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <stdio.h>
-#include <cuda.h>
-
-#include "util.h"
-
-__device__ void calculateBin (
-        const unsigned int bin,
-        uchar4 *sm_mapping)
-{
-        unsigned char offset  =  bin        %   4;
-        unsigned char indexlo = (bin >>  2) % 256;
-        unsigned char indexhi = (bin >> 10) %  KB;
-        unsigned char block   =  bin / BINS_PER_BLOCK;
-
-        offset *= 8;
-
-        uchar4 sm;
-        sm.x = block;
-        sm.y = indexhi;
-        sm.z = indexlo;
-        sm.w = offset;
-
-        *sm_mapping = sm;
-}
-
-__global__ void histo_intermediates_kernel (
-        uint2 *input,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        uchar4 *sm_mappings)
-{
-        unsigned int line = UNROLL * blockIdx.x;// 16 is the unroll factor;
-
-        uint2 *load_bin = input + line * input_pitch + threadIdx.x;
-
-        unsigned int store = line * width + threadIdx.x;
-        bool skip = (width % 2) && (threadIdx.x == (blockDim.x - 1));
-
-        #pragma unroll
-        for (int i = 0; i < UNROLL; i++)
-        {
-                uint2 bin_value = *load_bin;
-
-                calculateBin (
-                        bin_value.x,
-                        &sm_mappings[store]
-                );
-
-                if (!skip) calculateBin (
-                        bin_value.y,
-                        &sm_mappings[store + blockDim.x]
-                );
-
-                load_bin += input_pitch;
-                store += width;
-        }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_main.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_main.cu
deleted file mode 100644
index 64e70fdc0a..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_main.cu
+++ /dev/null
@@ -1,184 +0,0 @@
-#include <stdio.h>
-#include <cuda.h>
-
-#include "util.h"
-
-__device__ void testIncrementGlobal (
-        unsigned int *global_histo,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are outside the central region of histogram */
-        if (range < sm_range_min || range > sm_range_max)
-        {
-                const unsigned int bin = range * BINS_PER_BLOCK + offset / 8 + (indexlo << 2) + (indexhi << 10);
-                const unsigned int bin_div2 = bin / 2;
-                const unsigned int bin_offset = (bin % 2 == 1) ? 16 : 0;
-
-                unsigned int old_val = global_histo[bin_div2];
-                unsigned short old_bin = (old_val >> bin_offset) & 0xFFFF;
-
-                if (old_bin < 255)
-                {
-                        atomicAdd (&global_histo[bin_div2], 1 << bin_offset);
-                }
-        }
-}
-
-__device__ void testIncrementLocal (
-        unsigned int *global_overflow,
-        unsigned int smem[KB][256],
-        const unsigned int myRange,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are inside the central region of histogram */
-        if (range == myRange)
-        {
-                /* Atomically increment shared memory */
-                unsigned int add = (unsigned int)(1 << offset);
-                unsigned int prev = atomicAdd (&smem[indexhi][indexlo], add);
-
-                /* Check if current bin overflowed */
-                unsigned int prev_bin_val = (prev >> offset) & 0x000000FF;
-
-                /* If there was an overflow, record it and record if it cascaded into other bins */
-                if (prev_bin_val == 0x000000FF)
-                {
-                        const unsigned int bin =
-                                range * BINS_PER_BLOCK +
-                                offset / 8 + (indexlo << 2) + (indexhi << 10);
-
-                        bool can_overflow_to_bin_plus_1 = (offset < 24) ? true : false;
-                        bool can_overflow_to_bin_plus_2 = (offset < 16) ? true : false;
-                        bool can_overflow_to_bin_plus_3 = (offset <  8) ? true : false;
-
-                        bool overflow_into_bin_plus_1 = false;
-                        bool overflow_into_bin_plus_2 = false;
-                        bool overflow_into_bin_plus_3 = false;
-
-                        unsigned int prev_bin_plus_1_val = (prev >> (offset +  8)) & 0x000000FF;
-                        unsigned int prev_bin_plus_2_val = (prev >> (offset + 16)) & 0x000000FF;
-                        unsigned int prev_bin_plus_3_val = (prev >> (offset + 24)) & 0x000000FF;
-
-                        if (can_overflow_to_bin_plus_1 &&        prev_bin_val == 0x000000FF) overflow_into_bin_plus_1 = true;
-                        if (can_overflow_to_bin_plus_2 && prev_bin_plus_1_val == 0x000000FF) overflow_into_bin_plus_2 = true;
-                        if (can_overflow_to_bin_plus_3 && prev_bin_plus_2_val == 0x000000FF) overflow_into_bin_plus_3 = true;
-
-                        unsigned int bin_plus_1_add;
-                        unsigned int bin_plus_2_add;
-                        unsigned int bin_plus_3_add;
-
-                        if (overflow_into_bin_plus_1) bin_plus_1_add = (prev_bin_plus_1_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_2) bin_plus_2_add = (prev_bin_plus_2_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_3) bin_plus_3_add = (prev_bin_plus_3_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-
-                                                      atomicAdd (&global_overflow[bin],   256);
-                        if (overflow_into_bin_plus_1) atomicAdd (&global_overflow[bin+1], bin_plus_1_add);
-                        if (overflow_into_bin_plus_2) atomicAdd (&global_overflow[bin+2], bin_plus_2_add);
-                        if (overflow_into_bin_plus_3) atomicAdd (&global_overflow[bin+3], bin_plus_3_add);
-                }
-        }
-}
-
-__device__ void clearMemory (unsigned int smem[KB][256])
-{
-        for (int i = threadIdx.x; i < BINS_PER_BLOCK / 4; i += blockDim.x)
-        {
-                ((unsigned int*)smem)[i] = 0;
-        }
-}
-
-__device__ void copyMemory (unsigned int *dst, unsigned int src[KB][256])
-{
-        for (int i = threadIdx.x; i < BINS_PER_BLOCK/4; i += blockDim.x)
-        {
-                atomicAdd(dst+i*4, (((unsigned int*)src)[i] >> 0) & 0xFF);
-                atomicAdd(dst+i*4+1, (((unsigned int*)src)[i] >> 8) & 0xFF);
-                atomicAdd(dst+i*4+2, (((unsigned int*)src)[i] >> 16) & 0xFF);
-                atomicAdd(dst+i*4+3, (((unsigned int*)src)[i] >> 24) & 0xFF);
-        }
-}
-
-__global__ void histo_main_kernel (
-        uchar4 *sm_mappings,
-        unsigned int num_elements,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        unsigned int *global_subhisto,
-        unsigned int *global_histo,
-        unsigned int *global_overflow)
-{
-        /* Most optimal solution uses 24 * 1024 bins per threadblock */
-        __shared__ unsigned int sub_histo[KB][256];
-
-        /* Each threadblock contributes to a specific 24KB range of histogram,
-         * and also scans every N-th line for interesting data.  N = gridDim.x
-         */
-        unsigned int local_scan_range = sm_range_min + blockIdx.y;
-        unsigned int local_scan_load = blockIdx.x * blockDim.x + threadIdx.x;
-
-        clearMemory (sub_histo);
-        __syncthreads();
-
-        if (blockIdx.y == 0)
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDim.x * gridDim.x;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                        testIncrementGlobal (
-                                global_histo,
-                                sm_range_min,
-                                sm_range_max,
-                                sm
-                        );
-                }
-        }
-        else
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDim.x * gridDim.x;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                }
-        }
-
-        /* Store sub histogram to global memory */
-        unsigned int store_index = (local_scan_range * BINS_PER_BLOCK);
-
-        __syncthreads();
-        copyMemory (&(global_subhisto[store_index]), sub_histo);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_prescan.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_prescan.cu
deleted file mode 100644
index 49e30a9721..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/histo_prescan.cu
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <cuda.h>
-
-#include "util.h"
-
-__global__ void histo_prescan_kernel (unsigned int* input, int size, unsigned int* minmax)
-{
-    __shared__ float Avg[PRESCAN_THREADS];
-    __shared__ float StdDev[PRESCAN_THREADS];
-
-    int stride = size/gridDim.x;
-    int addr = blockIdx.x*stride+threadIdx.x;
-    int end = blockIdx.x*stride + stride/8; // Only sample 1/8th of the input data
-
-    // Compute the average per thread
-    float avg = 0.0;
-    unsigned int count = 0;
-    while (addr < end){
-        avg += input[addr];
-        count++;
-	addr += blockDim.x;
-    }
-    avg /= count;
-    Avg[threadIdx.x] = avg;
-
-    // Compute the standard deviation per thread
-    int addr2 = blockIdx.x*stride+threadIdx.x;
-    float stddev = 0;
-    while (addr2 < end){
-        stddev += (input[addr2]-avg)*(input[addr2]-avg);
-        addr2 += blockDim.x;
-    }
-    stddev /= count;
-    StdDev[threadIdx.x] = sqrtf(stddev);
-
-#define SUM(stride__)\
-if(threadIdx.x < stride__){\
-    Avg[threadIdx.x] += Avg[threadIdx.x+stride__];\
-    StdDev[threadIdx.x] += StdDev[threadIdx.x+stride__];\
-}
-
-    // Add all the averages and standard deviations from all the threads
-    // and take their arithmetic average (as a simplified approximation of the
-    // real average and standard deviation.
-#if (PRESCAN_THREADS >= 32)    
-    for (int stride = PRESCAN_THREADS/2; stride >= 32; stride = stride >> 1){
-	__syncthreads();
-	SUM(stride);
-    }
-#endif
-#if (PRESCAN_THREADS >= 16)
-    SUM(16);
-#endif
-#if (PRESCAN_THREADS >= 8)
-    SUM(8);
-#endif
-#if (PRESCAN_THREADS >= 4)
-    SUM(4);
-#endif
-#if (PRESCAN_THREADS >= 2)
-    SUM(2);
-#endif
-
-    if (threadIdx.x == 0){
-        float avg = Avg[0]+Avg[1];
-	avg /= PRESCAN_THREADS;
-	float stddev = StdDev[0]+StdDev[1];
-	stddev /= PRESCAN_THREADS;
-
-        // Take the maximum and minimum range from all the blocks. This will
-        // be the final answer. The standard deviation is taken out to 10 sigma
-        // away from the average. The value 10 was obtained empirically.
-	    atomicMin(minmax,((unsigned int)(avg-10*stddev))/(KB*1024));
-        atomicMax(minmax+1,((unsigned int)(avg+10*stddev))/(KB*1024));
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/main.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda_base/main.cu
deleted file mode 100644
index cdd530b91f..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/main.cu
+++ /dev/null
@@ -1,260 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <cuda.h>
-
-#include "util.h"
-
-__global__ void histo_prescan_kernel (
-        unsigned int* input,
-        int size,
-        unsigned int* minmax);
-
-__global__ void histo_main_kernel (
-        uchar4 *sm_mappings,
-        unsigned int num_elements,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        unsigned int *global_subhisto,
-        unsigned int *global_histo,
-        unsigned int *global_overflow);
-
-__global__ void histo_intermediates_kernel (
-        uint2 *input,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        uchar4 *sm_mappings);
-
-__global__ void histo_final_kernel (
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        unsigned int *global_subhisto,
-        unsigned int *global_histo,
-        unsigned int *global_overflow,
-        unsigned int *final_histo);
-
-/******************************************************************************
-* Implementation: GPU
-* Details:
-* in the GPU implementation of histogram, we begin by computing the span of the
-* input values into the histogram. Then the histogramming computation is carried
-* out by a (BLOCK_X, BLOCK_Y) sized grid, where every group of Y (same X)
-* computes its own partial histogram for a part of the input, and every Y in the
-* group exclusively writes to a portion of the span computed in the beginning.
-* Finally, a reduction is performed to combine all the partial histograms into
-* the final result.
-******************************************************************************/
-
-int main(int argc, char* argv[]) {
-  struct pb_TimerSet *timersPtr;
-  struct pb_Parameters *parameters;
-
-  parameters = pb_ReadParameters(&argc, argv);
-  if (!parameters)
-    return -1;
-
-  if(!parameters->inpFiles[0]){
-    fputs("Input file expected\n", stderr);
-    return -1;
-  }
-
-  timersPtr = (struct pb_TimerSet *) malloc (sizeof(struct pb_TimerSet));
-  
-  
-  //appendDefaultTimerSet(NULL);
-  
-  
-  if (timersPtr == NULL) {
-    fprintf(stderr, "Could not append default timer set!\n");
-    exit(1);
-  }
-  
-  struct pb_TimerSet timers = *timersPtr;
-  
-//  pb_CreateTimer(&timers, "myTimer!", 0);
-  
-  
-  pb_InitializeTimerSet(&timers);
-  
-  pb_AddSubTimer(&timers, "Input", pb_TimerID_IO);
-  pb_AddSubTimer(&timers, "Output", pb_TimerID_IO);
-  
-  char *prescans = "PreScanKernel";
-  char *postpremems = "PostPreMems";
-  char *intermediates = "IntermediatesKernel";
-  char *mains = "MainKernel";
-  char *finals = "FinalKernel";
-  
-  pb_AddSubTimer(&timers, prescans, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, postpremems, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, mains, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);
-  
-//  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  pb_SwitchToSubTimer(&timers, "Input", pb_TimerID_IO);
-
-  int numIterations;
-  if (argc >= 2){
-    numIterations = atoi(argv[1]);
-  } else {
-    fputs("Expected at least one command line argument\n", stderr);
-    return -1;
-  }
-
-  unsigned int img_width, img_height;
-  unsigned int histo_width, histo_height;
-
-  FILE* f = fopen(parameters->inpFiles[0],"rb");
-  int result = 0;
-
-  result += fread(&img_width,    sizeof(unsigned int), 1, f);
-  result += fread(&img_height,   sizeof(unsigned int), 1, f);
-  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-  result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-  if (result != 4){
-    fputs("Error reading input and output dimensions from file\n", stderr);
-    return -1;
-  }
-
-  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-
-  result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-  fclose(f);
-
-  if (result != img_width*img_height){
-    fputs("Error reading input array from file\n", stderr);
-    return -1;
-  }
-
-  int even_width = ((img_width+1)/2)*2;
-  unsigned int* input;
-  unsigned int* ranges;
-  uchar4* sm_mappings;
-  unsigned int* global_subhisto;
-  unsigned short* global_histo;
-  unsigned int* global_overflow;
-  unsigned char* final_histo;
-
-  cudaMalloc((void**)&input           , even_width*(((img_height+UNROLL-1)/UNROLL)*UNROLL)*sizeof(unsigned int));
-  cudaMalloc((void**)&ranges          , 2*sizeof(unsigned int));
-  cudaMalloc((void**)&sm_mappings     , img_width*img_height*sizeof(uchar4));
-  cudaMalloc((void**)&global_subhisto , img_width*histo_height*sizeof(unsigned int));
-  cudaMalloc((void**)&global_histo    , img_width*histo_height*sizeof(unsigned short));
-  cudaMalloc((void**)&global_overflow , img_width*histo_height*sizeof(unsigned int));
-  cudaMalloc((void**)&final_histo     , img_width*histo_height*sizeof(unsigned char));
-
-  cudaMemset(final_histo           ,0 , img_width*histo_height*sizeof(unsigned char));
-
-  for (int y=0; y < img_height; y++){
-    cudaMemcpy(&(((unsigned int*)input)[y*even_width]),&img[y*img_width],img_width*sizeof(unsigned int), cudaMemcpyHostToDevice);
-  }
-  
-  //pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-  pb_SwitchToSubTimer(&timers, NULL, pb_TimerID_KERNEL);
-  
-  
-  unsigned int *zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int));
-  
-
-  for (int iter = 0; iter < numIterations; iter++) {
-    unsigned int ranges_h[2] = {UINT32_MAX, 0};
-
-    cudaMemcpy(ranges,ranges_h, 2*sizeof(unsigned int), cudaMemcpyHostToDevice);
-
-
-    pb_SwitchToSubTimer(&timers, prescans , pb_TimerID_KERNEL);
-
-    histo_prescan_kernel<<<dim3(PRESCAN_BLOCKS_X),dim3(PRESCAN_THREADS)>>>((unsigned int*)input, img_height*img_width, ranges);
-
-    pb_SwitchToSubTimer(&timers, postpremems , pb_TimerID_KERNEL);
-
-    cudaMemcpy(ranges_h,ranges, 2*sizeof(unsigned int), cudaMemcpyDeviceToHost);
-
-    cudaMemcpy(global_subhisto,zeroData, img_width*histo_height*sizeof(unsigned int), cudaMemcpyHostToDevice);
-    //    cudaMemset(global_subhisto,0,img_width*histo_height*sizeof(unsigned int));
-
-    pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
-
-    histo_intermediates_kernel<<<dim3((img_height + UNROLL-1)/UNROLL), dim3((img_width+1)/2)>>>(
-                (uint2*)(input),
-                (unsigned int)img_height,
-                (unsigned int)img_width,
-                (img_width+1)/2,
-                (uchar4*)(sm_mappings)
-    );
-    
-    pb_SwitchToSubTimer(&timers, mains, pb_TimerID_KERNEL);    
-    
-    histo_main_kernel<<<dim3(BLOCK_X, ranges_h[1]-ranges_h[0]+1), dim3(THREADS)>>>(
-                (uchar4*)(sm_mappings),
-                img_height*img_width,
-                ranges_h[0], ranges_h[1],
-                histo_height, histo_width,
-                (unsigned int*)(global_subhisto),
-                (unsigned int*)(global_histo),
-                (unsigned int*)(global_overflow)
-    );
-    
-    pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);    
-
-    histo_final_kernel<<<dim3(BLOCK_X*3), dim3(512)>>>(
-                ranges_h[0], ranges_h[1],
-                histo_height, histo_width,
-                (unsigned int*)(global_subhisto),
-                (unsigned int*)(global_histo),
-                (unsigned int*)(global_overflow),
-                (unsigned int*)(final_histo)
-    );
-  }
-
-  pb_SwitchToSubTimer(&timers, "Output", pb_TimerID_IO);
-  //  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-
-  cudaMemcpy(histo,final_histo, histo_height*histo_width*sizeof(unsigned char), cudaMemcpyDeviceToHost);
-
-  cudaFree(input);
-  cudaFree(ranges);
-  cudaFree(sm_mappings);
-  cudaFree(global_subhisto);
-  cudaFree(global_histo);
-  cudaFree(global_overflow);
-  cudaFree(final_histo);
-
-  if (parameters->outFile) {
-    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
-  }
-
-  //pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  pb_SwitchToSubTimer(&timers, NULL, pb_TimerID_COMPUTE);
-
-  free(img);
-  free(histo);
-
-  pb_SwitchToSubTimer(&timers, NULL, pb_TimerID_NONE);
-  
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(parameters);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/util.cu b/hpvm/test/parboil/benchmarks/histo/src/cuda_base/util.cu
deleted file mode 100644
index 602fc75a4c..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/util.cu
+++ /dev/null
@@ -1,82 +0,0 @@
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#include "util.h"
-#include "bmp.h"
-
-// This function takes an HSV value and converts it to BMP.
-// We use this function to generate colored images with
-// Smooth spectrum traversal for the input and output images.
-RGB HSVtoRGB( float h, float s, float v )
-{
-    int i;
-    float f, p, q, t;
-    float r, g, b;
-    RGB value={0,0,0};
-
-    if( s == 0 ) {
-        r = g = b = v;
-        return value;
-    }
-    h /= 60;
-    i = floor( h );
-    f = h - i;
-    p = v * ( 1 - s );
-    q = v * ( 1 - s * f );
-    t = v * ( 1 - s * ( 1 - f ) );
-    switch( i ) {
-        case 0:
-            r = v; g = t; b = p;
-            break;
-        case 1:
-            r = q; g = v; b = p;
-            break;
-        case 2:
-            r = p; g = v; b = t;
-            break;
-        case 3:
-            r = p; g = q; b = v;
-            break;
-        case 4:
-            r = t; g = p; b = v;
-            break;
-        default:
-            r = v; g = p; b = q;
-            break;
-    }
-
-    unsigned int temp = r*255;
-    value.R = temp;
-    temp = g*255;
-    value.G = temp;
-    temp = b*255;
-    value.B = temp;
-
-    return value;
-}
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename)
-{
-    RGB* pixel_map = (RGB*) malloc (height*width*sizeof(RGB));
-
-    for (size_t y = 0; y < height; ++y)
-    {
-        for (size_t x = 0; x < width; ++x)
-        {
-            unsigned char value = histo[y * width + x];
-
-            if (value == 0){
-                pixel_map[y*width+x].R = 0;
-                pixel_map[y*width+x].G = 0;
-                pixel_map[y*width+x].B = 0;
-            } else {
-                pixel_map[y*width+x] = HSVtoRGB(0.0,1.0,cbrt(1+ 63.0*((float)value)/((float)UINT8_MAX))/4);
-            }
-        }
-    }
-    create_bmp(pixel_map, height, width, filename);
-    free(pixel_map);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/util.h b/hpvm/test/parboil/benchmarks/histo/src/cuda_base/util.h
deleted file mode 100644
index f46227d789..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/cuda_base/util.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#define KB                      8
-#define BINS_PER_BLOCK          (KB * 1024)
-#define BLOCK_X         14
-
-#define PRESCAN_THREADS     512
-#define PRESCAN_BLOCKS_X    64
-
-#if KB == 24
-        #define THREADS         768
-#elif KB == 48
-        #define THREADS         1024
-#else //KB == 16 or other
-        #define THREADS         512
-#endif
-
-#define UNROLL 16
-#define UINT8_MAX 255
-#define UINT32_MAX 4294967295
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename);
diff --git a/hpvm/test/parboil/benchmarks/histo/src/omp_base/Makefile b/hpvm/test/parboil/benchmarks/histo/src/omp_base/Makefile
deleted file mode 100644
index 627e89ecda..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/omp_base/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-LANGUAGE=c
-SRCDIR_OBJS=main.o util.o
-APP_CFLAGS=-fopenmp
-APP_LDFLAGS=-lgomp
diff --git a/hpvm/test/parboil/benchmarks/histo/src/omp_base/bmp.h b/hpvm/test/parboil/benchmarks/histo/src/omp_base/bmp.h
deleted file mode 100644
index d1b7c1b562..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/omp_base/bmp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "stdlib.h"
-
-typedef struct{
-  unsigned char B;
-  unsigned char G;
-  unsigned char R;
-} RGB;
-
-typedef struct {
-  unsigned int filesz;
-  unsigned short creator1;
-  unsigned short creator2;
-  unsigned int bmp_offset;
-} bmpfile_header_t;
-
-typedef struct {
-  unsigned int header_sz;
-  unsigned int width;
-  unsigned int height;
-  unsigned short nplanes;
-  unsigned short bitspp;
-  unsigned int compress_type;
-  unsigned int bmp_bytesz;
-  unsigned int hres;
-  unsigned int vres;
-  unsigned int ncolors;
-  unsigned int nimpcolors;
-} bmp_dib_header_t;
-
-typedef enum {
-  BI_RGB = 0,
-  BI_RLE8,
-  BI_RLE4,
-  BI_BITFIELDS,
-  BI_JPEG,
-  BI_PNG,
-} bmp_compression_method_t;
-
-typedef struct{
-  unsigned char magic[2];
-  bmpfile_header_t file_header;
-  bmp_dib_header_t dib_header;
-  unsigned int* palette;
-  void* pixel_map;
-} bmp_image;
-
-void create_bmp(RGB* bitmap, int height, int width, const char* filename){
-    bmp_image image;
-
-    int padded_width = 4*(((width*24)+31)/32);
-    padded_width -= width*sizeof(RGB);
-
-    char* pad = (char*) calloc (padded_width, sizeof(char));
-
-    image.magic[0]='B';
-    image.magic[1]='M';
-
-    image.file_header.filesz = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t) + height*width*sizeof(RGB);
-    image.file_header.creator1 = image.file_header.creator2 = 0;
-    image.file_header.bmp_offset = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t);
-
-    image.dib_header.header_sz = 40;//sizeof(bmp_dib_header_t);
-    image.dib_header.width = width;
-    image.dib_header.height = height;
-    image.dib_header.nplanes = 1;
-    image.dib_header.bitspp = 24;
-    image.dib_header.compress_type = 0;
-    image.dib_header.bmp_bytesz = width*height*sizeof(RGB);
-    image.dib_header.hres = 0;
-    image.dib_header.vres = 0;
-    image.dib_header.ncolors = 0;
-    image.dib_header.nimpcolors = 0;
-
-    FILE* out_file = fopen(filename,"wb");
-
-    fwrite(image.magic,sizeof(char),2,out_file);
-    fwrite(&(image.file_header),sizeof(char),sizeof(bmpfile_header_t),out_file);
-    fwrite(&(image.dib_header),sizeof(char),sizeof(bmp_dib_header_t),out_file);
-
-    int h;
-    for (h = height-1; h >= 0; h--){
-      fwrite(&bitmap[h*width],sizeof(RGB),width,out_file);
-      fwrite(pad,sizeof(char),padded_width,out_file);
-    }
-
-    fclose(out_file);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/histo/src/omp_base/main.c
deleted file mode 100644
index 3d67f4a323..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/omp_base/main.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "util.h"
-
-#define UINT8_MAX 255
-
-/******************************************************************************
-* Implementation: Reference
-* Details:
-* This implementations is a scalar, minimally optimized version. The only 
-* optimization, which reduces the number of pointer chasing operations is the 
-* use of a temporary pointer for each row.
-******************************************************************************/
-
-int main(int argc, char* argv[]) {
-  struct pb_TimerSet timers;
-  struct pb_Parameters *parameters;
-
-  printf("Base implementation of histogramming.\n");
-  printf("Maintained by Nady Obeid <obeid1@ece.uiuc.edu>\n");
-
-  parameters = pb_ReadParameters(&argc, argv);
-  if (!parameters)
-    return -1;
-
-  if(!parameters->inpFiles[0]){
-    fputs("Input file expected\n", stderr);
-    return -1;
-  }
-
-  int numIterations;
-  if (argc >= 2){
-    numIterations = atoi(argv[1]);
-  } else {
-    fputs("Expected at least one command line argument\n", stderr);
-    return -1;
-  }
-
-  pb_InitializeTimerSet(&timers);
-  
-  char *inputStr = "Input";
-  char *outputStr = "Output";
-  
-  pb_AddSubTimer(&timers, inputStr, pb_TimerID_IO);
-  pb_AddSubTimer(&timers, outputStr, pb_TimerID_IO);
-  
-  pb_SwitchToSubTimer(&timers, inputStr, pb_TimerID_IO);  
-
-  unsigned int img_width, img_height;
-  unsigned int histo_width, histo_height;
-
-  FILE* f = fopen(parameters->inpFiles[0],"rb");
-  int result = 0;
-
-  result += fread(&img_width,    sizeof(unsigned int), 1, f);
-  result += fread(&img_height,   sizeof(unsigned int), 1, f);
-  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-  result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-  if (result != 4){
-    fputs("Error reading input and output dimensions from file\n", stderr);
-    return -1;
-  }
-
-  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-  
-  pb_SwitchToSubTimer(&timers, "Input", pb_TimerID_IO);
-
-  result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-  fclose(f);
-
-  if (result != img_width*img_height){
-    fputs("Error reading input array from file\n", stderr);
-    return -1;
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  int iter;
-  for (iter = 0; iter < numIterations; iter++){
-    memset(histo,0,histo_height*histo_width*sizeof(unsigned char));
-    unsigned int i;
-
-#pragma omp parallel for 
-    for (i = 0; i < img_width*img_height; ++i) {
-      const unsigned int value = img[i];
-
-#pragma omp critical
-      if (histo[value] < UINT8_MAX) {
-        ++histo[value];
-      }
-    }
-  }
-
-//  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  pb_SwitchToSubTimer(&timers, outputStr, pb_TimerID_IO);
-
-  if (parameters->outFile) {
-    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  free(img);
-  free(histo);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(parameters);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/omp_base/util.c b/hpvm/test/parboil/benchmarks/histo/src/omp_base/util.c
deleted file mode 100644
index 400f1c4a29..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/omp_base/util.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#include "bmp.h"
-
-// This function takes an HSV value and converts it to BMP.
-// We use this function to generate colored images with
-// Smooth spectrum traversal for the input and output images.
-RGB HSVtoRGB( float h, float s, float v )
-{
-    int i;
-    float f, p, q, t;
-    float r, g, b;
-    RGB value={0,0,0};
-
-    if( s == 0 ) {
-        r = g = b = v;
-        return value;
-    }
-    h /= 60;
-    i = floor( h );
-    f = h - i;
-    p = v * ( 1 - s );
-    q = v * ( 1 - s * f );
-    t = v * ( 1 - s * ( 1 - f ) );
-    switch( i ) {
-        case 0:
-            r = v; g = t; b = p;
-            break;
-        case 1:
-            r = q; g = v; b = p;
-            break;
-        case 2:
-            r = p; g = v; b = t;
-            break;
-        case 3:
-            r = p; g = q; b = v;
-            break;
-        case 4:
-            r = t; g = p; b = v;
-            break;
-        default:
-            r = v; g = p; b = q;
-            break;
-    }
-
-    unsigned int temp = r*255;
-    value.R = temp;
-    temp = g*255;
-    value.G = temp;
-    temp = b*255;
-    value.B = temp;
-
-    return value;
-}
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename)
-{
-    RGB* pixel_map = (RGB*) malloc (height*width*sizeof(RGB));
-
-    size_t y, x;
-    for (y = 0; y < height; ++y)
-    {
-        for (x = 0; x < width; ++x)
-        {
-            unsigned char value = histo[y * width + x];
-
-            if (value == 0){
-                pixel_map[y*width+x].R = 0;
-                pixel_map[y*width+x].G = 0;
-                pixel_map[y*width+x].B = 0;
-            } else {
-                pixel_map[y*width+x] = HSVtoRGB(0.0,1.0,cbrt(1+ 63.0*((float)value)/((float)UINT8_MAX))/4);
-            }
-        }
-    }
-    create_bmp(pixel_map, height, width, filename);
-    free(pixel_map);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/omp_base/util.h b/hpvm/test/parboil/benchmarks/histo/src/omp_base/util.h
deleted file mode 100644
index 9827acb018..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/omp_base/util.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename);
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/Makefile b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/Makefile
deleted file mode 100644
index 4295907c0d..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=util.o main.o OpenCL_common.o
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/OpenCL_common.cpp
deleted file mode 100644
index 9bb2c1b5b2..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/OpenCL_common.cpp
+++ /dev/null
@@ -1,299 +0,0 @@
-
-
-#include "OpenCL_common.h"
-#include <string.h>
-
-// -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
-  cl_uint numEntries = 16;
-  cl_platform_id clPlatforms[numEntries];
-  cl_uint numPlatforms;
-  
-  cl_device_id clDevices[numEntries];
-  cl_uint numDevices;
-
-  OCL_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
-  //fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
-  bool needDevice = true;
-  
-  for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
-
-    cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    //fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
-      cl_device_id clDevice = clDevices[id];
-      cl_device_type clDeviceType;
-
-      bool canSatisfy = true;
-      
-      //char devName[128];
-      //OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_NAME, sizeof(char)*128, devName, NULL));
-      //fprintf(stderr, "Device #%d Name: %s\n", id, devName);
-      
-      if (reqDeviceType != NULL) {
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
-        if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
-          if (*reqDeviceType != clDeviceType) {
-            canSatisfy = false;
-          }
-        }
-      }
-
-      va_list paramList;
-      va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
-        cl_bool clInfoBool;
-        size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
-        if (clInfoBool != true) {
-          canSatisfy = false;
-        }
-      }
-      
-      va_end(paramList);
-      if (canSatisfy) {
-        *device = clDevice;
-        *platform = clPlatform;
-        needDevice = false;
-        if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
-          *reqDeviceType = clDeviceType;
-        }
-      }
-    } // End checking all devices for a platform
-  } // End checking all platforms
-
-  int retVal = -1;
-  if (needDevice) {
-    retVal = -1;
-  } else {
-    retVal = 0;
-  }
-  
-  return retVal;
-
-}
-
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	cl_uint maxMemAlloc = 0;
-	
-	OCL_ERRCK_RETVAL ( clGetDeviceInfo(	device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-
-	
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/OpenCL_common.h b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/OpenCL_common.h
deleted file mode 100644
index c51800532d..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/OpenCL_common.h
+++ /dev/null
@@ -1,22 +0,0 @@
-
-#ifndef __OPENCL_COMMON_H_
-#define __OPENCL_COMMON_H_
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <CL/cl.h>
-
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/bmp.h b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/bmp.h
deleted file mode 100644
index d1b7c1b562..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/bmp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "stdlib.h"
-
-typedef struct{
-  unsigned char B;
-  unsigned char G;
-  unsigned char R;
-} RGB;
-
-typedef struct {
-  unsigned int filesz;
-  unsigned short creator1;
-  unsigned short creator2;
-  unsigned int bmp_offset;
-} bmpfile_header_t;
-
-typedef struct {
-  unsigned int header_sz;
-  unsigned int width;
-  unsigned int height;
-  unsigned short nplanes;
-  unsigned short bitspp;
-  unsigned int compress_type;
-  unsigned int bmp_bytesz;
-  unsigned int hres;
-  unsigned int vres;
-  unsigned int ncolors;
-  unsigned int nimpcolors;
-} bmp_dib_header_t;
-
-typedef enum {
-  BI_RGB = 0,
-  BI_RLE8,
-  BI_RLE4,
-  BI_BITFIELDS,
-  BI_JPEG,
-  BI_PNG,
-} bmp_compression_method_t;
-
-typedef struct{
-  unsigned char magic[2];
-  bmpfile_header_t file_header;
-  bmp_dib_header_t dib_header;
-  unsigned int* palette;
-  void* pixel_map;
-} bmp_image;
-
-void create_bmp(RGB* bitmap, int height, int width, const char* filename){
-    bmp_image image;
-
-    int padded_width = 4*(((width*24)+31)/32);
-    padded_width -= width*sizeof(RGB);
-
-    char* pad = (char*) calloc (padded_width, sizeof(char));
-
-    image.magic[0]='B';
-    image.magic[1]='M';
-
-    image.file_header.filesz = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t) + height*width*sizeof(RGB);
-    image.file_header.creator1 = image.file_header.creator2 = 0;
-    image.file_header.bmp_offset = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t);
-
-    image.dib_header.header_sz = 40;//sizeof(bmp_dib_header_t);
-    image.dib_header.width = width;
-    image.dib_header.height = height;
-    image.dib_header.nplanes = 1;
-    image.dib_header.bitspp = 24;
-    image.dib_header.compress_type = 0;
-    image.dib_header.bmp_bytesz = width*height*sizeof(RGB);
-    image.dib_header.hres = 0;
-    image.dib_header.vres = 0;
-    image.dib_header.ncolors = 0;
-    image.dib_header.nimpcolors = 0;
-
-    FILE* out_file = fopen(filename,"wb");
-
-    fwrite(image.magic,sizeof(char),2,out_file);
-    fwrite(&(image.file_header),sizeof(char),sizeof(bmpfile_header_t),out_file);
-    fwrite(&(image.dib_header),sizeof(char),sizeof(bmp_dib_header_t),out_file);
-
-    int h;
-    for (h = height-1; h >= 0; h--){
-      fwrite(&bitmap[h*width],sizeof(RGB),width,out_file);
-      fwrite(pad,sizeof(char),padded_width,out_file);
-    }
-
-    fclose(out_file);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_final.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_final.cl
deleted file mode 100644
index b9aeba2f6d..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_final.cl
+++ /dev/null
@@ -1,103 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-/* Combine all the sub-histogram results into one final histogram */
-__kernel void histo_final_kernel (
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    __global unsigned int *global_subhisto,
-    __global unsigned int *global_histo,
-    __global unsigned int *global_overflow,
-    __global unsigned int *final_histo) //final output
-{
-    unsigned int blockDimx = get_local_size(0);
-    unsigned int gridDimx = get_num_groups(0);
-    unsigned int start_offset = get_local_id(0) + get_group_id(0) * blockDimx;
-    const ushort4 zero_short  = {0, 0, 0, 0};
-    const uint4 zero_int      = {0, 0, 0, 0};
-
-    unsigned int size_low_histo = sm_range_min * BINS_PER_BLOCK;
-    unsigned int size_mid_histo = (sm_range_max - sm_range_min +1) * BINS_PER_BLOCK;
-
-    /* Clear lower region of global histogram */
-    for (unsigned int i = start_offset; i < size_low_histo/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            (unsigned char) global_histo_data.x,
-            (unsigned char) global_histo_data.y,
-            (unsigned char) global_histo_data.z,
-            (unsigned char) global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the middle region of the overflow buffer */
-    for (unsigned int i = (size_low_histo/4) + start_offset; i < (size_low_histo+size_mid_histo)/4; i += gridDimx * blockDimx)
-    {
-        uint4 global_histo_data = ((__global uint4*)global_overflow)[i];
-        ((__global uint4*)global_overflow)[i] = zero_int;
-
-        uint4 internal_histo_data = (uint4)(
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        unsigned int bin4in0 = ((__global unsigned int*)global_subhisto)[i*4];
-        unsigned int bin4in1 = ((__global unsigned int*)global_subhisto)[i*4+1];
-        unsigned int bin4in2 = ((__global unsigned int*)global_subhisto)[i*4+2];
-        unsigned int bin4in3 = ((__global unsigned int*)global_subhisto)[i*4+3];
-
-        internal_histo_data.x = min (bin4in0, (unsigned int) 255);
-        internal_histo_data.y = min (bin4in1, (unsigned int) 255);
-        internal_histo_data.z = min (bin4in2, (unsigned int) 255);
-        internal_histo_data.w = min (bin4in3, (unsigned int) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            internal_histo_data.x,
-            internal_histo_data.y,
-            internal_histo_data.z,
-            internal_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the upper region of global histogram */
-    for (unsigned int i = ((size_low_histo+size_mid_histo)/4) + start_offset; i < (histo_height*histo_width)/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_intermediates.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_intermediates.cl
deleted file mode 100644
index fe2ed37e72..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_intermediates.cl
+++ /dev/null
@@ -1,189 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-__kernel void calculateBin (
-        __const unsigned int bin,
-        __global uchar4 *sm_mapping)
-{
-        unsigned char offset  =  bin        %   4;
-        unsigned char indexlo = (bin >>  2) % 256;
-        unsigned char indexhi = (bin >> 10) %  KB;
-        unsigned char block   =  bin / BINS_PER_BLOCK;
-
-        offset *= 8;
-
-        uchar4 sm;
-        sm.x = block;
-        sm.y = indexhi;
-        sm.z = indexlo;
-        sm.w = offset;
-
-        *sm_mapping = sm;
-}
-
-__kernel void histo_intermediates_kernel (
-        __global uint2 *input,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        __global uchar4 *sm_mappings)
-{
-        int threadIdxx = get_local_id(0);
-        int blockDimx = get_local_size(0);
-        unsigned int line = UNROLL * (get_group_id(0));// 16 is the unroll factor;
-
-        __global uint2 *load_bin = input + line * input_pitch + threadIdxx;
-
-        unsigned int store = line * width + threadIdxx;
-        bool skip = (width % 2) && (threadIdxx == (blockDimx - 1));
-
-        #pragma unroll
-        for (int i = 0; i < UNROLL; i++)
-        {
-                uint2 bin_value = *load_bin;
-
-                calculateBin (
-                        bin_value.x,
-                        &sm_mappings[store]
-                );
-
-                if (!skip) calculateBin (
-                        bin_value.y,
-                        &sm_mappings[store + blockDimx]
-                );
-
-                load_bin += input_pitch;
-                store += width;
-        }
-}
-
-__kernel void histo_intermediates_kernel_compat (
-        __global uint2 *input,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        __global uchar4 *sm_mappings)
-{
-        int threadIdxx = get_local_id(0);
-        int blockDimx = input_pitch; //get_local_size(0);
-        
-        int tid2 = get_local_id(0) + get_local_size(0);
-        
-        unsigned int line = UNROLL * (get_group_id(0));// 16 is the unroll factor;
-
-        __global uint2 *load_bin = input + line * input_pitch + threadIdxx;
-        __global uint2 *load_bin2 = input + line * input_pitch + tid2;
-
-        unsigned int store = line * width + threadIdxx;
-        unsigned int store2 = line * width + tid2;
-        
-        bool skip = (width % 2) && (threadIdxx == (input_pitch - 1));
-        bool skip2 = (width % 2) && (tid2 == (input_pitch - 1));
-
-        bool does2 = tid2 < input_pitch;
-
-        #pragma unroll
-        for (int i = 0; i < UNROLL; i++)
-        {
-                uint2 bin_value = *load_bin;
-
-
-                calculateBin (
-                        bin_value.x,
-                        &sm_mappings[store]
-                );
-
-                if (!skip) calculateBin (
-                        bin_value.y,
-                        &sm_mappings[store + blockDimx]
-                );
-
-                load_bin += input_pitch;
-                store += width;
-                
-                if (does2) {
-                  uint2 bin_val2 = *load_bin2;
-                                
-                  calculateBin (
-                        bin_val2.x,
-                        &sm_mappings[store2]
-                  );
-
-                  if (!skip) calculateBin (
-                        bin_val2.y,
-                        &sm_mappings[store2 + blockDimx]
-                  );
-
-                  load_bin2 += input_pitch;
-                  store2 += width;
-                }
-        }
-        
-        /*
-        if (does2) {
-          #pragma unroll
-          for (int i = 0; i < UNROLL; i++) {
-            uint2 bin_val2 = *load_bin2;
-                                
-            calculateBin (
-                bin_val2.x,
-                &sm_mappings[store2]
-            );
-
-            if (!skip) calculateBin (
-                           bin_val2.y,
-                           &sm_mappings[store2 + blockDimx]
-                        );
-
-            load_bin2 += input_pitch;
-            store2 += width;
-          }
-       }
-     */
-        
-}
-
-
-/*
-__kernel void histo_intermediates_kernel_variable (
-        __global uint2 *input,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch, // = half_width in orig
-        unsigned int stride,
-        __global uchar4 *sm_mappings)
-{
-        int threadIdxx = get_local_id(0);
-        int blockDimx = get_local_size(0);
-        unsigned int line = UNROLL * (get_group_id(0));// 16 is the unroll factor;
-
-        __global uint2 *load_bin = input + line * input_pitch + threadIdxx;
-
-        unsigned int store = line * width + threadIdxx;
-        bool skip = (width % 2) && (threadIdxx == (blockDimx - 1));
-
-        #pragma unroll
-        for (int i = 0; i < UNROLL; i++)
-        {
-                uint2 bin_value = *load_bin;
-
-                calculateBin (
-                        bin_value.x,
-                        &sm_mappings[store]
-                );
-
-                if (!skip) calculateBin (
-                        bin_value.y,
-                        &sm_mappings[store + blockDimx]
-                );
-
-                load_bin += input_pitch;
-                store += width;
-        }
-}
-*/
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_main.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_main.cl
deleted file mode 100644
index 5c1fe2d482..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_main.cl
+++ /dev/null
@@ -1,193 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-
-void testIncrementGlobal (
-        __global unsigned int *global_histo,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        // Scan for inputs that are outside the central region of histogram
-        if (range < sm_range_min || range > sm_range_max)
-        {
-                const unsigned int bin = range * BINS_PER_BLOCK + offset / 8 + (indexlo << 2) + (indexhi << 10);
-                const unsigned int bin_div2 = bin / 2;
-                const unsigned int bin_offset = (bin % 2 == 1) ? 16 : 0;
-
-                unsigned int old_val = global_histo[bin_div2];
-                unsigned short old_bin = (old_val >> bin_offset) & 0xFFFF;
-
-                if (old_bin < 255)
-                {
-                        atom_add (&global_histo[bin_div2], 1 << bin_offset);
-                }
-        }
-}
-
-void testIncrementLocal (
-        __global unsigned int *global_overflow,
-        __local unsigned int smem[KB][256],
-        const unsigned int myRange,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        // Scan for inputs that are inside the central region of histogram 
-        if (range == myRange)
-        {
-                // Atomically increment shared memory 
-                unsigned int add = (unsigned int)(1 << offset);
-                unsigned int prev = atom_add (&smem[indexhi][indexlo], add);
-
-                // Check if current bin overflowed 
-                unsigned int prev_bin_val = (prev >> offset) & 0x000000FF;
-
-                // If there was an overflow, record it and record if it cascaded into other bins
-                if (prev_bin_val == 0x000000FF)
-                {
-                        const unsigned int bin =
-                                range * BINS_PER_BLOCK +
-                                offset / 8 + (indexlo << 2) + (indexhi << 10);
-
-                        bool can_overflow_to_bin_plus_1 = (offset < 24) ? true : false;
-                        bool can_overflow_to_bin_plus_2 = (offset < 16) ? true : false;
-                        bool can_overflow_to_bin_plus_3 = (offset <  8) ? true : false;
-
-                        bool overflow_into_bin_plus_1 = false;
-                        bool overflow_into_bin_plus_2 = false;
-                        bool overflow_into_bin_plus_3 = false;
-
-                        unsigned int prev_bin_plus_1_val = (prev >> (offset +  8)) & 0x000000FF;
-                        unsigned int prev_bin_plus_2_val = (prev >> (offset + 16)) & 0x000000FF;
-                        unsigned int prev_bin_plus_3_val = (prev >> (offset + 24)) & 0x000000FF;
-
-                        if (can_overflow_to_bin_plus_1 &&        prev_bin_val == 0x000000FF) overflow_into_bin_plus_1 = true;
-                        if (can_overflow_to_bin_plus_2 && prev_bin_plus_1_val == 0x000000FF) overflow_into_bin_plus_2 = true;
-                        if (can_overflow_to_bin_plus_3 && prev_bin_plus_2_val == 0x000000FF) overflow_into_bin_plus_3 = true;
-
-                        unsigned int bin_plus_1_add;
-                        unsigned int bin_plus_2_add;
-                        unsigned int bin_plus_3_add;
-
-                        if (overflow_into_bin_plus_1) bin_plus_1_add = (prev_bin_plus_1_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_2) bin_plus_2_add = (prev_bin_plus_2_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_3) bin_plus_3_add = (prev_bin_plus_3_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-
-                                                      atom_add (&global_overflow[bin],  256);
-                        if (overflow_into_bin_plus_1) atom_add (&global_overflow[bin+1], bin_plus_1_add);
-                        if (overflow_into_bin_plus_2) atom_add (&global_overflow[bin+2], bin_plus_2_add);
-                        if (overflow_into_bin_plus_3) atom_add (&global_overflow[bin+3], bin_plus_3_add);
-                }
-        }
-}
-
-void clearMemory (__local unsigned int smem[KB][256])
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK / 4; i += blockDimx)
-        {
-                ((__local unsigned int*)smem)[i] = 0;
-        }
-}
-
-void copyMemory (__global unsigned int *dst, __local unsigned int src[KB][256])
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK/4; i += blockDimx)
-        {
-                atom_add(dst+i*4, (((__local unsigned int*)src)[i] >> 0) & 0xFF );
-                atom_add(dst+i*4+1, (((__local unsigned int*)src)[i] >> 8) & 0xFF);
-                atom_add(dst+i*4+2, (((__local unsigned int*)src)[i] >> 16) & 0xFF);
-                atom_add(dst+i*4+3, (((__local unsigned int*)src)[i] >> 24) & 0xFF);
-                //dst[i] = ((__local unsigned int*)src)[i];
-        }
-}
-
-__kernel void histo_main_kernel (
-        __global uchar4 *sm_mappings,
-        unsigned int num_elements,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        __global unsigned int *global_subhisto,
-        __global unsigned int *global_histo,
-        __global unsigned int *global_overflow)
-{
-        /* Most optimal solution uses 24 * 1024 bins per threadblock */
-        __local unsigned int sub_histo[KB][256];
-
-        /* Each threadblock contributes to a specific 24KB range of histogram,
-         * and also scans every N-th line for interesting data.  N = gridDim.x
-         */         
-        unsigned int blockDimx = get_local_size(0);
-        unsigned int gridDimx = get_num_groups(0);
-        unsigned int local_scan_range = sm_range_min + get_group_id(1);
-        unsigned int local_scan_load = get_group_id(0) * blockDimx + get_local_id(0);
-
-        clearMemory (sub_histo);
-        barrier(CLK_LOCAL_MEM_FENCE); //mem_fence(CLK_GLOBAL_MEM_FENCE);//        __syncthreads();
-
-        if (get_group_id(1) == 0 )
-        {
-                // Loop through and scan the input 
-                while (local_scan_load < num_elements)
-                {
-                        // Read buffer 
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        // Check input 
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                        testIncrementGlobal (
-                                global_histo,
-                                sm_range_min,
-                                sm_range_max,
-                                sm
-                        );
-                }
-        }
-        else
-        {
-                // Loop through and scan the input 
-                while (local_scan_load < num_elements)
-                {
-                        // Read buffer 
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        // Check input 
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                }
-        }
-
-        // Store sub histogram to global memory 
-        unsigned int store_index = (local_scan_range * BINS_PER_BLOCK);
-
-        barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads();
-        copyMemory (&(global_subhisto[store_index]), sub_histo);        
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_prescan.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_prescan.cl
deleted file mode 100644
index 29768ed8ea..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/histo_prescan.cl
+++ /dev/null
@@ -1,85 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
-
-__kernel void histo_prescan_kernel (__global unsigned int* input, int size, __global unsigned int* minmax)
-{
-
-    __local float Avg[PRESCAN_THREADS];
-    __local float StdDev[PRESCAN_THREADS];
-
-    int threadIdxx = get_local_id(0);
-    int blockDimx = get_local_size(0);
-    int blockIdxx = get_group_id(0);
-    int stride = size/(get_num_groups(0));
-    int addr = blockIdxx*stride+threadIdxx;
-    int end = blockIdxx*stride + stride/8; // Only sample 1/8th of the input data
-
-    // Compute the average per thread
-    float avg = 0.0;
-    unsigned int count = 0;
-    while (addr < end){
-      avg += input[addr];
-      count++;
-	  addr += blockDimx;
-    }
-    avg /= count;
-    Avg[threadIdxx] = avg;
-
-    // Compute the standard deviation per thread
-    int addr2 = blockIdxx*stride+threadIdxx;
-    float stddev = 0;
-    while (addr2 < end){
-        stddev += (input[addr2]-avg)*(input[addr2]-avg);
-        addr2 += blockDimx;
-    }
-    stddev /= count;
-    StdDev[threadIdxx] = sqrt(stddev);
-
-#define SUM(stride__)\
-if(threadIdxx < stride__){\
-    Avg[threadIdxx] += Avg[threadIdxx+stride__];\
-    StdDev[threadIdxx] += StdDev[threadIdxx+stride__];\
-}
-
-    // Add all the averages and standard deviations from all the threads
-    // and take their arithmetic average (as a simplified approximation of the
-    // real average and standard deviation.
-#if (PRESCAN_THREADS >= 32)    
-    for (int stride = PRESCAN_THREADS/2; stride >= 32; stride = stride >> 1){
-	barrier(CLK_LOCAL_MEM_FENCE);
-	SUM(stride);
-    }
-#endif
-#if (PRESCAN_THREADS >= 16)
-    SUM(16);
-#endif
-#if (PRESCAN_THREADS >= 8)
-    SUM(8);
-#endif
-#if (PRESCAN_THREADS >= 4)
-    SUM(4);
-#endif
-#if (PRESCAN_THREADS >= 2)
-    SUM(2);
-#endif
-
-    if (threadIdxx == 0){
-        float avg = Avg[0]+Avg[1];
-	avg /= PRESCAN_THREADS;
-	float stddev = StdDev[0]+StdDev[1];
-	stddev /= PRESCAN_THREADS;
-
-        // Take the maximum and minimum range from all the blocks. This will
-        // be the final answer. The standard deviation is taken out to 10 sigma
-        // away from the average. The value 10 was obtained empirically.
-	    atom_min(minmax,((unsigned int)(avg-10*stddev))/(KB*1024));
-        atom_max(minmax+1,((unsigned int)(avg+10*stddev))/(KB*1024));
-    }  
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/main.cpp b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/main.cpp
deleted file mode 100644
index e0a34adb03..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/main.cpp
+++ /dev/null
@@ -1,458 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <CL/cl.h>
-
-#include "util.h"    
-#include "OpenCL_common.h"
-
-#define DEFAULT_BLOCK_X         14
-#define DEFAULT_PRESCAN_THREADS 512
-#define DEFAULT_PRESCAN_BLOCKS_X    64
-#define DEFAULT_FINAL_THREADS 512
-#define UNROLL 16
-
-/******************************************************************************
-* Implementation: GPU
-* Details:
-* in the GPU implementation of histogram, we begin by computing the span of the
-* input values into the histogram. Then the histogramming computation is carried
-* out by a (BLOCK_X, BLOCK_Y) sized grid, where every group of Y (same X)
-* computes its own partial histogram for a part of the input, and every Y in the
-* group exclusively writes to a portion of the span computed in the beginning.
-* Finally, a reduction is performed to combine all the partial histograms into
-* the final result.
-******************************************************************************/
-
-int main(int argc, char* argv[]) {
-  struct pb_Parameters *parameters;
-
-  parameters = pb_ReadParameters(&argc, argv);
-  if (!parameters)
-    return -1;
-
-  if(!parameters->inpFiles[0]){
-    fputs("Input file expected\n", stderr);
-    return -1;
-  }
-
-  
-  struct pb_TimerSet timers;
-  
-  char oclOverhead[] = "OCL Overhead";
-  char prescans[] = "PreScanKernel";
-  char postpremems[] = "PostPreMems";
-  char intermediates[] = "IntermediatesKernel";
-  char mains[] = "MainKernel";
-  char finals[] = "FinalKernel";
-
-  pb_InitializeTimerSet(&timers);
-  
-  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, prescans, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, postpremems, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, mains, pb_TimerID_KERNEL);
-  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);
-    
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  
-  int numIterations;
-  if (argc >= 2){
-    numIterations = atoi(argv[1]);
-  } else {
-    fputs("Expected at least one command line argument\n", stderr);
-    return -1;
-  }
-
-  unsigned int img_width, img_height;
-  unsigned int histo_width, histo_height;
-  unsigned int lmemKB;
-  unsigned int prescanThreads = DEFAULT_PRESCAN_THREADS;
-  unsigned int prescanBlockX = DEFAULT_PRESCAN_BLOCKS_X;
-  unsigned int blockX = DEFAULT_BLOCK_X;
-  unsigned int nThreads;
-  unsigned int finalThreads = DEFAULT_FINAL_THREADS;
-  unsigned int bins_per_block;
-
-  FILE* f = fopen(parameters->inpFiles[0],"rb");
-  int result = 0;
-
-  result += fread(&img_width,    sizeof(unsigned int), 1, f);
-  result += fread(&img_height,   sizeof(unsigned int), 1, f);
-  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-  result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-  if (result != 4){
-    fputs("Error reading input and output dimensions from file\n", stderr);
-    return -1;
-  }
-
-  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-
-  result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-  fclose(f);
-
-  if (result != img_width*img_height){
-    fputs("Error reading input array from file\n", stderr);
-    return -1;
-  }
-
-  cl_int ciErrNum;
-  cl_platform_id clPlatform;
-  cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-  cl_device_id clDevice;
-  cl_context clContext;
-  cl_command_queue clCommandQueue;
-  
-  cl_program clProgram[4];
-  
-  cl_kernel histo_prescan_kernel;
-  cl_kernel histo_intermediates_kernel;
-  cl_kernel histo_intermediates_kernel_compat;
-  cl_kernel histo_main_kernel;
-  cl_kernel histo_final_kernel;
-  
-  cl_kernel chosenInterKernel;
-
-  int even_width = ((img_width+1)/2)*2;
-
-  cl_mem input;
-  cl_mem ranges;
-  cl_mem sm_mappings;
-  cl_mem global_subhisto;
-  cl_mem global_histo;
-  cl_mem global_overflow;
-  cl_mem final_histo;
-  
-  int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0);
-  if (deviceFound < 0) {
-    fprintf(stderr, "No suitable device was found\n");
-    exit(1);
-  }
-  
-  cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0};
-  clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-
-  long unsigned int lmemSize = 0;
-  OCL_ERRCK_RETVAL ( clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &lmemSize, NULL) );
-  
-  cl_uint workItemDimensions;
-  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
-  
-  size_t workItemSizes[workItemDimensions];
-  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
-  
-  //printf("LMEM Size: %lu\n", lmemSize);
-  
-  // lmemKB = lmemSize / 1024; // Should be valid, but not taken into consideration for initial programming
-  
-  if (lmemSize >= 48*1024) {
-    lmemKB = 48;
-  } else if (lmemSize >= 24*1024) {
-    lmemKB = 24;
-  } else {
-    lmemKB = 8;
-  }
-  
-  bins_per_block = lmemKB * 1024;
-  
-  switch (lmemKB) {
-    case 48: nThreads = 1024; break;
-    case 24: nThreads = 768; break;
-    default: nThreads = 512; break;
-  }
-  
-  if ((workItemSizes[0] < 512) && (workItemSizes[0] >= 256)) {
-    prescanThreads = DEFAULT_PRESCAN_THREADS/2;
-    prescanBlockX = DEFAULT_PRESCAN_BLOCKS_X * 2;
-    blockX = DEFAULT_BLOCK_X * 2;
-    nThreads = 256;
-    finalThreads = DEFAULT_FINAL_THREADS /2;
-  }
-  
-  size_t program_length[4];
-  const char *source_path[4] = { "src/opencl_base/histo_prescan.cl",
-    "src/opencl_base/histo_intermediates.cl", "src/opencl_base/histo_main.cl","src/opencl_base/histo_final.cl"};
-  char *source[4];
-
-  for (int i = 0; i < 4; ++i) {
-    // Dynamically allocate buffer for source
-    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
-    if(!source[i]) {
-      fprintf(stderr, "Could not load program source\n"); exit(1);
-    }
-  	
-  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
-  	OCL_ERRCK_VAR(ciErrNum);
-  	  	
-  	free(source[i]);
-  }
-  	
-  	  	  	  	  	  	  	
-  char compileOptions[1024];
-  //                -cl-nv-verbose // Provides register info for NVIDIA devices
-  // Set all Macros referenced by kernels
-  sprintf(compileOptions, "\
-                -D PRESCAN_THREADS=%u\
-                -D KB=%u -D UNROLL=%u\
-                -D BINS_PER_BLOCK=%u -D BLOCK_X=%u",
-
-                prescanThreads,
-                lmemKB, UNROLL,
-                bins_per_block, blockX
-            ); 
-  
-  for (int i = 0; i < 4; ++i) {
-    //fprintf(stderr, "Building Program #%d...\n", i);
-    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, compileOptions, NULL, NULL) );
-       
-          /*
-       char *build_log;
-       size_t ret_val_size;
-       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-       build_log = (char *)malloc(ret_val_size+1);
-       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-       	OCL_ERRCK_VAR(ciErrNum);
-       	
-
-       // to be carefully, terminate with \0
-       // there's no information in the reference whether the string is 0 terminated or not
-       build_log[ret_val_size] = '\0';
-
-       fprintf(stderr, "%s\n", build_log );
-       */
-  }
-  	
-  histo_prescan_kernel = clCreateKernel(clProgram[0], "histo_prescan_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_intermediates_kernel = clCreateKernel(clProgram[1], "histo_intermediates_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_intermediates_kernel_compat = clCreateKernel(clProgram[1], "histo_intermediates_kernel_compat", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_main_kernel = clCreateKernel(clProgram[2], "histo_main_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_final_kernel = clCreateKernel(clProgram[3], "histo_final_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);  
-
-  input =           clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
-      even_width*(((img_height+UNROLL-1)/UNROLL)*UNROLL)*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  ranges =          clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);  
-  sm_mappings =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  global_histo =    clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*histo_height*sizeof(unsigned short), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  final_histo =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-
-  // Must dynamically allocate. Too large for stack
-  unsigned int *zeroData;
-  zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int));
-  if (zeroData == NULL) {
-    fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height);
-    exit(1);
-  }
-   
-  for (int y=0; y < img_height; y++){
-    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, 
-                          y*even_width*sizeof(unsigned int), // Offset in bytes
-                          img_width*sizeof(unsigned int), // Size of data to write
-                          &img[y*img_width], // Host Source
-                          0, NULL, NULL) );
-  }
-  
- 
-  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-
-  unsigned int img_dim = img_height*img_width;
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 0, sizeof(cl_mem), (void *)&input) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 1, sizeof(unsigned int), &img_dim) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 2, sizeof(cl_mem), (void *)&ranges) );
-
-  unsigned int half_width = (img_width+1)/2;
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(unsigned int), &img_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 3, sizeof(unsigned int), &half_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 4, sizeof(cl_mem), (void *)&sm_mappings) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel_compat, 0, sizeof(cl_mem), (void *)&input) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel_compat, 1, sizeof(unsigned int), &img_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel_compat, 2, sizeof(unsigned int), &img_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel_compat, 3, sizeof(unsigned int), &half_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel_compat, 4, sizeof(cl_mem), (void *)&sm_mappings) );
-  
-  
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 0, sizeof(cl_mem), (void *)&sm_mappings) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 1, sizeof(unsigned int), &img_dim) );
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 4, sizeof(unsigned int), &histo_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 5, sizeof(unsigned int), &histo_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 6, sizeof(cl_mem), (void *)&global_subhisto) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 7, sizeof(cl_mem), (void *)&global_histo) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 8, sizeof(cl_mem), (void *)&global_overflow) );
-  
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(unsigned int), &histo_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(unsigned int), &histo_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 4, sizeof(cl_mem), (void *)&global_subhisto) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 5, sizeof(cl_mem), (void *)&global_histo) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 6, sizeof(cl_mem), (void *)&global_overflow) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 7, sizeof(cl_mem), (void *)&final_histo) );
-
-  size_t prescan_localWS[1] = {prescanThreads};
-  size_t prescan_globalWS[1] = {prescanBlockX*prescan_localWS[0]};
-
-  size_t inter_localWS[1] = { half_width };
-  size_t inter_globalWS[1] = { ((img_height + UNROLL-1)/UNROLL) * inter_localWS[0] };
-  
-  size_t main_localWS[2] = {nThreads, 1};
-  size_t main_globalWS[2] = { blockX * main_localWS[0], 0 };
-  
-  size_t final_localWS[1] = {finalThreads};
-  size_t final_globalWS[1] = {blockX*3 * final_localWS[0]};
-  
-  
-  if (half_width > workItemSizes[0]) {
-    chosenInterKernel = histo_intermediates_kernel_compat;
-    inter_localWS[0] = workItemSizes[0];
-    inter_globalWS[0] = ((img_height + UNROLL-1)/UNROLL) * inter_localWS[0];
-  } else {
-    chosenInterKernel = histo_intermediates_kernel;
-  }
-  
-  
-  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-  for (int iter = 0; iter < numIterations; iter++) {
-    unsigned int ranges_h[2] = {UINT32_MAX, 0};
-    
-    // how about something like
-    // __global__ unsigned int ranges[2];
-    // ...kernel
-    // __shared__ unsigned int s_ranges[2];
-    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
-    // __syncthreads();
-    
-    // Although then removing the blocking cudaMemcpy's might cause something about
-    // concurrent kernel execution.
-    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?
-
-
-  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 
-                          0, // Offset in bytes
-                          2*sizeof(unsigned int), // Size of data to write
-                          ranges_h, // Host Source
-                          0, NULL, NULL) );
-                          
-
-  pb_SwitchToSubTimer(&timers, prescans , pb_TimerID_KERNEL);                          
-                         
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_prescan_kernel, 1, 0,
-                            prescan_globalWS, prescan_localWS, 0, 0, 0) );
-                            
-  pb_SwitchToSubTimer(&timers, postpremems , pb_TimerID_KERNEL);                            
-    
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, ranges, CL_FALSE, 
-                          0, // Offset in bytes
-                          2*sizeof(unsigned int), // Size of data to read
-                          ranges_h, // Host Source
-                          0, NULL, NULL) );
-
-  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 
-                          0, // Offset in bytes
-                          img_width*histo_height*sizeof(unsigned int), // Size of data to write
-                          zeroData, // Host Source
-                          0, NULL, NULL) );
-                          
-  pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, chosenInterKernel /*histo_intermediates_kernel*/, 1, 0,
-                            inter_globalWS, inter_localWS, 0, 0, 0) );              
-
-  main_globalWS[1] = ranges_h[1]-ranges_h[0]+1;
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 2, sizeof(unsigned int), &ranges_h[0]) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 3, sizeof(unsigned int), &ranges_h[1]) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &ranges_h[0]) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &ranges_h[1]) );
-
-  pb_SwitchToSubTimer(&timers, mains, pb_TimerID_KERNEL);
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_main_kernel, 2, 0,
-                            main_globalWS, main_localWS, 0, 0, 0) );
-                            
-  pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);                            
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
-                            final_globalWS, final_localWS, 0, 0, 0) );                           
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 
-                          0, // Offset in bytes
-                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
-                          histo, // Host Source
-                          0, NULL, NULL) );                         
-
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_prescan_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel_compat) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_main_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[2]) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[3]) );
-  
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_histo) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );
-
-  if (parameters->outFile) {
-    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  free(zeroData);
-  free(img);
-  free(histo);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(parameters);
-  
-  pb_DestroyTimerSet(&timers);
-
-  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
-  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/util.cpp b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/util.cpp
deleted file mode 100644
index 266462c936..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/util.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#include "util.h"
-#include "bmp.h"
-
-// This function takes an HSV value and converts it to BMP.
-// We use this function to generate colored images with
-// Smooth spectrum traversal for the input and output images.
-RGB HSVtoRGB( float h, float s, float v )
-{
-    int i;
-    float f, p, q, t;
-    float r, g, b;
-    RGB value={0,0,0};
-
-    if( s == 0 ) {
-        r = g = b = v;
-        return value;
-    }
-    h /= 60;
-    i = floor( h );
-    f = h - i;
-    p = v * ( 1 - s );
-    q = v * ( 1 - s * f );
-    t = v * ( 1 - s * ( 1 - f ) );
-    switch( i ) {
-        case 0:
-            r = v; g = t; b = p;
-            break;
-        case 1:
-            r = q; g = v; b = p;
-            break;
-        case 2:
-            r = p; g = v; b = t;
-            break;
-        case 3:
-            r = p; g = q; b = v;
-            break;
-        case 4:
-            r = t; g = p; b = v;
-            break;
-        default:
-            r = v; g = p; b = q;
-            break;
-    }
-
-    unsigned int temp = r*255;
-    value.R = temp;
-    temp = g*255;
-    value.G = temp;
-    temp = b*255;
-    value.B = temp;
-
-    return value;
-}
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename)
-{
-    RGB* pixel_map = (RGB*) malloc (height*width*sizeof(RGB));
-
-    for (size_t y = 0; y < height; ++y)
-    {
-        for (size_t x = 0; x < width; ++x)
-        {
-            unsigned char value = histo[y * width + x];
-
-            if (value == 0){
-                pixel_map[y*width+x].R = 0;
-                pixel_map[y*width+x].G = 0;
-                pixel_map[y*width+x].B = 0;
-            } else {
-                pixel_map[y*width+x] = HSVtoRGB(0.0,1.0,cbrt(1+ 63.0*((float)value)/((float)UINT8_MAX))/4);
-            }
-        }
-    }
-    create_bmp(pixel_map, height, width, filename);
-    free(pixel_map);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/util.h b/hpvm/test/parboil/benchmarks/histo/src/opencl_base/util.h
deleted file mode 100644
index 8db501970c..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_base/util.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef __HISTO_UTIL_H_
-#define __HISTO_UTIL_H_
-
-#define UINT8_MAX 255
-#define UINT32_MAX 4294967295
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/Makefile b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/Makefile
deleted file mode 100644
index 4295907c0d..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=util.o main.o OpenCL_common.o
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/OpenCL_common.cpp
deleted file mode 100644
index 9bb2c1b5b2..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/OpenCL_common.cpp
+++ /dev/null
@@ -1,299 +0,0 @@
-
-
-#include "OpenCL_common.h"
-#include <string.h>
-
-// -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
-  cl_uint numEntries = 16;
-  cl_platform_id clPlatforms[numEntries];
-  cl_uint numPlatforms;
-  
-  cl_device_id clDevices[numEntries];
-  cl_uint numDevices;
-
-  OCL_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
-  //fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
-  bool needDevice = true;
-  
-  for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
-
-    cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    //fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
-      cl_device_id clDevice = clDevices[id];
-      cl_device_type clDeviceType;
-
-      bool canSatisfy = true;
-      
-      //char devName[128];
-      //OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_NAME, sizeof(char)*128, devName, NULL));
-      //fprintf(stderr, "Device #%d Name: %s\n", id, devName);
-      
-      if (reqDeviceType != NULL) {
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
-        if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
-          if (*reqDeviceType != clDeviceType) {
-            canSatisfy = false;
-          }
-        }
-      }
-
-      va_list paramList;
-      va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
-        cl_bool clInfoBool;
-        size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
-        if (clInfoBool != true) {
-          canSatisfy = false;
-        }
-      }
-      
-      va_end(paramList);
-      if (canSatisfy) {
-        *device = clDevice;
-        *platform = clPlatform;
-        needDevice = false;
-        if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
-          *reqDeviceType = clDeviceType;
-        }
-      }
-    } // End checking all devices for a platform
-  } // End checking all platforms
-
-  int retVal = -1;
-  if (needDevice) {
-    retVal = -1;
-  } else {
-    retVal = 0;
-  }
-  
-  return retVal;
-
-}
-
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	cl_uint maxMemAlloc = 0;
-	
-	OCL_ERRCK_RETVAL ( clGetDeviceInfo(	device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-
-	
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/OpenCL_common.h b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/OpenCL_common.h
deleted file mode 100644
index c51800532d..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/OpenCL_common.h
+++ /dev/null
@@ -1,22 +0,0 @@
-
-#ifndef __OPENCL_COMMON_H_
-#define __OPENCL_COMMON_H_
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <CL/cl.h>
-
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/bmp.h b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/bmp.h
deleted file mode 100644
index d1b7c1b562..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/bmp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "stdlib.h"
-
-typedef struct{
-  unsigned char B;
-  unsigned char G;
-  unsigned char R;
-} RGB;
-
-typedef struct {
-  unsigned int filesz;
-  unsigned short creator1;
-  unsigned short creator2;
-  unsigned int bmp_offset;
-} bmpfile_header_t;
-
-typedef struct {
-  unsigned int header_sz;
-  unsigned int width;
-  unsigned int height;
-  unsigned short nplanes;
-  unsigned short bitspp;
-  unsigned int compress_type;
-  unsigned int bmp_bytesz;
-  unsigned int hres;
-  unsigned int vres;
-  unsigned int ncolors;
-  unsigned int nimpcolors;
-} bmp_dib_header_t;
-
-typedef enum {
-  BI_RGB = 0,
-  BI_RLE8,
-  BI_RLE4,
-  BI_BITFIELDS,
-  BI_JPEG,
-  BI_PNG,
-} bmp_compression_method_t;
-
-typedef struct{
-  unsigned char magic[2];
-  bmpfile_header_t file_header;
-  bmp_dib_header_t dib_header;
-  unsigned int* palette;
-  void* pixel_map;
-} bmp_image;
-
-void create_bmp(RGB* bitmap, int height, int width, const char* filename){
-    bmp_image image;
-
-    int padded_width = 4*(((width*24)+31)/32);
-    padded_width -= width*sizeof(RGB);
-
-    char* pad = (char*) calloc (padded_width, sizeof(char));
-
-    image.magic[0]='B';
-    image.magic[1]='M';
-
-    image.file_header.filesz = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t) + height*width*sizeof(RGB);
-    image.file_header.creator1 = image.file_header.creator2 = 0;
-    image.file_header.bmp_offset = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t);
-
-    image.dib_header.header_sz = 40;//sizeof(bmp_dib_header_t);
-    image.dib_header.width = width;
-    image.dib_header.height = height;
-    image.dib_header.nplanes = 1;
-    image.dib_header.bitspp = 24;
-    image.dib_header.compress_type = 0;
-    image.dib_header.bmp_bytesz = width*height*sizeof(RGB);
-    image.dib_header.hres = 0;
-    image.dib_header.vres = 0;
-    image.dib_header.ncolors = 0;
-    image.dib_header.nimpcolors = 0;
-
-    FILE* out_file = fopen(filename,"wb");
-
-    fwrite(image.magic,sizeof(char),2,out_file);
-    fwrite(&(image.file_header),sizeof(char),sizeof(bmpfile_header_t),out_file);
-    fwrite(&(image.dib_header),sizeof(char),sizeof(bmp_dib_header_t),out_file);
-
-    int h;
-    for (h = height-1; h >= 0; h--){
-      fwrite(&bitmap[h*width],sizeof(RGB),width,out_file);
-      fwrite(pad,sizeof(char),padded_width,out_file);
-    }
-
-    fclose(out_file);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_final.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_final.cl
deleted file mode 100644
index f2e582deda..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_final.cl
+++ /dev/null
@@ -1,108 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-/* Combine all the sub-histogram results into one final histogram */
-__kernel void histo_final_kernel (
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    __global unsigned int *global_subhisto,
-    __global unsigned int *global_histo,
-    __global unsigned int *global_overflow,
-    __global unsigned int *final_histo) //final output
-{
-    unsigned int blockDimx = get_local_size(0);
-    unsigned int gridDimx = get_num_groups(0);
-    unsigned int start_offset = get_local_id(0) + get_group_id(0) * blockDimx;
-    const ushort4 zero_short  = {0, 0, 0, 0};
-    const uint4 zero_int      = {0, 0, 0, 0};
-
-    unsigned int size_low_histo = sm_range_min * BINS_PER_BLOCK;
-    unsigned int size_mid_histo = (sm_range_max - sm_range_min +1) * BINS_PER_BLOCK;
-
-    /* Clear lower region of global histogram */
-    for (unsigned int i = start_offset; i < size_low_histo/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            (unsigned char) global_histo_data.x,
-            (unsigned char) global_histo_data.y,
-            (unsigned char) global_histo_data.z,
-            (unsigned char) global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the middle region of the overflow buffer */
-    for (unsigned int i = (size_low_histo/4) + start_offset; i < (size_low_histo+size_mid_histo)/4; i += gridDimx * blockDimx)
-    {
-        uint4 global_histo_data = ((__global uint4*)global_overflow)[i];
-        ((__global uint4*)global_overflow)[i] = zero_int;
-
-        uint4 internal_histo_data = (uint4)(
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        #pragma unroll
-        for (int j = 0; j < BLOCK_X; j++)
-        {
-            unsigned int bin4in = ((__global unsigned int*)global_subhisto)[i + j * histo_height * histo_width / 4];
-            internal_histo_data.x += (bin4in >>  0) & 0xFF;
-            internal_histo_data.y += (bin4in >>  8) & 0xFF;
-            internal_histo_data.z += (bin4in >> 16) & 0xFF;
-            internal_histo_data.w += (bin4in >> 24) & 0xFF;
-        }
-
-        internal_histo_data.x = min (internal_histo_data.x, (uint) 255);
-        internal_histo_data.y = min (internal_histo_data.y, (uint) 255);
-        internal_histo_data.z = min (internal_histo_data.z, (uint) 255);
-        internal_histo_data.w = min (internal_histo_data.w, (uint) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            internal_histo_data.x,
-            internal_histo_data.y,
-            internal_histo_data.z,
-            internal_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the upper region of global histogram */
-    for (unsigned int i = ((size_low_histo+size_mid_histo)/4) + start_offset; i < (histo_height*histo_width)/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_intermediates.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_intermediates.cl
deleted file mode 100644
index 509f8dfc05..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_intermediates.cl
+++ /dev/null
@@ -1,63 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-__kernel void calculateBin (
-        __const unsigned int bin,
-        __global uchar4 *sm_mapping)
-{
-        unsigned char offset  =  bin        %   4;
-        unsigned char indexlo = (bin >>  2) % 256;
-        unsigned char indexhi = (bin >> 10) %  KB;
-        unsigned char block   =  bin / BINS_PER_BLOCK;
-
-        offset *= 8;
-
-        uchar4 sm;
-        sm.x = block;
-        sm.y = indexhi;
-        sm.z = indexlo;
-        sm.w = offset;
-
-        *sm_mapping = sm;
-}
-
-__kernel void histo_intermediates_kernel (
-        __global uint2 *input,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        __global uchar4 *sm_mappings)
-{
-        int threadIdxx = get_local_id(0);
-        int blockDimx = get_local_size(0);
-        unsigned int line = UNROLL * (get_group_id(0));// 16 is the unroll factor;
-
-        __global uint2 *load_bin = input + line * input_pitch + threadIdxx;
-
-        unsigned int store = line * width + threadIdxx;
-        bool skip = (width % 2) && (threadIdxx == (blockDimx - 1));
-
-        #pragma unroll
-        for (int i = 0; i < UNROLL; i++)
-        {
-                uint2 bin_value = *load_bin;
-
-                calculateBin (
-                        bin_value.x,
-                        &sm_mappings[store]
-                );
-
-                if (!skip) calculateBin (
-                        bin_value.y,
-                        &sm_mappings[store + blockDimx]
-                );
-
-                load_bin += input_pitch;
-                store += width;
-        }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_main.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_main.cl
deleted file mode 100644
index 808daf56e6..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_main.cl
+++ /dev/null
@@ -1,189 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-
-void testIncrementGlobal (
-        __global unsigned int *global_histo,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are outside the central region of histogram */
-        if (range < sm_range_min || range > sm_range_max)
-        {
-                const unsigned int bin = range * BINS_PER_BLOCK + offset / 8 + (indexlo << 2) + (indexhi << 10);
-                const unsigned int bin_div2 = bin / 2;
-                const unsigned int bin_offset = (bin % 2 == 1) ? 16 : 0;
-
-                unsigned int old_val = global_histo[bin_div2];
-                unsigned short old_bin = (old_val >> bin_offset) & 0xFFFF;
-
-                if (old_bin < 255)
-                {
-                        atom_add (&global_histo[bin_div2], 1 << bin_offset);
-                }
-        }
-}
-
-void testIncrementLocal (
-        __global unsigned int *global_overflow,
-        __local unsigned int smem[KB][256],
-        const unsigned int myRange,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are inside the central region of histogram */
-        if (range == myRange)
-        {
-                /* Atomically increment shared memory */
-                unsigned int add = (unsigned int)(1 << offset);
-                unsigned int prev = atom_add (&smem[indexhi][indexlo], add);
-
-                /* Check if current bin overflowed */
-                unsigned int prev_bin_val = (prev >> offset) & 0x000000FF;
-
-                /* If there was an overflow, record it and record if it cascaded into other bins */
-                if (prev_bin_val == 0x000000FF)
-                {
-                        const unsigned int bin =
-                                range * BINS_PER_BLOCK +
-                                offset / 8 + (indexlo << 2) + (indexhi << 10);
-
-                        bool can_overflow_to_bin_plus_1 = (offset < 24) ? true : false;
-                        bool can_overflow_to_bin_plus_2 = (offset < 16) ? true : false;
-                        bool can_overflow_to_bin_plus_3 = (offset <  8) ? true : false;
-
-                        bool overflow_into_bin_plus_1 = false;
-                        bool overflow_into_bin_plus_2 = false;
-                        bool overflow_into_bin_plus_3 = false;
-
-                        unsigned int prev_bin_plus_1_val = (prev >> (offset +  8)) & 0x000000FF;
-                        unsigned int prev_bin_plus_2_val = (prev >> (offset + 16)) & 0x000000FF;
-                        unsigned int prev_bin_plus_3_val = (prev >> (offset + 24)) & 0x000000FF;
-
-                        if (can_overflow_to_bin_plus_1 &&        prev_bin_val == 0x000000FF) overflow_into_bin_plus_1 = true;
-                        if (can_overflow_to_bin_plus_2 && prev_bin_plus_1_val == 0x000000FF) overflow_into_bin_plus_2 = true;
-                        if (can_overflow_to_bin_plus_3 && prev_bin_plus_2_val == 0x000000FF) overflow_into_bin_plus_3 = true;
-
-                        unsigned int bin_plus_1_add;
-                        unsigned int bin_plus_2_add;
-                        unsigned int bin_plus_3_add;
-
-                        if (overflow_into_bin_plus_1) bin_plus_1_add = (prev_bin_plus_1_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_2) bin_plus_2_add = (prev_bin_plus_2_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_3) bin_plus_3_add = (prev_bin_plus_3_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-
-                                                      atom_add (&global_overflow[bin],   256);
-                        if (overflow_into_bin_plus_1) atom_add (&global_overflow[bin+1], bin_plus_1_add);
-                        if (overflow_into_bin_plus_2) atom_add (&global_overflow[bin+2], bin_plus_2_add);
-                        if (overflow_into_bin_plus_3) atom_add (&global_overflow[bin+3], bin_plus_3_add);
-                }
-        }
-}
-
-void clearMemory (__local unsigned int smem[KB][256])
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK / 4; i += blockDimx)
-        {
-                ((__local unsigned int*)smem)[i] = 0;
-        }
-}
-
-void copyMemory (__global unsigned int *dst, __local unsigned int src[KB][256])
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK/4; i += blockDimx)
-        {
-                dst[i] = ((__local unsigned int*)src)[i];
-        }
-}
-
-__kernel void histo_main_kernel (
-        __global uchar4 *sm_mappings,
-        unsigned int num_elements,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        __global unsigned int *global_subhisto,
-        __global unsigned int *global_histo,
-        __global unsigned int *global_overflow)
-{
-        /* Most optimal solution uses 24 * 1024 bins per threadblock */
-        __local unsigned int sub_histo[KB][256];
-
-        /* Each threadblock contributes to a specific 24KB range of histogram,
-         * and also scans every N-th line for interesting data.  N = gridDim.x
-         */
-        unsigned int blockDimx = get_local_size(0);
-        unsigned int gridDimx = get_num_groups(0);
-        unsigned int local_scan_range = sm_range_min + get_group_id(1);
-        unsigned int local_scan_load = get_group_id(0) * blockDimx + get_local_id(0);
-
-        clearMemory (sub_histo);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (get_group_id(1) == 0)
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                        testIncrementGlobal (
-                                global_histo,
-                                sm_range_min,
-                                sm_range_max,
-                                sm
-                        );
-                }
-        }
-        else
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                }
-        }
-
-        /* Store sub histogram to global memory */
-        unsigned int store_index = get_group_id(0) * (histo_height * histo_width / 4) + (local_scan_range * BINS_PER_BLOCK / 4);//(local_scan_range * BINS_PER_BLOCK);
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-        copyMemory (&(global_subhisto[store_index]), sub_histo);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_prescan.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_prescan.cl
deleted file mode 100644
index c1f85a5eec..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/histo_prescan.cl
+++ /dev/null
@@ -1,85 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
-
-__kernel void histo_prescan_kernel (__global unsigned int* input, int size, __global unsigned int* minmax)
-{
-
-    __local float Avg[PRESCAN_THREADS];
-    __local float StdDev[PRESCAN_THREADS];
-
-    int threadIdxx = get_local_id(0);
-    int blockDimx = get_local_size(0);
-    int blockIdxx = get_group_id(0);
-    int stride = size/(get_num_groups(0));
-    int addr = blockIdxx*stride+threadIdxx;
-    int end = blockIdxx*stride + stride/8; // Only sample 1/8th of the input data
-
-    // Compute the average per thread
-    float avg = 0.0;
-    unsigned int count = 0;
-    while (addr < end){
-        avg += input[addr];
-        count++;
-	addr += blockDimx;
-    }
-    avg /= count;
-    Avg[threadIdxx] = avg;
-
-    // Compute the standard deviation per thread
-    int addr2 = blockIdxx*stride+threadIdxx;
-    float stddev = 0;
-    while (addr2 < end){
-        stddev += (input[addr2]-avg)*(input[addr2]-avg);
-        addr2 += blockDimx;
-    }
-    stddev /= count;
-    StdDev[threadIdxx] = sqrt(stddev);
-
-#define SUM(stride__)\
-if(threadIdxx < stride__){\
-    Avg[threadIdxx] += Avg[threadIdxx+stride__];\
-    StdDev[threadIdxx] += StdDev[threadIdxx+stride__];\
-}
-
-    // Add all the averages and standard deviations from all the threads
-    // and take their arithmetic average (as a simplified approximation of the
-    // real average and standard deviation.
-#if (PRESCAN_THREADS >= 32)    
-    for (int stride = PRESCAN_THREADS/2; stride >= 32; stride = stride >> 1){
-	barrier(CLK_LOCAL_MEM_FENCE);
-	SUM(stride);
-    }
-#endif
-#if (PRESCAN_THREADS >= 16)
-    SUM(16);
-#endif
-#if (PRESCAN_THREADS >= 8)
-    SUM(8);
-#endif
-#if (PRESCAN_THREADS >= 4)
-    SUM(4);
-#endif
-#if (PRESCAN_THREADS >= 2)
-    SUM(2);
-#endif
-
-    if (threadIdxx == 0){
-        float avg = Avg[0]+Avg[1];
-	avg /= PRESCAN_THREADS;
-	float stddev = StdDev[0]+StdDev[1];
-	stddev /= PRESCAN_THREADS;
-
-        // Take the maximum and minimum range from all the blocks. This will
-        // be the final answer. The standard deviation is taken out to 10 sigma
-        // away from the average. The value 10 was obtained empirically.
-	    atom_min(minmax,((unsigned int)(avg-10*stddev))/(KB*1024));
-        atom_max(minmax+1,((unsigned int)(avg+10*stddev))/(KB*1024));
-    }  
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel.cl
deleted file mode 100644
index 19504e6037..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel.cl
+++ /dev/null
@@ -1,468 +0,0 @@
-
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#define BLOCK_X         14
-
-#define PRESCAN_THREADS     512
-#define PRESCAN_BLOCKS_X    64
-
-#define UNROLL 16
-#define KB                  24
-#define BINS_PER_BLOCK      ((KB)*1024)
-
-
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-
-__kernel void histo_prescan_kernel (__global unsigned int* input, long  b1,
-    int size,
-    __global unsigned int* minmax, long b2,
-    __local float* Avg, long b3,
-    __local float* StdDev, long b4)
-{
-
-    /*__local float Avg[PRESCAN_THREADS];*/
-    /*__local float StdDev[PRESCAN_THREADS];*/
-
-    int threadIdxx = get_local_id(0);
-    int blockDimx = get_local_size(0);
-    int blockIdxx = get_group_id(0);
-    int stride = size/(get_num_groups(0));
-    int addr = blockIdxx*stride+threadIdxx;
-    int end = blockIdxx*stride + stride/8; // Only sample 1/8th of the input data
-
-    // Compute the average per thread
-    float avg = 0.0;
-    unsigned int count = 0;
-    while (addr < end){
-        avg += input[addr];
-        count++;
-	addr += blockDimx;
-    }
-    avg /= count;
-    Avg[threadIdxx] = avg;
-
-    // Compute the standard deviation per thread
-    int addr2 = blockIdxx*stride+threadIdxx;
-    float stddev = 0;
-    while (addr2 < end){
-        stddev += (input[addr2]-avg)*(input[addr2]-avg);
-        addr2 += blockDimx;
-    }
-    stddev /= count;
-    StdDev[threadIdxx] = sqrt(stddev);
-
-#define SUM(stride__)\
-if(threadIdxx < stride__){\
-    Avg[threadIdxx] += Avg[threadIdxx+stride__];\
-    StdDev[threadIdxx] += StdDev[threadIdxx+stride__];\
-}
-
-    // Add all the averages and standard deviations from all the threads
-    // and take their arithmetic average (as a simplified approximation of the
-    // real average and standard deviation.
-#if (PRESCAN_THREADS >= 32)    
-    for (int stride = PRESCAN_THREADS/2; stride >= 32; stride = stride >> 1){
-	barrier(CLK_LOCAL_MEM_FENCE);
-	SUM(stride);
-    }
-#endif
-#if (PRESCAN_THREADS >= 16)
-    SUM(16);
-#endif
-#if (PRESCAN_THREADS >= 8)
-    SUM(8);
-#endif
-#if (PRESCAN_THREADS >= 4)
-    SUM(4);
-#endif
-#if (PRESCAN_THREADS >= 2)
-    SUM(2);
-#endif
-
-    if (threadIdxx == 0){
-        float avg = Avg[0]+Avg[1];
-	avg /= PRESCAN_THREADS;
-	float stddev = StdDev[0]+StdDev[1];
-	stddev /= PRESCAN_THREADS;
-
-        // Take the maximum and minimum range from all the blocks. This will
-        // be the final answer. The standard deviation is taken out to 10 sigma
-        // away from the average. The value 10 was obtained empirically.
-	    atom_min(minmax,((unsigned int)(avg-10*stddev))/(KB*1024));
-        atom_max(minmax+1,((unsigned int)(avg+10*stddev))/(KB*1024));
-    }  
-}
-
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-__kernel void calculateBin (
-        __const unsigned int bin,
-        __global uchar4 *sm_mapping)
-{
-        unsigned char offset  =  bin        %   4;
-        unsigned char indexlo = (bin >>  2) % 256;
-        unsigned char indexhi = (bin >> 10) %  KB;
-        unsigned char block   =  bin / BINS_PER_BLOCK;
-
-        offset *= 8;
-
-        uchar4 sm;
-        sm.x = block;
-        sm.y = indexhi;
-        sm.z = indexlo;
-        sm.w = offset;
-
-        *sm_mapping = sm;
-}
-
-__kernel void histo_intermediates_kernel (
-        __global uint2 *input, long b1,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        __global uchar4 *sm_mappings, long b2)
-{
-        int threadIdxx = get_local_id(0);
-        int blockDimx = get_local_size(0);
-        unsigned int line = UNROLL * (get_group_id(0));// 16 is the unroll factor;
-
-        __global uint2 *load_bin = input + line * input_pitch + threadIdxx;
-
-        unsigned int store = line * width + threadIdxx;
-        bool skip = (width % 2) && (threadIdxx == (blockDimx - 1));
-
-        #pragma unroll
-        for (int i = 0; i < UNROLL; i++)
-        {
-                uint2 bin_value = *load_bin;
-
-                calculateBin (
-                        bin_value.x,
-                        &sm_mappings[store]
-                );
-
-                if (!skip) calculateBin (
-                        bin_value.y,
-                        &sm_mappings[store + blockDimx]
-                );
-
-                load_bin += input_pitch;
-                store += width;
-        }
-}
-
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#define smem(x,y)   smem[(x)*256 + (y)]
-
-void testIncrementGlobal (
-        __global unsigned int *global_histo,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are outside the central region of histogram */
-        if (range < sm_range_min || range > sm_range_max)
-        {
-                const unsigned int bin = range * BINS_PER_BLOCK + offset / 8 + (indexlo << 2) + (indexhi << 10);
-                const unsigned int bin_div2 = bin / 2;
-                const unsigned int bin_offset = (bin % 2 == 1) ? 16 : 0;
-
-                unsigned int old_val = global_histo[bin_div2];
-                unsigned short old_bin = (old_val >> bin_offset) & 0xFFFF;
-
-                if (old_bin < 255)
-                {
-                        atom_add (&global_histo[bin_div2], 1 << bin_offset);
-                }
-        }
-}
-
-void testIncrementLocal (
-        __global unsigned int *global_overflow,
-        __local unsigned int* smem,
-        const unsigned int myRange,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are inside the central region of histogram */
-        if (range == myRange)
-        {
-                /* Atomically increment shared memory */
-                unsigned int add = (unsigned int)(1 << offset);
-                unsigned int prev = atom_add (&smem(indexhi, indexlo), add);
-
-                /* Check if current bin overflowed */
-                unsigned int prev_bin_val = (prev >> offset) & 0x000000FF;
-
-                /* If there was an overflow, record it and record if it cascaded into other bins */
-                if (prev_bin_val == 0x000000FF)
-                {
-                        const unsigned int bin =
-                                range * BINS_PER_BLOCK +
-                                offset / 8 + (indexlo << 2) + (indexhi << 10);
-
-                        bool can_overflow_to_bin_plus_1 = (offset < 24) ? true : false;
-                        bool can_overflow_to_bin_plus_2 = (offset < 16) ? true : false;
-                        bool can_overflow_to_bin_plus_3 = (offset <  8) ? true : false;
-
-                        bool overflow_into_bin_plus_1 = false;
-                        bool overflow_into_bin_plus_2 = false;
-                        bool overflow_into_bin_plus_3 = false;
-
-                        unsigned int prev_bin_plus_1_val = (prev >> (offset +  8)) & 0x000000FF;
-                        unsigned int prev_bin_plus_2_val = (prev >> (offset + 16)) & 0x000000FF;
-                        unsigned int prev_bin_plus_3_val = (prev >> (offset + 24)) & 0x000000FF;
-
-                        if (can_overflow_to_bin_plus_1 &&        prev_bin_val == 0x000000FF) overflow_into_bin_plus_1 = true;
-                        if (can_overflow_to_bin_plus_2 && prev_bin_plus_1_val == 0x000000FF) overflow_into_bin_plus_2 = true;
-                        if (can_overflow_to_bin_plus_3 && prev_bin_plus_2_val == 0x000000FF) overflow_into_bin_plus_3 = true;
-
-                        unsigned int bin_plus_1_add;
-                        unsigned int bin_plus_2_add;
-                        unsigned int bin_plus_3_add;
-
-                        if (overflow_into_bin_plus_1) bin_plus_1_add = (prev_bin_plus_1_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_2) bin_plus_2_add = (prev_bin_plus_2_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_3) bin_plus_3_add = (prev_bin_plus_3_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-
-                                                      atom_add (&global_overflow[bin],   256);
-                        if (overflow_into_bin_plus_1) atom_add (&global_overflow[bin+1], bin_plus_1_add);
-                        if (overflow_into_bin_plus_2) atom_add (&global_overflow[bin+2], bin_plus_2_add);
-                        if (overflow_into_bin_plus_3) atom_add (&global_overflow[bin+3], bin_plus_3_add);
-                }
-        }
-}
-
-void clearMemory (__local unsigned int* smem)
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK / 4; i += blockDimx)
-        {
-                ((__local unsigned int*)smem)[i] = 0;
-        }
-}
-
-void copyMemory (__global unsigned int *dst, __local unsigned int* src)
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK/4; i += blockDimx)
-        {
-                dst[i] = ((__local unsigned int*)src)[i];
-        }
-}
-
-#define sub_histo(x,y) sub_histo[(x)*256+(y)]
-__kernel void histo_main_kernel (
-        __global uchar4 *sm_mappings, long b1,
-        unsigned int num_elements,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        __global unsigned int *global_subhisto, long b3,
-        __global unsigned int *global_histo, long b4,
-        __global unsigned int *global_overflow, long b5,
-        __local unsigned int* sub_histo, long b6)
-{
-        /* Most optimal solution uses 24 * 1024 bins per threadblock */
-        /*__local unsigned int sub_histo[KB][256];*/
-
-        /* Each threadblock contributes to a specific 24KB range of histogram,
-         * and also scans every N-th line for interesting data.  N = gridDim.x
-         */
-        unsigned int blockDimx = get_local_size(0);
-        unsigned int gridDimx = get_num_groups(0);
-        unsigned int local_scan_range = sm_range_min + get_group_id(1);
-        unsigned int local_scan_load = get_group_id(0) * blockDimx + get_local_id(0);
-
-        clearMemory (sub_histo);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (get_group_id(1) == 0)
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                        testIncrementGlobal (
-                                global_histo,
-                                sm_range_min,
-                                sm_range_max,
-                                sm
-                        );
-                }
-        }
-        else
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                }
-        }
-
-        /* Store sub histogram to global memory */
-        unsigned int store_index = get_group_id(0) * (histo_height * histo_width / 4) + (local_scan_range * BINS_PER_BLOCK / 4);//(local_scan_range * BINS_PER_BLOCK);
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-        copyMemory (&(global_subhisto[store_index]), sub_histo);
-}
-
-
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-/* Combine all the sub-histogram results into one final histogram */
-__kernel void histo_final_kernel (
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    __global unsigned int *global_subhisto, long b1,
-    __global unsigned int *global_histo, long b2,
-    __global unsigned int *global_overflow, long b3,
-    __global unsigned int *final_histo, long b4) //final output
-{
-    unsigned int blockDimx = get_local_size(0);
-    unsigned int gridDimx = get_num_groups(0);
-    unsigned int start_offset = get_local_id(0) + get_group_id(0) * blockDimx;
-    const ushort4 zero_short  = {0, 0, 0, 0};
-    const uint4 zero_int      = {0, 0, 0, 0};
-
-    unsigned int size_low_histo = sm_range_min * BINS_PER_BLOCK;
-    unsigned int size_mid_histo = (sm_range_max - sm_range_min +1) * BINS_PER_BLOCK;
-
-    /* Clear lower region of global histogram */
-    for (unsigned int i = start_offset; i < size_low_histo/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            (unsigned char) global_histo_data.x,
-            (unsigned char) global_histo_data.y,
-            (unsigned char) global_histo_data.z,
-            (unsigned char) global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the middle region of the overflow buffer */
-    for (unsigned int i = (size_low_histo/4) + start_offset; i < (size_low_histo+size_mid_histo)/4; i += gridDimx * blockDimx)
-    {
-        uint4 global_histo_data = ((__global uint4*)global_overflow)[i];
-        ((__global uint4*)global_overflow)[i] = zero_int;
-
-        uint4 internal_histo_data = (uint4)(
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        #pragma unroll
-        for (int j = 0; j < BLOCK_X; j++)
-        {
-            unsigned int bin4in = ((__global unsigned int*)global_subhisto)[i + j * histo_height * histo_width / 4];
-            internal_histo_data.x += (bin4in >>  0) & 0xFF;
-            internal_histo_data.y += (bin4in >>  8) & 0xFF;
-            internal_histo_data.z += (bin4in >> 16) & 0xFF;
-            internal_histo_data.w += (bin4in >> 24) & 0xFF;
-        }
-
-        internal_histo_data.x = min (internal_histo_data.x, (uint) 255);
-        internal_histo_data.y = min (internal_histo_data.y, (uint) 255);
-        internal_histo_data.z = min (internal_histo_data.z, (uint) 255);
-        internal_histo_data.w = min (internal_histo_data.w, (uint) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            internal_histo_data.x,
-            internal_histo_data.y,
-            internal_histo_data.z,
-            internal_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the upper region of global histogram */
-    for (unsigned int i = ((size_low_histo+size_mid_histo)/4) + start_offset; i < (histo_height*histo_width)/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel_x64.ll b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel_x64.ll
deleted file mode 100644
index 3ef1d49f03..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel_x64.ll
+++ /dev/null
@@ -1,884 +0,0 @@
-; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/histo/src/opencl_nvidia/kernel.cl'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-target triple = "spir64-unknown-unknown"
-
-define cc76 void @histo_prescan_kernel(i32 addrspace(1)* nocapture %input, i64 %b1, i32 %size, i32 addrspace(1)* %minmax, i64 %b2, float addrspace(3)* nocapture %Avg, i64 %b3, float addrspace(3)* nocapture %StdDev, i64 %b4) nounwind {
-  %1 = tail call cc75 i64 @_Z12get_local_idj(i32 0) nounwind readnone
-  %2 = trunc i64 %1 to i32
-  %3 = tail call cc75 i64 @_Z14get_local_sizej(i32 0) nounwind readnone
-  %4 = trunc i64 %3 to i32
-  %5 = tail call cc75 i64 @_Z12get_group_idj(i32 0) nounwind readnone
-  %6 = trunc i64 %5 to i32
-  %7 = sext i32 %size to i64
-  %8 = tail call cc75 i64 @_Z14get_num_groupsj(i32 0) nounwind readnone
-  %9 = udiv i64 %7, %8
-  %10 = trunc i64 %9 to i32
-  %11 = mul nsw i32 %10, %6
-  %12 = add nsw i32 %11, %2
-  %13 = sdiv i32 %10, 8
-  %14 = add nsw i32 %11, %13
-  %15 = icmp slt i32 %12, %14
-  br i1 %15, label %.lr.ph103, label %.thread
-
-.thread:                                          ; preds = %0
-  %16 = sext i32 %2 to i64
-  %17 = getelementptr inbounds float addrspace(3)* %Avg, i64 %16
-  store float 0x7FF8000000000000, float addrspace(3)* %17, align 4, !tbaa !37
-  br label %._crit_edge
-
-.lr.ph103:                                        ; preds = %0, %.lr.ph103
-  %addr.0101 = phi i32 [ %24, %.lr.ph103 ], [ %12, %0 ]
-  %avg.0100 = phi float [ %22, %.lr.ph103 ], [ 0.000000e+00, %0 ]
-  %count.099 = phi i32 [ %23, %.lr.ph103 ], [ 0, %0 ]
-  %18 = sext i32 %addr.0101 to i64
-  %19 = getelementptr inbounds i32 addrspace(1)* %input, i64 %18
-  %20 = load i32 addrspace(1)* %19, align 4, !tbaa !40
-  %21 = uitofp i32 %20 to float
-  %22 = fadd float %avg.0100, %21
-  %23 = add i32 %count.099, 1
-  %24 = add nsw i32 %addr.0101, %4
-  %25 = icmp slt i32 %24, %14
-  br i1 %25, label %.lr.ph103, label %26
-
-; <label>:26                                      ; preds = %.lr.ph103
-  %phitmp = uitofp i32 %23 to float
-  %27 = fdiv float %22, %phitmp, !fpmath !41
-  %28 = sext i32 %2 to i64
-  %29 = getelementptr inbounds float addrspace(3)* %Avg, i64 %28
-  store float %27, float addrspace(3)* %29, align 4, !tbaa !37
-  br i1 %15, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %26, %.lr.ph
-  %stddev.098 = phi float [ %36, %.lr.ph ], [ 0.000000e+00, %26 ]
-  %addr2.097 = phi i32 [ %37, %.lr.ph ], [ %12, %26 ]
-  %30 = sext i32 %addr2.097 to i64
-  %31 = getelementptr inbounds i32 addrspace(1)* %input, i64 %30
-  %32 = load i32 addrspace(1)* %31, align 4, !tbaa !40
-  %33 = uitofp i32 %32 to float
-  %34 = fsub float %33, %27
-  %35 = fmul float %34, %34
-  %36 = fadd float %stddev.098, %35
-  %37 = add nsw i32 %addr2.097, %4
-  %38 = icmp slt i32 %37, %14
-  br i1 %38, label %.lr.ph, label %._crit_edge
-
-._crit_edge:                                      ; preds = %.lr.ph, %.thread, %26
-  %39 = phi float addrspace(3)* [ %29, %26 ], [ %17, %.thread ], [ %29, %.lr.ph ]
-  %40 = phi i64 [ %28, %26 ], [ %16, %.thread ], [ %28, %.lr.ph ]
-  %count.0.lcssa109 = phi float [ %phitmp, %26 ], [ 0.000000e+00, %.thread ], [ %phitmp, %.lr.ph ]
-  %stddev.0.lcssa = phi float [ 0.000000e+00, %26 ], [ 0.000000e+00, %.thread ], [ %36, %.lr.ph ]
-  %41 = fdiv float %stddev.0.lcssa, %count.0.lcssa109, !fpmath !41
-  %42 = tail call cc75 float @_Z4sqrtf(float %41) nounwind readnone
-  %43 = getelementptr inbounds float addrspace(3)* %StdDev, i64 %40
-  store float %42, float addrspace(3)* %43, align 4, !tbaa !37
-  tail call cc75 void @_Z7barrierj(i32 1) nounwind
-  %44 = icmp slt i32 %2, 256
-  br i1 %44, label %45, label %.thread110
-
-.thread110:                                       ; preds = %._crit_edge
-  tail call cc75 void @_Z7barrierj(i32 1) nounwind
-  br label %.thread111
-
-; <label>:45                                      ; preds = %._crit_edge
-  %46 = add nsw i32 %2, 256
-  %47 = sext i32 %46 to i64
-  %48 = getelementptr inbounds float addrspace(3)* %Avg, i64 %47
-  %49 = load float addrspace(3)* %48, align 4, !tbaa !37
-  %50 = load float addrspace(3)* %39, align 4, !tbaa !37
-  %51 = fadd float %49, %50
-  store float %51, float addrspace(3)* %39, align 4, !tbaa !37
-  %52 = getelementptr inbounds float addrspace(3)* %StdDev, i64 %47
-  %53 = load float addrspace(3)* %52, align 4, !tbaa !37
-  %54 = load float addrspace(3)* %43, align 4, !tbaa !37
-  %55 = fadd float %53, %54
-  store float %55, float addrspace(3)* %43, align 4, !tbaa !37
-  tail call cc75 void @_Z7barrierj(i32 1) nounwind
-  %56 = icmp slt i32 %2, 128
-  br i1 %56, label %124, label %.thread111
-
-; <label>:57                                      ; preds = %149
-  %58 = add nsw i32 %2, 16
-  %59 = sext i32 %58 to i64
-  %60 = getelementptr inbounds float addrspace(3)* %Avg, i64 %59
-  %61 = load float addrspace(3)* %60, align 4, !tbaa !37
-  %62 = load float addrspace(3)* %39, align 4, !tbaa !37
-  %63 = fadd float %61, %62
-  store float %63, float addrspace(3)* %39, align 4, !tbaa !37
-  %64 = getelementptr inbounds float addrspace(3)* %StdDev, i64 %59
-  %65 = load float addrspace(3)* %64, align 4, !tbaa !37
-  %66 = load float addrspace(3)* %43, align 4, !tbaa !37
-  %67 = fadd float %65, %66
-  store float %67, float addrspace(3)* %43, align 4, !tbaa !37
-  %68 = icmp slt i32 %2, 8
-  br i1 %68, label %69, label %.thread95
-
-; <label>:69                                      ; preds = %57
-  %70 = add nsw i32 %2, 8
-  %71 = sext i32 %70 to i64
-  %72 = getelementptr inbounds float addrspace(3)* %Avg, i64 %71
-  %73 = load float addrspace(3)* %72, align 4, !tbaa !37
-  %74 = load float addrspace(3)* %39, align 4, !tbaa !37
-  %75 = fadd float %73, %74
-  store float %75, float addrspace(3)* %39, align 4, !tbaa !37
-  %76 = getelementptr inbounds float addrspace(3)* %StdDev, i64 %71
-  %77 = load float addrspace(3)* %76, align 4, !tbaa !37
-  %78 = load float addrspace(3)* %43, align 4, !tbaa !37
-  %79 = fadd float %77, %78
-  store float %79, float addrspace(3)* %43, align 4, !tbaa !37
-  %80 = icmp slt i32 %2, 4
-  br i1 %80, label %.thread92, label %.thread95
-
-.thread92:                                        ; preds = %69
-  %81 = add nsw i32 %2, 4
-  %82 = sext i32 %81 to i64
-  %83 = getelementptr inbounds float addrspace(3)* %Avg, i64 %82
-  %84 = load float addrspace(3)* %83, align 4, !tbaa !37
-  %85 = load float addrspace(3)* %39, align 4, !tbaa !37
-  %86 = fadd float %84, %85
-  store float %86, float addrspace(3)* %39, align 4, !tbaa !37
-  %87 = getelementptr inbounds float addrspace(3)* %StdDev, i64 %82
-  %88 = load float addrspace(3)* %87, align 4, !tbaa !37
-  %89 = load float addrspace(3)* %43, align 4, !tbaa !37
-  %90 = fadd float %88, %89
-  store float %90, float addrspace(3)* %43, align 4, !tbaa !37
-  %91 = icmp slt i32 %2, 2
-  br i1 %91, label %92, label %.thread95
-
-; <label>:92                                      ; preds = %.thread92
-  %93 = add nsw i32 %2, 2
-  %94 = sext i32 %93 to i64
-  %95 = getelementptr inbounds float addrspace(3)* %Avg, i64 %94
-  %96 = load float addrspace(3)* %95, align 4, !tbaa !37
-  %97 = load float addrspace(3)* %39, align 4, !tbaa !37
-  %98 = fadd float %96, %97
-  store float %98, float addrspace(3)* %39, align 4, !tbaa !37
-  %99 = getelementptr inbounds float addrspace(3)* %StdDev, i64 %94
-  %100 = load float addrspace(3)* %99, align 4, !tbaa !37
-  %101 = load float addrspace(3)* %43, align 4, !tbaa !37
-  %102 = fadd float %100, %101
-  store float %102, float addrspace(3)* %43, align 4, !tbaa !37
-  %103 = icmp eq i32 %2, 0
-  br i1 %103, label %104, label %.thread95
-
-; <label>:104                                     ; preds = %92
-  %105 = load float addrspace(3)* %Avg, align 4, !tbaa !37
-  %106 = getelementptr inbounds float addrspace(3)* %Avg, i64 1
-  %107 = load float addrspace(3)* %106, align 4, !tbaa !37
-  %108 = fadd float %105, %107
-  %109 = fmul float %108, 1.953125e-03
-  %110 = load float addrspace(3)* %StdDev, align 4, !tbaa !37
-  %111 = getelementptr inbounds float addrspace(3)* %StdDev, i64 1
-  %112 = load float addrspace(3)* %111, align 4, !tbaa !37
-  %113 = fadd float %110, %112
-  %114 = fmul float %113, 1.953125e-03
-  %115 = tail call float @llvm.fmuladd.f32(float -1.000000e+01, float %114, float %109)
-  %116 = fptoui float %115 to i32
-  %117 = udiv i32 %116, 24576
-  %118 = tail call cc75 i32 @_Z8atom_minPU3AS1jj(i32 addrspace(1)* %minmax, i32 %117) nounwind
-  %119 = getelementptr inbounds i32 addrspace(1)* %minmax, i64 1
-  %120 = tail call float @llvm.fmuladd.f32(float 1.000000e+01, float %114, float %109)
-  %121 = fptoui float %120 to i32
-  %122 = udiv i32 %121, 24576
-  %123 = tail call cc75 i32 @_Z8atom_maxPU3AS1jj(i32 addrspace(1)* %119, i32 %122) nounwind
-  br label %.thread95
-
-.thread95:                                        ; preds = %147, %.thread112, %57, %149, %69, %.thread92, %104, %92
-  ret void
-
-.thread111:                                       ; preds = %.thread110, %45
-  tail call cc75 void @_Z7barrierj(i32 1) nounwind
-  br label %147
-
-; <label>:124                                     ; preds = %45
-  %125 = add nsw i32 %2, 128
-  %126 = sext i32 %125 to i64
-  %127 = getelementptr inbounds float addrspace(3)* %Avg, i64 %126
-  %128 = load float addrspace(3)* %127, align 4, !tbaa !37
-  %129 = load float addrspace(3)* %39, align 4, !tbaa !37
-  %130 = fadd float %128, %129
-  store float %130, float addrspace(3)* %39, align 4, !tbaa !37
-  %131 = getelementptr inbounds float addrspace(3)* %StdDev, i64 %126
-  %132 = load float addrspace(3)* %131, align 4, !tbaa !37
-  %133 = load float addrspace(3)* %43, align 4, !tbaa !37
-  %134 = fadd float %132, %133
-  store float %134, float addrspace(3)* %43, align 4, !tbaa !37
-  tail call cc75 void @_Z7barrierj(i32 1) nounwind
-  %135 = icmp slt i32 %2, 64
-  br i1 %135, label %136, label %.thread112
-
-.thread112:                                       ; preds = %124
-  tail call cc75 void @_Z7barrierj(i32 1) nounwind
-  br label %.thread95
-
-; <label>:136                                     ; preds = %124
-  %137 = add nsw i32 %2, 64
-  %138 = sext i32 %137 to i64
-  %139 = getelementptr inbounds float addrspace(3)* %Avg, i64 %138
-  %140 = load float addrspace(3)* %139, align 4, !tbaa !37
-  %141 = load float addrspace(3)* %39, align 4, !tbaa !37
-  %142 = fadd float %140, %141
-  store float %142, float addrspace(3)* %39, align 4, !tbaa !37
-  %143 = getelementptr inbounds float addrspace(3)* %StdDev, i64 %138
-  %144 = load float addrspace(3)* %143, align 4, !tbaa !37
-  %145 = load float addrspace(3)* %43, align 4, !tbaa !37
-  %146 = fadd float %144, %145
-  store float %146, float addrspace(3)* %43, align 4, !tbaa !37
-  br label %147
-
-; <label>:147                                     ; preds = %.thread111, %136
-  tail call cc75 void @_Z7barrierj(i32 1) nounwind
-  %148 = icmp slt i32 %2, 32
-  br i1 %148, label %149, label %.thread95
-
-; <label>:149                                     ; preds = %147
-  %150 = add nsw i32 %2, 32
-  %151 = sext i32 %150 to i64
-  %152 = getelementptr inbounds float addrspace(3)* %Avg, i64 %151
-  %153 = load float addrspace(3)* %152, align 4, !tbaa !37
-  %154 = load float addrspace(3)* %39, align 4, !tbaa !37
-  %155 = fadd float %153, %154
-  store float %155, float addrspace(3)* %39, align 4, !tbaa !37
-  %156 = getelementptr inbounds float addrspace(3)* %StdDev, i64 %151
-  %157 = load float addrspace(3)* %156, align 4, !tbaa !37
-  %158 = load float addrspace(3)* %43, align 4, !tbaa !37
-  %159 = fadd float %157, %158
-  store float %159, float addrspace(3)* %43, align 4, !tbaa !37
-  %160 = icmp slt i32 %2, 16
-  br i1 %160, label %57, label %.thread95
-}
-
-declare cc75 i64 @_Z12get_local_idj(i32) nounwind readnone
-
-declare cc75 i64 @_Z14get_local_sizej(i32) nounwind readnone
-
-declare cc75 i64 @_Z12get_group_idj(i32) nounwind readnone
-
-declare cc75 i64 @_Z14get_num_groupsj(i32) nounwind readnone
-
-declare cc75 float @_Z4sqrtf(float) nounwind readnone
-
-declare cc75 void @_Z7barrierj(i32)
-
-declare cc75 i32 @_Z8atom_minPU3AS1jj(i32 addrspace(1)*, i32)
-
-declare float @llvm.fmuladd.f32(float, float, float) nounwind readnone
-
-declare cc75 i32 @_Z8atom_maxPU3AS1jj(i32 addrspace(1)*, i32)
-
-define cc76 void @calculateBin(i32 %bin, <4 x i8> addrspace(1)* nocapture %sm_mapping) nounwind {
-  %1 = lshr i32 %bin, 2
-  %2 = trunc i32 %1 to i8
-  %3 = lshr i32 %bin, 10
-  %4 = urem i32 %3, 24
-  %5 = trunc i32 %4 to i8
-  %6 = udiv i32 %bin, 24576
-  %7 = trunc i32 %6 to i8
-  %8 = shl i32 %bin, 3
-  %.tr = trunc i32 %8 to i8
-  %9 = and i8 %.tr, 24
-  %10 = insertelement <4 x i8> undef, i8 %7, i32 0
-  %11 = insertelement <4 x i8> %10, i8 %5, i32 1
-  %12 = insertelement <4 x i8> %11, i8 %2, i32 2
-  %13 = insertelement <4 x i8> %12, i8 %9, i32 3
-  store <4 x i8> %13, <4 x i8> addrspace(1)* %sm_mapping, align 4, !tbaa !38
-  ret void
-}
-
-define cc76 void @histo_intermediates_kernel(<2 x i32> addrspace(1)* nocapture %input, i64 %b1, i32 %height, i32 %width, i32 %input_pitch, <4 x i8> addrspace(1)* nocapture %sm_mappings, i64 %b2) nounwind {
-  tail call void @llvm.trap()
-  unreachable
-}
-
-define cc75 void @testIncrementGlobal(i32 addrspace(1)* %global_histo, i32 %sm_range_min, i32 %sm_range_max, <4 x i8> %sm) nounwind {
-  %1 = extractelement <4 x i8> %sm, i32 0
-  %2 = zext i8 %1 to i32
-  %3 = icmp ult i32 %2, %sm_range_min
-  %4 = icmp ugt i32 %2, %sm_range_max
-  %or.cond = or i1 %3, %4
-  br i1 %or.cond, label %5, label %31
-
-; <label>:5                                       ; preds = %0
-  %6 = extractelement <4 x i8> %sm, i32 3
-  %7 = extractelement <4 x i8> %sm, i32 2
-  %8 = extractelement <4 x i8> %sm, i32 1
-  %9 = zext i8 %6 to i32
-  %10 = zext i8 %7 to i32
-  %11 = zext i8 %8 to i32
-  %12 = mul i32 %2, 24576
-  %13 = lshr i32 %9, 3
-  %14 = or i32 %12, %13
-  %15 = shl nuw nsw i32 %10, 2
-  %16 = shl nuw nsw i32 %11, 10
-  %17 = or i32 %16, %15
-  %18 = add i32 %17, %14
-  %19 = lshr i32 %18, 1
-  %20 = shl nsw i32 %18, 4
-  %21 = and i32 %20, 16
-  %22 = zext i32 %19 to i64
-  %23 = getelementptr inbounds i32 addrspace(1)* %global_histo, i64 %22
-  %24 = load i32 addrspace(1)* %23, align 4, !tbaa !40
-  %25 = lshr i32 %24, %21
-  %26 = and i32 %25, 65535
-  %27 = icmp ult i32 %26, 255
-  br i1 %27, label %28, label %31
-
-; <label>:28                                      ; preds = %5
-  %29 = shl i32 1, %21
-  %30 = tail call cc75 i32 @_Z8atom_addPU3AS1jj(i32 addrspace(1)* %23, i32 %29) nounwind
-  br label %31
-
-; <label>:31                                      ; preds = %0, %5, %28
-  ret void
-}
-
-declare cc75 i32 @_Z8atom_addPU3AS1jj(i32 addrspace(1)*, i32)
-
-define cc75 void @testIncrementLocal(i32 addrspace(1)* %global_overflow, i32 addrspace(3)* %smem, i32 %myRange, <4 x i8> %sm) nounwind {
-  %1 = extractelement <4 x i8> %sm, i32 0
-  %2 = zext i8 %1 to i32
-  %3 = extractelement <4 x i8> %sm, i32 1
-  %4 = zext i8 %3 to i32
-  %5 = extractelement <4 x i8> %sm, i32 2
-  %6 = zext i8 %5 to i32
-  %7 = extractelement <4 x i8> %sm, i32 3
-  %8 = zext i8 %7 to i32
-  %9 = icmp eq i32 %2, %myRange
-  br i1 %9, label %10, label %76
-
-; <label>:10                                      ; preds = %0
-  %11 = shl i32 1, %8
-  %12 = shl nuw nsw i32 %4, 8
-  %13 = or i32 %12, %6
-  %14 = zext i32 %13 to i64
-  %15 = getelementptr inbounds i32 addrspace(3)* %smem, i64 %14
-  %16 = tail call cc75 i32 @_Z8atom_addPU3AS3jj(i32 addrspace(3)* %15, i32 %11) nounwind
-  %17 = lshr i32 %16, %8
-  %18 = and i32 %17, 255
-  %19 = icmp eq i32 %18, 255
-  br i1 %19, label %20, label %76
-
-; <label>:20                                      ; preds = %10
-  %21 = mul i32 %myRange, 24576
-  %22 = lshr i32 %8, 3
-  %23 = or i32 %21, %22
-  %24 = shl nuw nsw i32 %6, 2
-  %25 = shl nuw nsw i32 %4, 10
-  %26 = or i32 %25, %24
-  %27 = add i32 %26, %23
-  %28 = icmp ult i8 %7, 24
-  %29 = icmp ult i8 %7, 16
-  %30 = icmp ult i8 %7, 8
-  %31 = add i32 %8, 8
-  %32 = lshr i32 %16, %31
-  %33 = and i32 %32, 255
-  %34 = add i32 %8, 16
-  %35 = lshr i32 %16, %34
-  %36 = and i32 %35, 255
-  %37 = add i32 %8, 24
-  %38 = lshr i32 %16, %37
-  %39 = and i32 %38, 255
-  %40 = icmp eq i32 %33, 255
-  %or.cond = and i1 %29, %40
-  %41 = icmp eq i32 %36, 255
-  %or.cond33 = and i1 %30, %41
-  br i1 %28, label %42, label %.thread
-
-.thread:                                          ; preds = %20
-  br i1 %or.cond, label %45, label %48
-
-; <label>:42                                      ; preds = %20
-  %43 = icmp ne i32 %33, 255
-  %44 = select i1 %43, i32 -1, i32 255
-  br i1 %or.cond, label %45, label %48
-
-; <label>:45                                      ; preds = %.thread, %42
-  %bin_plus_1_add.050 = phi i32 [ %44, %42 ], [ undef, %.thread ]
-  %overflow_into_bin_plus_1.0.off04048 = phi i1 [ true, %42 ], [ false, %.thread ]
-  %46 = icmp ne i32 %36, 255
-  %47 = select i1 %46, i32 -1, i32 255
-  br i1 %or.cond33, label %49, label %55
-
-; <label>:48                                      ; preds = %.thread, %42
-  %bin_plus_1_add.049 = phi i32 [ %44, %42 ], [ undef, %.thread ]
-  %overflow_into_bin_plus_1.0.off04047 = phi i1 [ true, %42 ], [ false, %.thread ]
-  br i1 %or.cond33, label %49, label %55
-
-; <label>:49                                      ; preds = %45, %48
-  %bin_plus_2_add.060 = phi i32 [ %47, %45 ], [ undef, %48 ]
-  %.34414556 = phi i1 [ true, %45 ], [ false, %48 ]
-  %overflow_into_bin_plus_1.0.off0404754 = phi i1 [ %overflow_into_bin_plus_1.0.off04048, %45 ], [ %overflow_into_bin_plus_1.0.off04047, %48 ]
-  %bin_plus_1_add.04952 = phi i32 [ %bin_plus_1_add.050, %45 ], [ %bin_plus_1_add.049, %48 ]
-  %50 = icmp ne i32 %39, 255
-  %51 = select i1 %50, i32 -1, i32 255
-  %52 = zext i32 %27 to i64
-  %53 = getelementptr inbounds i32 addrspace(1)* %global_overflow, i64 %52
-  %54 = tail call cc75 i32 @_Z8atom_addPU3AS1jj(i32 addrspace(1)* %53, i32 256) nounwind
-  br i1 %overflow_into_bin_plus_1.0.off0404754, label %59, label %64
-
-; <label>:55                                      ; preds = %45, %48
-  %bin_plus_2_add.059 = phi i32 [ undef, %48 ], [ %47, %45 ]
-  %.34414555 = phi i1 [ false, %48 ], [ true, %45 ]
-  %overflow_into_bin_plus_1.0.off0404753 = phi i1 [ %overflow_into_bin_plus_1.0.off04047, %48 ], [ %overflow_into_bin_plus_1.0.off04048, %45 ]
-  %bin_plus_1_add.04951 = phi i32 [ %bin_plus_1_add.049, %48 ], [ %bin_plus_1_add.050, %45 ]
-  %56 = zext i32 %27 to i64
-  %57 = getelementptr inbounds i32 addrspace(1)* %global_overflow, i64 %56
-  %58 = tail call cc75 i32 @_Z8atom_addPU3AS1jj(i32 addrspace(1)* %57, i32 256) nounwind
-  br i1 %overflow_into_bin_plus_1.0.off0404753, label %59, label %64
-
-; <label>:59                                      ; preds = %49, %55
-  %bin_plus_3_add.069 = phi i32 [ %51, %49 ], [ undef, %55 ]
-  %bin_plus_1_add.0495167 = phi i32 [ %bin_plus_1_add.04952, %49 ], [ %bin_plus_1_add.04951, %55 ]
-  %.3441455566 = phi i1 [ %.34414556, %49 ], [ %.34414555, %55 ]
-  %.3542435764 = phi i1 [ true, %49 ], [ false, %55 ]
-  %bin_plus_2_add.05962 = phi i32 [ %bin_plus_2_add.060, %49 ], [ %bin_plus_2_add.059, %55 ]
-  %60 = add i32 %27, 1
-  %61 = zext i32 %60 to i64
-  %62 = getelementptr inbounds i32 addrspace(1)* %global_overflow, i64 %61
-  %63 = tail call cc75 i32 @_Z8atom_addPU3AS1jj(i32 addrspace(1)* %62, i32 %bin_plus_1_add.0495167) nounwind
-  br i1 %.3441455566, label %65, label %70
-
-; <label>:64                                      ; preds = %49, %55
-  %bin_plus_3_add.068 = phi i32 [ %51, %49 ], [ undef, %55 ]
-  %.3441455565 = phi i1 [ %.34414556, %49 ], [ %.34414555, %55 ]
-  %.3542435763 = phi i1 [ true, %49 ], [ false, %55 ]
-  %bin_plus_2_add.05961 = phi i32 [ %bin_plus_2_add.060, %49 ], [ %bin_plus_2_add.059, %55 ]
-  br i1 %.3441455565, label %65, label %70
-
-; <label>:65                                      ; preds = %59, %64
-  %bin_plus_2_add.0596174 = phi i32 [ %bin_plus_2_add.05962, %59 ], [ %bin_plus_2_add.05961, %64 ]
-  %.354243576373 = phi i1 [ %.3542435764, %59 ], [ %.3542435763, %64 ]
-  %bin_plus_3_add.06871 = phi i32 [ %bin_plus_3_add.069, %59 ], [ %bin_plus_3_add.068, %64 ]
-  %66 = add i32 %27, 2
-  %67 = zext i32 %66 to i64
-  %68 = getelementptr inbounds i32 addrspace(1)* %global_overflow, i64 %67
-  %69 = tail call cc75 i32 @_Z8atom_addPU3AS1jj(i32 addrspace(1)* %68, i32 %bin_plus_2_add.0596174) nounwind
-  br i1 %.354243576373, label %71, label %76
-
-; <label>:70                                      ; preds = %59, %64
-  %.354243576372 = phi i1 [ %.3542435764, %59 ], [ %.3542435763, %64 ]
-  %bin_plus_3_add.06870 = phi i32 [ %bin_plus_3_add.069, %59 ], [ %bin_plus_3_add.068, %64 ]
-  br i1 %.354243576372, label %71, label %76
-
-; <label>:71                                      ; preds = %65, %70
-  %bin_plus_3_add.0687075 = phi i32 [ %bin_plus_3_add.06871, %65 ], [ %bin_plus_3_add.06870, %70 ]
-  %72 = add i32 %27, 3
-  %73 = zext i32 %72 to i64
-  %74 = getelementptr inbounds i32 addrspace(1)* %global_overflow, i64 %73
-  %75 = tail call cc75 i32 @_Z8atom_addPU3AS1jj(i32 addrspace(1)* %74, i32 %bin_plus_3_add.0687075) nounwind
-  br label %76
-
-; <label>:76                                      ; preds = %65, %10, %71, %70, %0
-  ret void
-}
-
-declare cc75 i32 @_Z8atom_addPU3AS3jj(i32 addrspace(3)*, i32)
-
-define cc75 void @clearMemory(i32 addrspace(3)* nocapture %smem) nounwind {
-  %1 = tail call cc75 i64 @_Z12get_local_idj(i32 0) nounwind readnone
-  %2 = trunc i64 %1 to i32
-  %3 = tail call cc75 i64 @_Z14get_local_sizej(i32 0) nounwind readnone
-  %4 = trunc i64 %3 to i32
-  %5 = icmp slt i32 %2, 6144
-  br i1 %5, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %i.03 = phi i32 [ %8, %.lr.ph ], [ %2, %0 ]
-  %6 = sext i32 %i.03 to i64
-  %7 = getelementptr inbounds i32 addrspace(3)* %smem, i64 %6
-  store i32 0, i32 addrspace(3)* %7, align 4, !tbaa !40
-  %8 = add nsw i32 %i.03, %4
-  %9 = icmp slt i32 %8, 6144
-  br i1 %9, label %.lr.ph, label %._crit_edge
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret void
-}
-
-define cc75 void @copyMemory(i32 addrspace(1)* nocapture %dst, i32 addrspace(3)* nocapture %src) nounwind {
-  %1 = tail call cc75 i64 @_Z12get_local_idj(i32 0) nounwind readnone
-  %2 = trunc i64 %1 to i32
-  %3 = tail call cc75 i64 @_Z14get_local_sizej(i32 0) nounwind readnone
-  %4 = trunc i64 %3 to i32
-  %5 = icmp slt i32 %2, 6144
-  br i1 %5, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %i.05 = phi i32 [ %10, %.lr.ph ], [ %2, %0 ]
-  %6 = sext i32 %i.05 to i64
-  %7 = getelementptr inbounds i32 addrspace(3)* %src, i64 %6
-  %8 = load i32 addrspace(3)* %7, align 4, !tbaa !40
-  %9 = getelementptr inbounds i32 addrspace(1)* %dst, i64 %6
-  store i32 %8, i32 addrspace(1)* %9, align 4, !tbaa !40
-  %10 = add nsw i32 %i.05, %4
-  %11 = icmp slt i32 %10, 6144
-  br i1 %11, label %.lr.ph, label %._crit_edge
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret void
-}
-
-define cc76 void @histo_main_kernel(<4 x i8> addrspace(1)* nocapture %sm_mappings, i64 %b1, i32 %num_elements, i32 %sm_range_min, i32 %sm_range_max, i32 %histo_height, i32 %histo_width, i32 addrspace(1)* nocapture %global_subhisto, i64 %b3, i32 addrspace(1)* %global_histo, i64 %b4, i32 addrspace(1)* %global_overflow, i64 %b5, i32 addrspace(3)* %sub_histo, i64 %b6) nounwind {
-  %1 = tail call cc75 i64 @_Z14get_local_sizej(i32 0) nounwind readnone
-  %2 = trunc i64 %1 to i32
-  %3 = tail call cc75 i64 @_Z14get_num_groupsj(i32 0) nounwind readnone
-  %4 = trunc i64 %3 to i32
-  %5 = zext i32 %sm_range_min to i64
-  %6 = tail call cc75 i64 @_Z12get_group_idj(i32 1) nounwind readnone
-  %7 = add i64 %6, %5
-  %8 = trunc i64 %7 to i32
-  %9 = tail call cc75 i64 @_Z12get_group_idj(i32 0) nounwind readnone
-  %10 = and i64 %1, 4294967295
-  %11 = mul i64 %9, %10
-  %12 = tail call cc75 i64 @_Z12get_local_idj(i32 0) nounwind readnone
-  %13 = add i64 %11, %12
-  %14 = trunc i64 %13 to i32
-  %15 = trunc i64 %12 to i32
-  %16 = icmp slt i32 %15, 6144
-  br i1 %16, label %.lr.ph.i, label %clearMemory.exit
-
-.lr.ph.i:                                         ; preds = %0, %.lr.ph.i
-  %i.03.i = phi i32 [ %19, %.lr.ph.i ], [ %15, %0 ]
-  %17 = sext i32 %i.03.i to i64
-  %18 = getelementptr inbounds i32 addrspace(3)* %sub_histo, i64 %17
-  store i32 0, i32 addrspace(3)* %18, align 4, !tbaa !40
-  %19 = add nsw i32 %i.03.i, %2
-  %20 = icmp slt i32 %19, 6144
-  br i1 %20, label %.lr.ph.i, label %clearMemory.exit
-
-clearMemory.exit:                                 ; preds = %.lr.ph.i, %0
-  tail call cc75 void @_Z7barrierj(i32 1) nounwind
-  %21 = icmp eq i64 %6, 0
-  %22 = icmp ult i32 %14, %num_elements
-  br i1 %21, label %testIncrementGlobal.exit.preheader, label %.preheader
-
-.preheader:                                       ; preds = %clearMemory.exit
-  br i1 %22, label %.lr.ph33, label %.loopexit
-
-.lr.ph33:                                         ; preds = %.preheader
-  %23 = mul i32 %4, %2
-  br label %61
-
-testIncrementGlobal.exit.preheader:               ; preds = %clearMemory.exit
-  br i1 %22, label %.lr.ph, label %.loopexit
-
-.lr.ph:                                           ; preds = %testIncrementGlobal.exit.preheader
-  %24 = mul i32 %4, %2
-  br label %25
-
-; <label>:25                                      ; preds = %.lr.ph, %testIncrementGlobal.exit.backedge
-  %local_scan_load.030 = phi i32 [ %14, %.lr.ph ], [ %29, %testIncrementGlobal.exit.backedge ]
-  %26 = zext i32 %local_scan_load.030 to i64
-  %27 = getelementptr inbounds <4 x i8> addrspace(1)* %sm_mappings, i64 %26
-  %28 = load <4 x i8> addrspace(1)* %27, align 4, !tbaa !38
-  %29 = add i32 %local_scan_load.030, %24
-  tail call cc75 void @testIncrementLocal(i32 addrspace(1)* %global_overflow, i32 addrspace(3)* %sub_histo, i32 %8, <4 x i8> %28)
-  %30 = extractelement <4 x i8> %28, i32 0
-  %31 = zext i8 %30 to i32
-  %32 = icmp ult i32 %31, %sm_range_min
-  %33 = icmp ugt i32 %31, %sm_range_max
-  %or.cond.i = or i1 %32, %33
-  br i1 %or.cond.i, label %35, label %testIncrementGlobal.exit.backedge
-
-testIncrementGlobal.exit.backedge:                ; preds = %25, %35, %58
-  %34 = icmp ult i32 %29, %num_elements
-  br i1 %34, label %25, label %.loopexit
-
-; <label>:35                                      ; preds = %25
-  %36 = extractelement <4 x i8> %28, i32 3
-  %37 = extractelement <4 x i8> %28, i32 2
-  %38 = extractelement <4 x i8> %28, i32 1
-  %39 = zext i8 %36 to i32
-  %40 = zext i8 %37 to i32
-  %41 = zext i8 %38 to i32
-  %42 = mul i32 %31, 24576
-  %43 = lshr i32 %39, 3
-  %44 = or i32 %42, %43
-  %45 = shl nuw nsw i32 %40, 2
-  %46 = shl nuw nsw i32 %41, 10
-  %47 = or i32 %46, %45
-  %48 = add i32 %47, %44
-  %49 = lshr i32 %48, 1
-  %50 = shl nsw i32 %48, 4
-  %51 = and i32 %50, 16
-  %52 = zext i32 %49 to i64
-  %53 = getelementptr inbounds i32 addrspace(1)* %global_histo, i64 %52
-  %54 = load i32 addrspace(1)* %53, align 4, !tbaa !40
-  %55 = lshr i32 %54, %51
-  %56 = and i32 %55, 65535
-  %57 = icmp ult i32 %56, 255
-  br i1 %57, label %58, label %testIncrementGlobal.exit.backedge
-
-; <label>:58                                      ; preds = %35
-  %59 = shl i32 1, %51
-  %60 = tail call cc75 i32 @_Z8atom_addPU3AS1jj(i32 addrspace(1)* %53, i32 %59) nounwind
-  br label %testIncrementGlobal.exit.backedge
-
-; <label>:61                                      ; preds = %.lr.ph33, %61
-  %local_scan_load.132 = phi i32 [ %14, %.lr.ph33 ], [ %65, %61 ]
-  %62 = zext i32 %local_scan_load.132 to i64
-  %63 = getelementptr inbounds <4 x i8> addrspace(1)* %sm_mappings, i64 %62
-  %64 = load <4 x i8> addrspace(1)* %63, align 4, !tbaa !38
-  %65 = add i32 %local_scan_load.132, %23
-  tail call cc75 void @testIncrementLocal(i32 addrspace(1)* %global_overflow, i32 addrspace(3)* %sub_histo, i32 %8, <4 x i8> %64)
-  %66 = icmp ult i32 %65, %num_elements
-  br i1 %66, label %61, label %.loopexit
-
-.loopexit:                                        ; preds = %.preheader, %61, %testIncrementGlobal.exit.preheader, %testIncrementGlobal.exit.backedge
-  %67 = mul i32 %histo_width, %histo_height
-  %68 = lshr i32 %67, 2
-  %69 = zext i32 %68 to i64
-  %70 = mul i64 %9, %69
-  %71 = mul i32 %8, 24576
-  %72 = lshr exact i32 %71, 2
-  %73 = zext i32 %72 to i64
-  %74 = add i64 %73, %70
-  tail call cc75 void @_Z7barrierj(i32 1) nounwind
-  %75 = and i64 %74, 4294967295
-  br i1 %16, label %.lr.ph.i29, label %copyMemory.exit
-
-.lr.ph.i29:                                       ; preds = %.loopexit, %.lr.ph.i29
-  %i.05.i = phi i32 [ %80, %.lr.ph.i29 ], [ %15, %.loopexit ]
-  %76 = sext i32 %i.05.i to i64
-  %77 = getelementptr inbounds i32 addrspace(3)* %sub_histo, i64 %76
-  %78 = load i32 addrspace(3)* %77, align 4, !tbaa !40
-  %.sum = add i64 %76, %75
-  %79 = getelementptr inbounds i32 addrspace(1)* %global_subhisto, i64 %.sum
-  store i32 %78, i32 addrspace(1)* %79, align 4, !tbaa !40
-  %80 = add nsw i32 %i.05.i, %2
-  %81 = icmp slt i32 %80, 6144
-  br i1 %81, label %.lr.ph.i29, label %copyMemory.exit
-
-copyMemory.exit:                                  ; preds = %.lr.ph.i29, %.loopexit
-  ret void
-}
-
-define cc76 void @histo_final_kernel(i32 %sm_range_min, i32 %sm_range_max, i32 %histo_height, i32 %histo_width, i32 addrspace(1)* nocapture %global_subhisto, i64 %b1, i32 addrspace(1)* nocapture %global_histo, i64 %b2, i32 addrspace(1)* nocapture %global_overflow, i64 %b3, i32 addrspace(1)* nocapture %final_histo, i64 %b4) nounwind {
-  %1 = tail call cc75 i64 @_Z14get_local_sizej(i32 0) nounwind readnone
-  %2 = trunc i64 %1 to i32
-  %3 = tail call cc75 i64 @_Z14get_num_groupsj(i32 0) nounwind readnone
-  %4 = trunc i64 %3 to i32
-  %5 = tail call cc75 i64 @_Z12get_local_idj(i32 0) nounwind readnone
-  %6 = tail call cc75 i64 @_Z12get_group_idj(i32 0) nounwind readnone
-  %7 = and i64 %1, 4294967295
-  %8 = mul i64 %6, %7
-  %9 = add i64 %8, %5
-  %10 = trunc i64 %9 to i32
-  %11 = mul i32 %sm_range_min, 24576
-  %12 = sub i32 %sm_range_max, %sm_range_min
-  %13 = mul i32 %12, 24576
-  %14 = lshr exact i32 %11, 2
-  %15 = icmp ult i32 %10, %14
-  br i1 %15, label %.lr.ph111, label %._crit_edge112
-
-.lr.ph111:                                        ; preds = %0
-  %16 = bitcast i32 addrspace(1)* %global_histo to <4 x i16> addrspace(1)*
-  %17 = mul i32 %4, %2
-  br label %18
-
-; <label>:18                                      ; preds = %.lr.ph111, %18
-  %i.0109 = phi i32 [ %10, %.lr.ph111 ], [ %40, %18 ]
-  %19 = zext i32 %i.0109 to i64
-  %20 = getelementptr inbounds <4 x i16> addrspace(1)* %16, i64 %19
-  %21 = load <4 x i16> addrspace(1)* %20, align 8, !tbaa !38
-  store <4 x i16> zeroinitializer, <4 x i16> addrspace(1)* %20, align 8, !tbaa !38
-  %22 = extractelement <4 x i16> %21, i32 0
-  %23 = tail call cc75 zeroext i16 @_Z3mintt(i16 zeroext %22, i16 zeroext 255) nounwind readnone
-  %24 = extractelement <4 x i16> %21, i32 1
-  %25 = tail call cc75 zeroext i16 @_Z3mintt(i16 zeroext %24, i16 zeroext 255) nounwind readnone
-  %26 = extractelement <4 x i16> %21, i32 2
-  %27 = tail call cc75 zeroext i16 @_Z3mintt(i16 zeroext %26, i16 zeroext 255) nounwind readnone
-  %28 = extractelement <4 x i16> %21, i32 3
-  %29 = tail call cc75 zeroext i16 @_Z3mintt(i16 zeroext %28, i16 zeroext 255) nounwind readnone
-  %30 = trunc i16 %23 to i8
-  %31 = insertelement <4 x i8> undef, i8 %30, i32 0
-  %32 = trunc i16 %25 to i8
-  %33 = insertelement <4 x i8> %31, i8 %32, i32 1
-  %34 = trunc i16 %27 to i8
-  %35 = insertelement <4 x i8> %33, i8 %34, i32 2
-  %36 = trunc i16 %29 to i8
-  %37 = insertelement <4 x i8> %35, i8 %36, i32 3
-  %38 = getelementptr inbounds i32 addrspace(1)* %final_histo, i64 %19
-  %39 = bitcast i32 addrspace(1)* %38 to <4 x i8> addrspace(1)*
-  store <4 x i8> %37, <4 x i8> addrspace(1)* %39, align 4, !tbaa !38
-  %40 = add i32 %i.0109, %17
-  %41 = icmp ult i32 %40, %14
-  br i1 %41, label %18, label %._crit_edge112
-
-._crit_edge112:                                   ; preds = %18, %0
-  %42 = add i32 %10, %14
-  %43 = add i32 %11, 24576
-  %44 = add i32 %43, %13
-  %45 = lshr exact i32 %44, 2
-  %46 = icmp ult i32 %42, %45
-  br i1 %46, label %.lr.ph106, label %._crit_edge117
-
-._crit_edge117:                                   ; preds = %._crit_edge112
-  %.pre = mul i32 %histo_width, %histo_height
-  br label %._crit_edge107
-
-.lr.ph106:                                        ; preds = %._crit_edge112
-  %47 = bitcast i32 addrspace(1)* %global_overflow to <4 x i32> addrspace(1)*
-  %48 = mul i32 %histo_width, %histo_height
-  %49 = mul i32 %4, %2
-  br label %50
-
-; <label>:50                                      ; preds = %.lr.ph106, %81
-  %i1.0104 = phi i32 [ %42, %.lr.ph106 ], [ %96, %81 ]
-  %51 = zext i32 %i1.0104 to i64
-  %52 = getelementptr inbounds <4 x i32> addrspace(1)* %47, i64 %51
-  %53 = load <4 x i32> addrspace(1)* %52, align 16, !tbaa !38
-  store <4 x i32> zeroinitializer, <4 x i32> addrspace(1)* %52, align 16, !tbaa !38
-  br label %54
-
-; <label>:54                                      ; preds = %50, %54
-  %internal_histo_data.0103 = phi <4 x i32> [ %53, %50 ], [ %78, %54 ]
-  %j.0102 = phi i32 [ 0, %50 ], [ %79, %54 ]
-  %55 = mul i32 %48, %j.0102
-  %56 = lshr i32 %55, 2
-  %57 = add i32 %56, %i1.0104
-  %58 = zext i32 %57 to i64
-  %59 = getelementptr inbounds i32 addrspace(1)* %global_subhisto, i64 %58
-  %60 = load i32 addrspace(1)* %59, align 4, !tbaa !40
-  %61 = and i32 %60, 255
-  %62 = extractelement <4 x i32> %internal_histo_data.0103, i32 0
-  %63 = add i32 %61, %62
-  %64 = insertelement <4 x i32> undef, i32 %63, i32 0
-  %65 = lshr i32 %60, 8
-  %66 = and i32 %65, 255
-  %67 = extractelement <4 x i32> %internal_histo_data.0103, i32 1
-  %68 = add i32 %66, %67
-  %69 = insertelement <4 x i32> %64, i32 %68, i32 1
-  %70 = lshr i32 %60, 16
-  %71 = and i32 %70, 255
-  %72 = extractelement <4 x i32> %internal_histo_data.0103, i32 2
-  %73 = add i32 %71, %72
-  %74 = insertelement <4 x i32> %69, i32 %73, i32 2
-  %75 = lshr i32 %60, 24
-  %76 = extractelement <4 x i32> %internal_histo_data.0103, i32 3
-  %77 = add i32 %75, %76
-  %78 = insertelement <4 x i32> %74, i32 %77, i32 3
-  %79 = add nsw i32 %j.0102, 1
-  %80 = icmp slt i32 %79, 14
-  br i1 %80, label %54, label %81
-
-; <label>:81                                      ; preds = %54
-  %82 = tail call cc75 i32 @_Z3minjj(i32 %63, i32 255) nounwind readnone
-  %83 = tail call cc75 i32 @_Z3minjj(i32 %68, i32 255) nounwind readnone
-  %84 = tail call cc75 i32 @_Z3minjj(i32 %73, i32 255) nounwind readnone
-  %85 = tail call cc75 i32 @_Z3minjj(i32 %77, i32 255) nounwind readnone
-  %86 = trunc i32 %82 to i8
-  %87 = insertelement <4 x i8> undef, i8 %86, i32 0
-  %88 = trunc i32 %83 to i8
-  %89 = insertelement <4 x i8> %87, i8 %88, i32 1
-  %90 = trunc i32 %84 to i8
-  %91 = insertelement <4 x i8> %89, i8 %90, i32 2
-  %92 = trunc i32 %85 to i8
-  %93 = insertelement <4 x i8> %91, i8 %92, i32 3
-  %94 = getelementptr inbounds i32 addrspace(1)* %final_histo, i64 %51
-  %95 = bitcast i32 addrspace(1)* %94 to <4 x i8> addrspace(1)*
-  store <4 x i8> %93, <4 x i8> addrspace(1)* %95, align 4, !tbaa !38
-  %96 = add i32 %i1.0104, %49
-  %97 = icmp ult i32 %96, %45
-  br i1 %97, label %50, label %._crit_edge107
-
-._crit_edge107:                                   ; preds = %81, %._crit_edge117
-  %.pre-phi = phi i32 [ %.pre, %._crit_edge117 ], [ %48, %81 ]
-  %98 = add i32 %10, %45
-  %99 = lshr i32 %.pre-phi, 2
-  %100 = icmp ult i32 %98, %99
-  br i1 %100, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %._crit_edge107
-  %101 = bitcast i32 addrspace(1)* %global_histo to <4 x i16> addrspace(1)*
-  %102 = mul i32 %4, %2
-  br label %103
-
-; <label>:103                                     ; preds = %.lr.ph, %103
-  %i4.0101 = phi i32 [ %98, %.lr.ph ], [ %125, %103 ]
-  %104 = zext i32 %i4.0101 to i64
-  %105 = getelementptr inbounds <4 x i16> addrspace(1)* %101, i64 %104
-  %106 = load <4 x i16> addrspace(1)* %105, align 8, !tbaa !38
-  store <4 x i16> zeroinitializer, <4 x i16> addrspace(1)* %105, align 8, !tbaa !38
-  %107 = extractelement <4 x i16> %106, i32 0
-  %108 = tail call cc75 zeroext i16 @_Z3mintt(i16 zeroext %107, i16 zeroext 255) nounwind readnone
-  %109 = extractelement <4 x i16> %106, i32 1
-  %110 = tail call cc75 zeroext i16 @_Z3mintt(i16 zeroext %109, i16 zeroext 255) nounwind readnone
-  %111 = extractelement <4 x i16> %106, i32 2
-  %112 = tail call cc75 zeroext i16 @_Z3mintt(i16 zeroext %111, i16 zeroext 255) nounwind readnone
-  %113 = extractelement <4 x i16> %106, i32 3
-  %114 = tail call cc75 zeroext i16 @_Z3mintt(i16 zeroext %113, i16 zeroext 255) nounwind readnone
-  %115 = trunc i16 %108 to i8
-  %116 = insertelement <4 x i8> undef, i8 %115, i32 0
-  %117 = trunc i16 %110 to i8
-  %118 = insertelement <4 x i8> %116, i8 %117, i32 1
-  %119 = trunc i16 %112 to i8
-  %120 = insertelement <4 x i8> %118, i8 %119, i32 2
-  %121 = trunc i16 %114 to i8
-  %122 = insertelement <4 x i8> %120, i8 %121, i32 3
-  %123 = getelementptr inbounds i32 addrspace(1)* %final_histo, i64 %104
-  %124 = bitcast i32 addrspace(1)* %123 to <4 x i8> addrspace(1)*
-  store <4 x i8> %122, <4 x i8> addrspace(1)* %124, align 4, !tbaa !38
-  %125 = add i32 %i4.0101, %102
-  %126 = icmp ult i32 %125, %99
-  br i1 %126, label %103, label %._crit_edge
-
-._crit_edge:                                      ; preds = %103, %._crit_edge107
-  ret void
-}
-
-declare cc75 zeroext i16 @_Z3mintt(i16 zeroext, i16 zeroext) nounwind readnone
-
-declare cc75 i32 @_Z3minjj(i32, i32) nounwind readnone
-
-declare void @llvm.trap() noreturn nounwind
-
-!opencl.kernels = !{!0, !7, !14, !21, !28}
-!opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!35}
-!opencl.ocl.version = !{!35}
-!opencl.used.extensions = !{!36}
-!opencl.used.optional.core.features = !{!36}
-!opencl.compiler.options = !{!36}
-
-!0 = metadata !{void (i32 addrspace(1)*, i64, i32, i32 addrspace(1)*, i64, float addrspace(3)*, i64, float addrspace(3)*, i64)* @histo_prescan_kernel, metadata !1, metadata !2, metadata !3, metadata !4, metadata !5, metadata !6}
-!1 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 0, i32 0, i32 1, i32 0, i32 3, i32 0, i32 3, i32 0}
-!2 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none"}
-!3 = metadata !{metadata !"kernel_arg_type", metadata !"uint*", metadata !"long", metadata !"int", metadata !"uint*", metadata !"long", metadata !"float*", metadata !"long", metadata !"float*", metadata !"long"}
-!4 = metadata !{metadata !"kernel_arg_type_qual", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !""}
-!5 = metadata !{metadata !"kernel_arg_base_type", metadata !"uint*", metadata !"long", metadata !"int", metadata !"uint*", metadata !"long", metadata !"float*", metadata !"long", metadata !"float*", metadata !"long"}
-!6 = metadata !{metadata !"kernel_arg_name", metadata !"input", metadata !"b1", metadata !"size", metadata !"minmax", metadata !"b2", metadata !"Avg", metadata !"b3", metadata !"StdDev", metadata !"b4"}
-!7 = metadata !{void (i32, <4 x i8> addrspace(1)*)* @calculateBin, metadata !8, metadata !9, metadata !10, metadata !11, metadata !12, metadata !13}
-!8 = metadata !{metadata !"kernel_arg_addr_space", i32 0, i32 1}
-!9 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none"}
-!10 = metadata !{metadata !"kernel_arg_type", metadata !"uint", metadata !"uchar4*"}
-!11 = metadata !{metadata !"kernel_arg_type_qual", metadata !"const", metadata !""}
-!12 = metadata !{metadata !"kernel_arg_base_type", metadata !"uint", metadata !"uchar4*"}
-!13 = metadata !{metadata !"kernel_arg_name", metadata !"bin", metadata !"sm_mapping"}
-!14 = metadata !{void (<2 x i32> addrspace(1)*, i64, i32, i32, i32, <4 x i8> addrspace(1)*, i64)* @histo_intermediates_kernel, metadata !15, metadata !16, metadata !17, metadata !18, metadata !19, metadata !20}
-!15 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0}
-!16 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none"}
-!17 = metadata !{metadata !"kernel_arg_type", metadata !"uint2*", metadata !"long", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uchar4*", metadata !"long"}
-!18 = metadata !{metadata !"kernel_arg_type_qual", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !""}
-!19 = metadata !{metadata !"kernel_arg_base_type", metadata !"uint2*", metadata !"long", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uchar4*", metadata !"long"}
-!20 = metadata !{metadata !"kernel_arg_name", metadata !"input", metadata !"b1", metadata !"height", metadata !"width", metadata !"input_pitch", metadata !"sm_mappings", metadata !"b2"}
-!21 = metadata !{void (<4 x i8> addrspace(1)*, i64, i32, i32, i32, i32, i32, i32 addrspace(1)*, i64, i32 addrspace(1)*, i64, i32 addrspace(1)*, i64, i32 addrspace(3)*, i64)* @histo_main_kernel, metadata !22, metadata !23, metadata !24, metadata !25, metadata !26, metadata !27}
-!22 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 3, i32 0}
-!23 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none"}
-!24 = metadata !{metadata !"kernel_arg_type", metadata !"uchar4*", metadata !"long", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long"}
-!25 = metadata !{metadata !"kernel_arg_type_qual", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !""}
-!26 = metadata !{metadata !"kernel_arg_base_type", metadata !"uchar4*", metadata !"long", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long"}
-!27 = metadata !{metadata !"kernel_arg_name", metadata !"sm_mappings", metadata !"b1", metadata !"num_elements", metadata !"sm_range_min", metadata !"sm_range_max", metadata !"histo_height", metadata !"histo_width", metadata !"global_subhisto", metadata !"b3", metadata !"global_histo", metadata !"b4", metadata !"global_overflow", metadata !"b5", metadata !"sub_histo", metadata !"b6"}
-!28 = metadata !{void (i32, i32, i32, i32, i32 addrspace(1)*, i64, i32 addrspace(1)*, i64, i32 addrspace(1)*, i64, i32 addrspace(1)*, i64)* @histo_final_kernel, metadata !29, metadata !30, metadata !31, metadata !32, metadata !33, metadata !34}
-!29 = metadata !{metadata !"kernel_arg_addr_space", i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0}
-!30 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none", metadata !"none"}
-!31 = metadata !{metadata !"kernel_arg_type", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long"}
-!32 = metadata !{metadata !"kernel_arg_type_qual", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !"", metadata !""}
-!33 = metadata !{metadata !"kernel_arg_base_type", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long", metadata !"uint*", metadata !"long"}
-!34 = metadata !{metadata !"kernel_arg_name", metadata !"sm_range_min", metadata !"sm_range_max", metadata !"histo_height", metadata !"histo_width", metadata !"global_subhisto", metadata !"b1", metadata !"global_histo", metadata !"b2", metadata !"global_overflow", metadata !"b3", metadata !"final_histo", metadata !"b4"}
-!35 = metadata !{i32 1, i32 2}
-!36 = metadata !{}
-!37 = metadata !{metadata !"float", metadata !38}
-!38 = metadata !{metadata !"omnipotent char", metadata !39}
-!39 = metadata !{metadata !"Simple C/C++ TBAA"}
-!40 = metadata !{metadata !"int", metadata !38}
-!41 = metadata !{float 2.500000e+00}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel_x64.spir b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/kernel_x64.spir
deleted file mode 100644
index 2ff30c8c9dd7adedfad980e2ad0a21acb8fd56f7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8576
zcmaKR4O|mf*7r;@$qZyNOd!Be5}W{Ps;N65N>G%9M8vNeEOqgtj}uhtwiN@SqOF=Q
zNI-C-MHjc$4X)dV_G$NFTdi#0Zp}bM>L(Se^etM5R&Cd|SglKUt-SXpwAJqOy!j3H
z<IMk@d+x_M_naZ$6u6=x5#$*HL1a!A5g`Z?3;u9N+Ad=^Hb+Wl=Iln@<X8!jl!EOj
zwKKCs`MYKLK{`{ZCzv_WndbDuXrCr6xX4Z>X)4q4Zm*)W)!%C98bX1bIShhupqBX%
z%t&KV;&6$<vTg+Y#sqlB5NwE<Cs>RI^z;Xyw;gdw3J@ntINc~LgAfwKw;6_E10@q^
zlmPTt*vqpEqBH~uH0FoP5G3MU36B9rwvY{%_aPCH%9=#sp~Gs_MbwZytPBd?vls;D
z**_8u!UmoR^si$PWD2M=Q@v5_sv0aV?T{N3SKTc>W$gTh2AMlJpB@m1fC^>ER7_Bz
z+S3S`N}GxhE8%Wo>KzTKB&wOk5{v2)EZC%>MU~M|=N=WB*<dJi5l>m9tiScK0@`VG
zPt+22xskSrSN{odv(0Og%|z`YBtLQfwLO}fBGC?{Dt#|XPfL|y788)ck{v90*--Wy
zJ4=YHv=lt~Hwb@b>l<^QPja&UcQNE|Vst$wC<C0RMd~7|Xfll!Sf?5TQ$ph2+-&z$
z`S9s}*VI;gQwhGQ+Vi5#^Ke`K=OSt?60?QJtNi#z0}dNK(l2Oyiyb~|eN?p{hfkQe
z(T`W~_!b)92)Z0z>BrZz_!iIr-A5%{fAsD3N8jE95*`wDkA<WEB<kLyy!Q-HfuD4{
z_X|<JuqYN{DnQvI1;Q(V@(-13`IrhH)V9(B!LS4{7T*Bo@=yt28vvId|G5lHvxU{G
zK=4Wf6brkYIi~~-+Z6BJ{boJ_j0_3TW0%w*G-%m1bKj0eS<pgEO{?NX2H2|Ao|=H!
zVT&yK{nL;EL70QN{{<O<DVgi+qr4~IiG~ISc$GWi5gxW`wU13i5J)A9+Mh=3(3l_Q
z{eCfuL<1V{jg|f+Fv;~af{4S&CSd{+h_Iiv$44a9YJU@mMttBs7-(uq4PpT;yQV(*
zr|C#2J5mCPI-ny|I5VYLnWUKSYls>*y9+Z#7a()}WWC*+Ap&PrK_Z)hPdEdHG3^(7
zT`X0r=R)>6*LdonrdUOw3Co3eUWi`>pCkUIo=P9SNsDhBuCe(%>p{ull|IjkX1tp4
zRA@bbg|4s!jS760O8HPj*LJZ*bxly8bf~v!RM!SopzN}$PFhp04<?3MQaT*!1p4bC
z>T3&SYSrsQNS!O4YSJ4*Y@KT?wOd~jVwViPY0Bw#O>YuUzn`z~?##I_nsH5>+tlei
zm~viF-!#e-=vzR7jgr+?+3+nTe6x7^`zC#l>oKw=<gXKtr}pb@p}<3pXVD|n=WGlZ
z^wyAUkd3)9B)jX28M08v=$P+l>Q)=V<R3?sc%dr2!XHFq@i{78g)b<?;vb5hQ=c@Z
z^tY(`1`|7sDQ5>0gBsQK7S%RK%C~6BNh=YLFH+5~FdR3=<BL_xE9}RO3HVag3$g<n
zwD@M5r;_)qAn?t4&kCEzw#HLgVmaiR9x%=5=@c1NAs)?$`NALbd0SwOr^@EpNZ>D(
zuwIu+$K(qsXRQvFTHh9EmH;zYq{UwZuf%GyLN}IrPd^kI*dfmFiZi{gOr|vGpqHal
zgI;keRj=39G8MKU$^?y!j{Q=j?sudlsJ`Q<n>6*U0e_AQdR^nFE|%IaIj(=-HT|@B
z2AL5;>cyPb#X(U@yo|=5ZR2UElH_FW;m%(jHt7T6T%h^>PI~sT5GxNE<RM-j^2<X3
zd1ydhYp7tc5}V}L;IT_C(OSx@*M|IFS6Cm|C?%BmY*m{PtS}-SeQ0TgPsc?xyk>wx
zdV-EP{7IF*Vqmmk^6Tv(yBr)3pkCy00X<g%X^~||RY9XF=uia(@Ei#kJe7r>^}rF7
zSo%%VeBv4YPUl#4KbrETBV`+!a^8{Bg{l*DcY3H}J=7o)GvtE@=?)jn5kF3S$x^pi
z>T8x7VySOg>Ml<(U}1q#0MmdEu#w1POjvFTu>Us4pC<cLqZCLG6u6ZJC(3So<YB=9
z3!k*n#)mYR*SLQ>y1zWkNg}X%$BkffV{x8-<bkKx4(X$fOpFW}>RmZr@dWA~OAWKs
zKUwNNOFdvIgNAxrPt@r9LI!1sSBC7$kYAa=eQk;PYDo5Zfcm_L`h0-8%f-l=H?Q%m
z@Of-D&w8zAMfj9TPQxS7ZPK@Q<}`)R#z3b?sy=T`xjCrbhN@3$)MrukO{?IfzTjf+
z4pGN!F<%&{yLL);U_I~IM0;$&D4IRMSyvT$Hv2uBN&@?V3A^-eVDh4AhdOh6I;ZQZ
z&pOmsj4I$UwhgLpL9YUAQ2jMDiVbb-KEUo-VenKn<6Ejd5~sdTpvTK!)9Y*Wh8lfg
zjownDFR9V<HJYU=;uqToHqmrg&pvWKf7-HeqjpWif`Ac?KF}vTyk+D^|Ff$Z-LIsB
z(4RcKLI1Wp<{jY3{bEyE;ik01O<{Mq7JFT5g;MH^0ctQn9kY)vrpb%Q4r>B*DpjhX
zR#}Z!;M1r<LyXC*Z?4h%Yb58UA9hVYB+h+bJfq1qy+_P6h^M(-xuhcGPe?Go<D<Sp
zVs2R|o{L_`ML*vr<qfifbaV^n_KL>>fnLK4Y5O<XJ(V^O2qv;ie!K{pJ&#KQEvUM=
z!+T3T(4y+Ms`>=^NkQFVRP}*wK*<j#77WHOcO<ORC<`13uOVr#{VWGu9EHFiieCg?
zvDs5qjc;m>ZSNqy_*vAIut2!c)|B&u>VE6of#|e1f0oed*{sDYZLZ1yzC{bzBSJin
z3lke?*T2Iy+3_ufo=w2(f>|)0O+GLqF*w6Ezh?_*gXsXq_A9`L3+gV1x}R5FX;BYo
z)L<^x1UaDJ)Tplr>g(3IL*VX)cV{BFV<!`KRd7&MrbK6WO?r1{L%to~T#c`9t{|@?
zF<;T}lust}Q#u6o*%nn9Ckew`$-v2Wn`ShKXPmBX0GHN_;7TY2Mul&&i8oezz~u^L
zSFi|D1nPU-50hi&Q+LFjzW0uHU71rVJ(c$jO3xw`V+$Pj@0?-JWF-Ytbr`)5yJ~KJ
znjLv*a^cJAWxI6qjt@VD<-Qgm0;v`mr$|2EY_4L@U%OA*vySDP<$q}o+&wp?obElQ
z7q#7w-L0t0DU8!BC`~hCrmUyFAOfXv=N*MF-$^VK>%w>j*#OxWnhQle_oEe4f~B(m
z3jt|o%w{r!4VVkVuo8EwKBg?TA<kOB{jk0<Ie0f)c_(?7X~v>byf^Po;2M|b%6k39
z=U;d=eedYhCI|L_sa>2?O7@xyU+yCm0(}OX^dzFY_{iJ=0R1TM(!0j_BOxO2ZJMc6
z(w%oTR=J=?ZXQ|wsWjt<f@_v@za{H5hEt0&wh3U?<hF`W!5Uh@T0ReAoj9x*hDQhm
z{*+2H8msgh9LjuDL@9X#`jJIsZa^9oAb#(-%)r!U1YZf$mrQ+_c#;QfNSf4bi>nMg
zLkGO5ra!2vPQ<uU1CedSd7BYAD$j%+E56c_&n05KQ6rKm=?nf{a+Ih|u0`Z!Np4pE
ztnW6KK}gd?&mm<|^^STzmf`)*HCQ$xo!~4Ok_^-uyW;v-Kfl~ioP;(3(H}~jgS7#p
zYl+lKl(TQt+y9)rDpx}}1uxIomq-u#q!YYeOO4f^>LsQ!lbBVx#u!m_m&E+E&n$5$
zW9^zU1W53kfP{mDbXxQaZ7wjCY0~l@mmSnZ2Ggyf#if21=Mhw*+E#;KA<l1)T0wNh
z<+m^mgd1ft*!ow7YZf!*URR#MwWPL_C?LI|aRDK|EK2pV*n$WWSHt-I`38Y4YoY_Z
zJB|!g=TA^7$wdWGC|SS>NCQJ3U(hO^L}m{eC-?<j=T+<?{yO56Ns|`(YV1zqC9TFT
zcy)~ZHnvo&Wr`bPMNC7x6J^Tvl4WqvLs4XWIf^BcnFo4gsKgcI;ZRr-Cgu(d$s`h2
zAWU#GfY8s%{$`iHL(Yxs(+!IVxGJbx6fHd>gh$CHq<pSX9UU!wD?*qG2<<J2U!!1`
z_KxFL1aNve*InI_iz&$tS5ZFVQf#5&oMwPIy&WM`1A<LFtqEgJXL1V+_qndPe`->+
z%oNPx$R&R0WELV(FZ%flv_bL@Hccv}@4IJUT@s;qC}wi0MVXzEFj3JkzRjK2z%T|w
z!asRkBK8zI=d{;Y)bCzG3p%4KO=_-)d(IK}4z~NYX|8?12@XeTs-O2g(EwK`P{Si#
zab*YnJRKpl0YZ3ZZszN1ljn!V_V9Z(PNzbAu)Xd*X2%j|*OQ`g2AjZMjmlM6IISwY
zI}Iidxf(UI$im%B3=`ad@D6s~ix%|<R~$ZWGtIw-J$5>C{sKO;C5~<9eYf535LzZk
z{n3p`)A}`F&)kQ>o*jx1`T${2dcYvdYF=&lJ*$E_Nf!8<%@5DJmglJ<m#$`*bbayU
z?++g%c4HT+*3BX;Ni(16J>1CL%BnK2PI0?exodLHXOZU5dmp40#`5MUj@-5IQq@|O
z<b3XhRSUb;vvGI!8$!R@HKLlb_0y)*1)YZCnV0Vr>_Ix|Pi9{JCeb$Bt55{^F;Z6+
zX?rz4v~ac=F|AwXp)W{Nv3KhTVQqT8eT6AP|HBF)G?MS0pO?V|le7=2%oulJ%G!du
z0n3?dy<4|a^?7OqN^W>&j5~sV2D%mm%|MW+K)p=*xV`2HQ$k<IUJY5htS{4NL}bgP
zQ-~L=r+B>ci5HT-WdXpY#oF3dGc`!UyX2qfK*!!erKqGY)1ME)PZPZL6fHw_FC?wf
z0I>K82yP35k3z5}5rXfB!8QnHDG2U{U{ekR>pf8NR}d_yAvj=yk~c#z7X!gxLU89y
z2p$N7A3!jg4#93S0N3t<U_KUtEvXPZ7lPN+LCL%Vs-c76hFK8&b{L!h!95UcS`NX!
z9S~dv!NMd6?hk|ChhX|K2sWnyu=qC+%r!#re}*Mb5y31L{H!d3i~)5VW(@mm_>n}Z
zoFypP0c4KE)C)eBAsf=G9oQaKiBObSXdyE}@njV!goEozne|Kt=9zMW)_trp8J4>g
z#T>#=K!^nJU2G{leGWP2%Bp>fe4tD^L9_DkKEhOenRL7lV&KopB8URiahTv05UjCQ
zj3RK{dqpMPoVu%sJ!4vMQG*?+BOGf}x{PdB#**&Tj688}%z0+FLsxjm(mTSwYy%t-
zS@7Fc9A<uKKJ0*@NSW_w8W}NuAA_tUPOLZ5opK>x+9~fym5#-xpQ1^D5v4V1vd%Nk
zr!Lb$xv`%%KfX)3%NivtZMhtX{!$uGEJM+FJNkRutA6Labxdh4N{XTjB-rA%FKni^
zU4qWVVSA(}8X4vB1s5EO)3IcDDI#;x6gIrBdh(j=$;k}9QU`9`5pb)oR5t|gD|(pV
zIks^#7kg*6VI`d>UJmq!=RLAcy0(Yci3;DZ3&V4Rwyta%36KVP_?8XNvQY39bCN}L
z`lSGTzmhn&!AP$p6dPd~7cM^rWv3jM0-x=OyD)yg`eXS7?w(WA756?L^lQrx%AE&A
z*u<f#cFk~r-)povV_ggCjbfdq#C3i^I(2h)tD!HNV9J|wGnsNfELw9jTfZdz2at)p
z_=(e`{}3{<TKtw=?tcTB=o72U_16)yKB}r+Xccp<<9e1zOYLPz(JPtqlCN3vwd7k!
zjB`nChgLd4<gk0J{(P$1(Nkk>cYNaFn#+5{Gl{Y!>KfBZC`Dsx9nm=~p0BE1QY)SV
z82&hbuLKM|EnK+X-@yUm>z0PW+BM1!{yn3zP}9E7dmDQi+@tlOaa9_oW=X9%2Y`OZ
zwUq+SA)&_uYfempAi+Ml+pnJtHdmA2RJedIw-LqGy_zEMDln~Dl+;SJn~c&4*9;^@
zB(?g)d-cVT*fY@Ps}%o-T~^C1?5PPaW4_wCa<A38(@~GesiOwSLs1;8?c=FeK`qMF
zpGNkrE@=kwgfG?zrA^Fe0^`hM{`9S1H2`<yq=w1R*1yu~2{f->qk<C+vbzArAdj8D
z4^RNBEy3})WewiWgGp|TOmbMmf?$EfMTi5Of$LMGPtGzQ50&JdpMQ1?duBw}bobKC
zucps3Ud{U8m5Yo%UJ<`z<qw6EbGF;FM&7ZUU-7TQ-zX)Gz4~>}EE=VZ1i0a>E3Yg~
z`d`7r2R=(!{Qnu~M*CMq==uBS^5-1;LEJgeyZOH_O`R<%yptyz`@_8$(zWo$-^!D%
zNPI-U&+<5O=395BuiaK0(O^5&;PBaRj%=V~P26IJA7%XqWc~a7C%^gY>Nyed+r#2>
z)^6Uh{c3gNjm>|)k#KZ4d*`L<12^6)AL9&`thoAFPWjbMAN>6v+dc~n!pq9lGRixu
z+Bz57xcx?Z<7eOh`LqAC_BVAALFIg;@z-+O;zyR{#*q(_KOQuHFp%Y1GB#}E{YSSx
zB5~T<>d}=PoyVuIC?B5aKe}!+JK~U2-xQmRbvsu-KRW3Zk!8Peq;7QSB6F+$!>89P
zxW4EtsddYaAN3q}ba>OwPlsRqW?K16qw1}L>{VC({@qS7I0j)qn+obU%oO$?(7z~^
zEXcA(<L%28#z^4(AC*R=Q*LrV4?&Z~*larjLqQ+rQV5fgseq@PP=bhqa3h_L@ORKI
zV<9Olw-3S~VSL;0F<R5Rkd#`}uuyL{l>d`It4)8zpN$3Y{#SicfEwXM;19phXWozc
zC>w5LW#PEFErbX*QvgJ}CKikMUFiQ3!*S4wDBXi!nY-`Cxr`+z*jdgB(v_7I<ruD_
zpk8osmI1xM6&%|K7X|#REP{Lt>NqUoYhkVk)~W<1F|#zv!tY1MG>stg&*48!G5}8`
zU+EdmSKO+X5;5B+beOu423H42TEd;p7U>?{(tI}D*=7fK4|tP}as4#EhC4+Qf?<{o
z!@^PHGaRD`2uL66;M+AfVRT7HqRS$$Yf#!Pat+qH<=y<=7RJ3q>Wr#?d(~ck-)fF&
zd1GE#6!bciW3NFMavfwi%A$Z*K>M-B-!wk9yn}JJ;fYvAKYamBBD0ANZ1@WFv4YLD
z3tz>ZvT@)t{1#tHd^n~-qdjOd(b-sqi9XWpP+Fr)^4k{g?~uRQ>PjwVUyxwMJ<)4K
zDf|7R+@56M{8*4*6rqmzAqSO>y<`9<C_1+32~{?X!ZlO@H_r~v(+7t1Qxa3d_&$)u
zXjA2UZ~!R}sg^t9?;60#DtC}&NeSb$OIszc`xA$pnm1xZujkn$e_>UeO`zjxkO?ZY
z9WmD}=){{k)72rcXgyi~48SZxGzn8SJQmF?Bs@Hq%18rb2|O2VGO)TmdK88h;W%<U
z5+y^;;{?6pdR$`WKD^g*dZ7-a<S(yU&<oSvc_91VAtq;T?R_?Kg5tfxqgnIJ3eC$S
z=AaP~D=ynZd5Q`;U@#|isJvM@BQQq!ghvMFDOWyTXO@Jodb2dtd-S)ed}h8otFDn&
zkRQz-2w{N`82)>d)dqR`-m~n?SO3+Wlk2?0|ESp~-Qi&2m|23SE;yT!{vf1}0Rz|R
zV0WFSvx@0I48JVwMzTgih-<}X?`|NBTjlk~%n#NNdoL%~(fwtqSo<EX3yk&c7@%Bj
zD)!gc3CA|f3o-piD|E4UCI{}YjhatBjEwb^qUbOBixT!h?Dyve)5`Yfcl}C#W+3gJ
zP<VO_+GxB_o@a=(jO|#K^!DTc2u`34MFIO*CY=Cm$drV=-L{&p34mbpExH@Tyebih
z(;G~@3}E2T$|A_WKplrI2>T((3*s09v@NqkIlrRE0{<^M=Fr{|WWi*_{9D8~L&S;y
zCmRcY6R5zg;o6FqTBLs*i!DOYM9flZnNue)X36jRY*L0P*NQTMzp5SR@Lzc@W#3pD
zT))~i;z4L2Ub`rh(eM~oOLq%iV@tnVjeW{=aCn-urZTRYu3AYPS2ZxolM8MdhZ;4i
zja;ckn=Rl*re7ndTnq;Ofrzaq^8@&%Lhb~My%gyV;a8*9bI%yQ;>{dsSdvI)W64RU
zGYON04;URrwLvnbgpUTQu~0Rw*I(dJuG-7~HLv~XTtRuLAE8$54S<;E(7{9>3@fyv
z=##wLj!U?CHnO<S!Jif~EDC>320mNkDCT6qHzkr82N002|55aGPlnr_IG5me8kM3I
zt<Ly1=dXJ*FkA&mEkv+ISrUj|V?05>Rsphfa458%Eo;yd^rR}-m8M$oQ++7bDMu#(
zkJ{<Lz*5-aZnNKac#WEVcdq^zSOX$EBPLo87N%oCPp5*K4JO|vLuV|9i$Y)MOo~07
zK^SL**N7hItQJjwQ;=Ms3b0P3DfxXAEXM}u9tvWCL%3=1i5I<R!!6Fd0h^2Vfmpf&
z^Td3Ykkjls9jg<^I}RQeDh2v#l-YnLHPBZaNEhvF0<lrkD{JZuDP6R5Pq<TEX9u0=
zNXJ_j0G*wC!*81FkTGBT_??<DqJ8ZqXO2|UTsJ+vc_r-}?Y7n3#`ZK%@^6HG!~dqN
zHrd%2c0>Uii_8RPv@6NRw!_4D+&o8C#F+jwl4kEy7k<?{r7`(oW8E2_<OSe$V@|=(
zcgsK~Gp-|i$8QJU_Za*~11kJE2S9$enS*A%o%UAD|FsV$@Us{E{k2*R`1B`lyANOG
zA#nq}Q?~wYH4<{3I`xEp+E%m<n(>7VTcdW4$P}4B06#M2Z_B*&g{dI*-a`mm$sIq^
zHv&iSR#M%<ONZB{|Mo9>wC#Q*EX*GG@xCs)bWfrG@us}+U3sZ{#EgaQ8-B}MCY|U5
zcVs-UX`@`Uq%ZTXkp)R_9<j=L$`<Vax%Wj40JvsC2his+00%Hh(LH*h0{;I8Fh8*R

diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/main.cpp b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/main.cpp
deleted file mode 100644
index 7611ce545a..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/main.cpp
+++ /dev/null
@@ -1,486 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <CL/cl.h>
-
-#include "util.h"
-#include "OpenCL_common.h"
-
-#define BLOCK_X         14
-
-#define PRESCAN_THREADS     512
-#define PRESCAN_BLOCKS_X    64
-
-#define UNROLL 16
-
-/******************************************************************************
-* Implementation: GPU
-* Details:
-* in the GPU implementation of histogram, we begin by computing the span of the
-* input values into the histogram. Then the histogramming computation is carried
-* out by a (BLOCK_X, BLOCK_Y) sized grid, where every group of Y (same X)
-* computes its own partial histogram for a part of the input, and every Y in the
-* group exclusively writes to a portion of the span computed in the beginning.
-* Finally, a reduction is performed to combine all the partial histograms into
-* the final result.
-******************************************************************************/
-
-int main(int argc, char* argv[]) {
-  struct pb_TimerSet timers;
-  struct pb_Parameters *parameters;
-
-  parameters = pb_ReadParameters(&argc, argv);
-  if (!parameters)
-    return -1;
-
-  if(!parameters->inpFiles[0]){
-    fputs("Input file expected\n", stderr);
-    return -1;
-  }
-
-  char oclOverhead[] = "OCL Overhead";
-  char prescans[] = "PreScanKernel";
-  char postpremems[] = "PostPreMems";
-  char intermediates[] = "IntermediatesKernel";
-  char mains[] = "MainKernel";
-  char finals[] = "FinalKernel";
-
- //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  int numIterations;
-  if (argc >= 2){
-    numIterations = atoi(argv[1]);
-  } else {
-    fputs("Expected at least one command line argument\n", stderr);
-    return -1;
-  }
-
-  unsigned int img_width, img_height;
-  unsigned int histo_width, histo_height;
-  unsigned int lmemKB;
-  unsigned int nThreads;
-  unsigned int bins_per_block;
-
-  FILE* f = fopen(parameters->inpFiles[0],"rb");
-  int result = 0;
-
-  result += fread(&img_width,    sizeof(unsigned int), 1, f);
-  result += fread(&img_height,   sizeof(unsigned int), 1, f);
-  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-  result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-  if (result != 4){
-    fputs("Error reading input and output dimensions from file\n", stderr);
-    return -1;
-  }
-
-  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-
-  result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-  fclose(f);
-
-  if (result != img_width*img_height){
-    fputs("Error reading input array from file\n", stderr);
-    return -1;
-  }
-
-  pb_InitializeTimerSet(&timers);
-
-  pb_AddSubTimer(&timers, oclOverhead, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, prescans, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, postpremems, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, intermediates, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, mains, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, finals, visc_TimerID_COMPUTATION);
-
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-
-  cl_int ciErrNum;
-  int deviceType = CL_DEVICE_TYPE_CPU;
-  cl_device_id clDevice;
-  cl_context clContext;
-  cl_command_queue clCommandQueue;
-
-  cl_program clProgram[4];
-  //cl_program clProgram;
-
-  cl_kernel histo_prescan_kernel;
-  cl_kernel histo_intermediates_kernel;
-  cl_kernel histo_main_kernel;
-  cl_kernel histo_final_kernel;
-
-  int even_width = ((img_width+1)/2)*2;
-
-  cl_mem input;
-  cl_mem ranges;
-  cl_mem sm_mappings;
-  cl_mem global_subhisto;
-  cl_mem global_histo;
-  cl_mem global_overflow;
-  cl_mem final_histo;
-
-
-  cl_uint numPlatforms;
-  OCL_ERRCK_VAR( clGetPlatformIDs(0, NULL, &numPlatforms));
-
-  cl_platform_id clPlatform[numPlatforms];
-  OCL_ERRCK_VAR( clGetPlatformIDs(numPlatforms, clPlatform, NULL));
-
-
-  OCL_ERRCK_VAR( clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL));
-
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-  clContext = clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-   
-  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-  //pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-
-  long unsigned int lmemSize = 0;
-  OCL_ERRCK_RETVAL ( clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &lmemSize, NULL) );
-
-  // lmemKB = lmemSize / 1024; // Should be valid, but not taken into consideration for initial programming
-
-  if (lmemSize >= 48*1024) {
-    lmemKB = 48;
-  } else if (lmemSize >= 24*1024) {
-    lmemKB = 24;
-  } else {
-    lmemKB = 8;
-  }
-
-  lmemKB = 24;
-
-  bins_per_block = lmemKB * 1024;
-
-  switch (lmemKB) {
-    case 48: nThreads = 1024; break;
-    case 24: nThreads = 768; break;
-    default: nThreads = 512; break;
-  }
-
-
-
-  size_t program_length[4];
-  //size_t program_length;
-  const char *source_path[4] = { "src/opencl_cpu_baseline/histo_prescan.cl",
-    "src/opencl_cpu_baseline/histo_intermediates.cl", "src/opencl_cpu_baseline/histo_main.cl","src/opencl_opencl_cpu_baseline/histo_final.cl"};
-  //const char *source_path = { "src/opencl_nvidia/kernel.cl"};
-  char *source[4];
-  //char *source;
-
-  for (int i = 0; i < 4; ++i) {
-    //Dynamically allocate buffer for source
-    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
-    if(!source) {
-      fprintf(stderr, "Could not load program source\n"); exit(1);
-    }
-
-  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
-  	OCL_ERRCK_VAR(ciErrNum);
-
-  	free(source[i]);
-  }
-
-
-  char compileOptions[1024];
-  //                -cl-nv-verbose // Provides register info for NVIDIA devices
-  // Set all Macros referenced by kernels
-  sprintf(compileOptions, "\
-                -D PRESCAN_THREADS=%u\
-                -D KB=%u -D UNROLL=%u\
-                -D BINS_PER_BLOCK=%u -D BLOCK_X=%u",
-
-                PRESCAN_THREADS,
-                lmemKB, UNROLL,
-                bins_per_block, BLOCK_X
-            );
-
-  for (int i = 0; i < 4; ++i) {
-//fprintf(stderr, "Building Program #%d...\n", i);
-    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, compileOptions, NULL, NULL) );
-  // Get program binary
-  // Query binary (PTX file) size
-    //size_t bin_sz;
-    //ciErrNum = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bin_sz, NULL);
-    //OCL_ERRCK_VAR(ciErrNum);
- 
-    //Read binary (PTX file) to memory buffer
-    //unsigned char *bin = (unsigned char *)malloc(bin_sz);
-    //ciErrNum = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL);
-    //OCL_ERRCK_VAR(ciErrNum);
- 
-    //Save PTX to add_vectors_ocl.ptx
-    //FILE* fp = fopen("histo.nvptx.s", "wb");
-    //fwrite(bin, sizeof(char), bin_sz, fp);
-    //fclose(fp);
-    //free(bin); 
-          /*
-       char *build_log;
-       size_t ret_val_size;
-       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-       build_log = (char *)malloc(ret_val_size+1);
-       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-       	OCL_ERRCK_VAR(ciErrNum);
-
-
-       // to be carefully, terminate with \0
-       // there's no information in the reference whether the string is 0 terminated or not
-       build_log[ret_val_size] = '\0';
-
-       fprintf(stderr, "%s\n", build_log );
-       */
-  }
-
-  histo_prescan_kernel = clCreateKernel(clProgram[0], "histo_prescan_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_intermediates_kernel = clCreateKernel(clProgram[1], "histo_intermediates_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_main_kernel = clCreateKernel(clProgram[2], "histo_main_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_final_kernel = clCreateKernel(clProgram[3], "histo_final_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-  input = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      even_width*(((img_height+UNROLL-1)/UNROLL)*UNROLL)*sizeof(unsigned int), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      2*sizeof(unsigned int), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  global_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*histo_height*sizeof(unsigned short), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-
-  // Must dynamically allocate. Too large for stack
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  unsigned int *zeroData;
-  zeroData = (unsigned int *) malloc(sizeof(unsigned int) *img_width*histo_height);
-  if (zeroData == NULL) {
-    fprintf(stderr, "Failed to allocate %ld bytes of memory!\n", sizeof(unsigned int) * img_width * histo_height);
-    exit(1);
-  }
-  memset(zeroData, 0, img_width*histo_height*sizeof(unsigned int));
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  for (int y=0; y < img_height; y++){
-    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_FALSE,
-                          y*even_width*sizeof(unsigned int), // Offset in bytes
-                          img_width*sizeof(unsigned int), // Size of data to write
-                          &img[y*img_width], // Host Source
-                          0, NULL, NULL) );
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  unsigned int img_dim = img_height*img_width;
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 0, sizeof(cl_mem), (void *)&input) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 1, sizeof(unsigned int), &img_dim) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 2, sizeof(cl_mem), (void *)&ranges) );
-
-  unsigned int half_width = (img_width+1)/2;
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(unsigned int), &img_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 3, sizeof(unsigned int), &half_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 4, sizeof(cl_mem), (void *)&sm_mappings) );
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 0, sizeof(cl_mem), (void *)&sm_mappings) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 1, sizeof(unsigned int), &img_dim) );
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 4, sizeof(unsigned int), &histo_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 5, sizeof(unsigned int), &histo_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 6, sizeof(cl_mem), (void *)&global_subhisto) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 7, sizeof(cl_mem), (void *)&global_histo) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 8, sizeof(cl_mem), (void *)&global_overflow) );
-
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(unsigned int), &histo_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(unsigned int), &histo_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 4, sizeof(cl_mem), (void *)&global_subhisto) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 5, sizeof(cl_mem), (void *)&global_histo) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 6, sizeof(cl_mem), (void *)&global_overflow) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 7, sizeof(cl_mem), (void *)&final_histo) );
-
-  size_t prescan_localWS[1] = {PRESCAN_THREADS};
-  size_t prescan_globalWS[1] = {PRESCAN_BLOCKS_X*prescan_localWS[0]};
-  size_t inter_localWS[1] = {(img_width+1)/2};
-  size_t inter_globalWS[1] = {((img_height + UNROLL-1)/UNROLL) * inter_localWS[0]};
-  size_t main_localWS[2] = {nThreads, 1};
-  size_t main_globalWS[2];  main_globalWS[0] = BLOCK_X * main_localWS[0];
-  size_t final_localWS[1] = {512};
-  size_t final_globalWS[1] = {BLOCK_X*3 * final_localWS[0]};
-
-
-  //pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-  //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-
-  for (int iter = 0; iter < numIterations; iter++) {
-    printf("Iteration = %d\n", iter);
-    unsigned int ranges_h[2] = {UINT32_MAX, 0};
-
-    // how about something like
-    // __global__ unsigned int ranges[2];
-    // ...kernel
-    // __shared__ unsigned int s_ranges[2];
-    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
-    // __syncthreads();
-
-    // Although then removing the blocking cudaMemcpy's might cause something about
-    // concurrent kernel execution.
-    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE,
-                          0, // Offset in bytes
-                          2*sizeof(unsigned int), // Size of data to write
-                          ranges_h, // Host Source
-                          0, NULL, NULL) );
-
-  //pb_SwitchToSubTimer(&timers, prescans , pb_TimerID_KERNEL);
-  //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  pb_SwitchToSubTimer(&timers, prescans, visc_TimerID_COMPUTATION);
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_prescan_kernel, 1, 0,
-                            prescan_globalWS, prescan_localWS, 0, 0, 0) );
-
-  clFinish(clCommandQueue);
-  //pb_SwitchToSubTimer(&timers, postpremems , pb_TimerID_KERNEL);
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, ranges, CL_FALSE,
-                          0, // Offset in bytes
-                          2*sizeof(unsigned int), // Size of data to read
-                          ranges_h, // Host Source
-                          0, NULL, NULL) );
-
-  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE,
-                          0, // Offset in bytes
-                          img_width*histo_height*sizeof(unsigned int), // Size of data to write
-                          zeroData, // Host Source
-                          0, NULL, NULL) );
-
-  //pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
-  //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  pb_SwitchToSubTimer(&timers, intermediates, visc_TimerID_COMPUTATION);
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel, 1, 0,
-                            inter_globalWS, inter_localWS, 0, 0, 0) );
-
-  clFinish(clCommandQueue);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  main_globalWS[1] = ranges_h[1]-ranges_h[0]+1;
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 2, sizeof(unsigned int), &ranges_h[0]) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 3, sizeof(unsigned int), &ranges_h[1]) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &ranges_h[0]) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &ranges_h[1]) );
-
-  //pb_SwitchToSubTimer(&timers, mains, pb_TimerID_KERNEL);
-  //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  pb_SwitchToSubTimer(&timers, mains, visc_TimerID_COMPUTATION);
-
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_main_kernel, 2, 0,
-                            main_globalWS, main_localWS, 0, 0, 0) );
-
-  clFinish(clCommandQueue);
-  //pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);
-  pb_SwitchToSubTimer(&timers, finals, visc_TimerID_COMPUTATION);
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
-                            final_globalWS, final_localWS, 0, 0, 0) );
-  clFinish(clCommandQueue);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE,
-                          0, // Offset in bytes
-                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
-                          histo, // Host Source
-                          0, NULL, NULL) );
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_prescan_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_main_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
-  //OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[2]) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[3]) );
-
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_histo) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-
-
-  if (parameters->outFile) {
-    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
-  }
-
-
-
-
-  free(zeroData);
-  free(img);
-  free(histo);
-
-  //pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  printf("\n");
-  pb_FreeParameters(parameters);
-
-  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
-  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );
-
-  pb_DestroyTimerSet(&timers);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/util.cpp b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/util.cpp
deleted file mode 100644
index 266462c936..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/util.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#include "util.h"
-#include "bmp.h"
-
-// This function takes an HSV value and converts it to BMP.
-// We use this function to generate colored images with
-// Smooth spectrum traversal for the input and output images.
-RGB HSVtoRGB( float h, float s, float v )
-{
-    int i;
-    float f, p, q, t;
-    float r, g, b;
-    RGB value={0,0,0};
-
-    if( s == 0 ) {
-        r = g = b = v;
-        return value;
-    }
-    h /= 60;
-    i = floor( h );
-    f = h - i;
-    p = v * ( 1 - s );
-    q = v * ( 1 - s * f );
-    t = v * ( 1 - s * ( 1 - f ) );
-    switch( i ) {
-        case 0:
-            r = v; g = t; b = p;
-            break;
-        case 1:
-            r = q; g = v; b = p;
-            break;
-        case 2:
-            r = p; g = v; b = t;
-            break;
-        case 3:
-            r = p; g = q; b = v;
-            break;
-        case 4:
-            r = t; g = p; b = v;
-            break;
-        default:
-            r = v; g = p; b = q;
-            break;
-    }
-
-    unsigned int temp = r*255;
-    value.R = temp;
-    temp = g*255;
-    value.G = temp;
-    temp = b*255;
-    value.B = temp;
-
-    return value;
-}
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename)
-{
-    RGB* pixel_map = (RGB*) malloc (height*width*sizeof(RGB));
-
-    for (size_t y = 0; y < height; ++y)
-    {
-        for (size_t x = 0; x < width; ++x)
-        {
-            unsigned char value = histo[y * width + x];
-
-            if (value == 0){
-                pixel_map[y*width+x].R = 0;
-                pixel_map[y*width+x].G = 0;
-                pixel_map[y*width+x].B = 0;
-            } else {
-                pixel_map[y*width+x] = HSVtoRGB(0.0,1.0,cbrt(1+ 63.0*((float)value)/((float)UINT8_MAX))/4);
-            }
-        }
-    }
-    create_bmp(pixel_map, height, width, filename);
-    free(pixel_map);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/util.h b/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/util.h
deleted file mode 100644
index 8db501970c..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_cpu_baseline/util.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef __HISTO_UTIL_H_
-#define __HISTO_UTIL_H_
-
-#define UINT8_MAX 255
-#define UINT32_MAX 4294967295
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/Makefile b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/Makefile
deleted file mode 100644
index 4295907c0d..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=util.o main.o OpenCL_common.o
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/OpenCL_common.cpp
deleted file mode 100644
index 9bb2c1b5b2..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/OpenCL_common.cpp
+++ /dev/null
@@ -1,299 +0,0 @@
-
-
-#include "OpenCL_common.h"
-#include <string.h>
-
-// -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
-  cl_uint numEntries = 16;
-  cl_platform_id clPlatforms[numEntries];
-  cl_uint numPlatforms;
-  
-  cl_device_id clDevices[numEntries];
-  cl_uint numDevices;
-
-  OCL_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
-  //fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
-  bool needDevice = true;
-  
-  for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
-
-    cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    //fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
-      cl_device_id clDevice = clDevices[id];
-      cl_device_type clDeviceType;
-
-      bool canSatisfy = true;
-      
-      //char devName[128];
-      //OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_NAME, sizeof(char)*128, devName, NULL));
-      //fprintf(stderr, "Device #%d Name: %s\n", id, devName);
-      
-      if (reqDeviceType != NULL) {
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
-        if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
-          if (*reqDeviceType != clDeviceType) {
-            canSatisfy = false;
-          }
-        }
-      }
-
-      va_list paramList;
-      va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
-        cl_bool clInfoBool;
-        size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
-        if (clInfoBool != true) {
-          canSatisfy = false;
-        }
-      }
-      
-      va_end(paramList);
-      if (canSatisfy) {
-        *device = clDevice;
-        *platform = clPlatform;
-        needDevice = false;
-        if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
-          *reqDeviceType = clDeviceType;
-        }
-      }
-    } // End checking all devices for a platform
-  } // End checking all platforms
-
-  int retVal = -1;
-  if (needDevice) {
-    retVal = -1;
-  } else {
-    retVal = 0;
-  }
-  
-  return retVal;
-
-}
-
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	cl_uint maxMemAlloc = 0;
-	
-	OCL_ERRCK_RETVAL ( clGetDeviceInfo(	device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-
-	
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/OpenCL_common.h b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/OpenCL_common.h
deleted file mode 100644
index c51800532d..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/OpenCL_common.h
+++ /dev/null
@@ -1,22 +0,0 @@
-
-#ifndef __OPENCL_COMMON_H_
-#define __OPENCL_COMMON_H_
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <CL/cl.h>
-
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/bmp.h b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/bmp.h
deleted file mode 100644
index d1b7c1b562..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/bmp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "stdlib.h"
-
-typedef struct{
-  unsigned char B;
-  unsigned char G;
-  unsigned char R;
-} RGB;
-
-typedef struct {
-  unsigned int filesz;
-  unsigned short creator1;
-  unsigned short creator2;
-  unsigned int bmp_offset;
-} bmpfile_header_t;
-
-typedef struct {
-  unsigned int header_sz;
-  unsigned int width;
-  unsigned int height;
-  unsigned short nplanes;
-  unsigned short bitspp;
-  unsigned int compress_type;
-  unsigned int bmp_bytesz;
-  unsigned int hres;
-  unsigned int vres;
-  unsigned int ncolors;
-  unsigned int nimpcolors;
-} bmp_dib_header_t;
-
-typedef enum {
-  BI_RGB = 0,
-  BI_RLE8,
-  BI_RLE4,
-  BI_BITFIELDS,
-  BI_JPEG,
-  BI_PNG,
-} bmp_compression_method_t;
-
-typedef struct{
-  unsigned char magic[2];
-  bmpfile_header_t file_header;
-  bmp_dib_header_t dib_header;
-  unsigned int* palette;
-  void* pixel_map;
-} bmp_image;
-
-void create_bmp(RGB* bitmap, int height, int width, const char* filename){
-    bmp_image image;
-
-    int padded_width = 4*(((width*24)+31)/32);
-    padded_width -= width*sizeof(RGB);
-
-    char* pad = (char*) calloc (padded_width, sizeof(char));
-
-    image.magic[0]='B';
-    image.magic[1]='M';
-
-    image.file_header.filesz = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t) + height*width*sizeof(RGB);
-    image.file_header.creator1 = image.file_header.creator2 = 0;
-    image.file_header.bmp_offset = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t);
-
-    image.dib_header.header_sz = 40;//sizeof(bmp_dib_header_t);
-    image.dib_header.width = width;
-    image.dib_header.height = height;
-    image.dib_header.nplanes = 1;
-    image.dib_header.bitspp = 24;
-    image.dib_header.compress_type = 0;
-    image.dib_header.bmp_bytesz = width*height*sizeof(RGB);
-    image.dib_header.hres = 0;
-    image.dib_header.vres = 0;
-    image.dib_header.ncolors = 0;
-    image.dib_header.nimpcolors = 0;
-
-    FILE* out_file = fopen(filename,"wb");
-
-    fwrite(image.magic,sizeof(char),2,out_file);
-    fwrite(&(image.file_header),sizeof(char),sizeof(bmpfile_header_t),out_file);
-    fwrite(&(image.dib_header),sizeof(char),sizeof(bmp_dib_header_t),out_file);
-
-    int h;
-    for (h = height-1; h >= 0; h--){
-      fwrite(&bitmap[h*width],sizeof(RGB),width,out_file);
-      fwrite(pad,sizeof(char),padded_width,out_file);
-    }
-
-    fclose(out_file);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_final.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_final.cl
deleted file mode 100644
index f2e582deda..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_final.cl
+++ /dev/null
@@ -1,108 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-/* Combine all the sub-histogram results into one final histogram */
-__kernel void histo_final_kernel (
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    __global unsigned int *global_subhisto,
-    __global unsigned int *global_histo,
-    __global unsigned int *global_overflow,
-    __global unsigned int *final_histo) //final output
-{
-    unsigned int blockDimx = get_local_size(0);
-    unsigned int gridDimx = get_num_groups(0);
-    unsigned int start_offset = get_local_id(0) + get_group_id(0) * blockDimx;
-    const ushort4 zero_short  = {0, 0, 0, 0};
-    const uint4 zero_int      = {0, 0, 0, 0};
-
-    unsigned int size_low_histo = sm_range_min * BINS_PER_BLOCK;
-    unsigned int size_mid_histo = (sm_range_max - sm_range_min +1) * BINS_PER_BLOCK;
-
-    /* Clear lower region of global histogram */
-    for (unsigned int i = start_offset; i < size_low_histo/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            (unsigned char) global_histo_data.x,
-            (unsigned char) global_histo_data.y,
-            (unsigned char) global_histo_data.z,
-            (unsigned char) global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the middle region of the overflow buffer */
-    for (unsigned int i = (size_low_histo/4) + start_offset; i < (size_low_histo+size_mid_histo)/4; i += gridDimx * blockDimx)
-    {
-        uint4 global_histo_data = ((__global uint4*)global_overflow)[i];
-        ((__global uint4*)global_overflow)[i] = zero_int;
-
-        uint4 internal_histo_data = (uint4)(
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        #pragma unroll
-        for (int j = 0; j < BLOCK_X; j++)
-        {
-            unsigned int bin4in = ((__global unsigned int*)global_subhisto)[i + j * histo_height * histo_width / 4];
-            internal_histo_data.x += (bin4in >>  0) & 0xFF;
-            internal_histo_data.y += (bin4in >>  8) & 0xFF;
-            internal_histo_data.z += (bin4in >> 16) & 0xFF;
-            internal_histo_data.w += (bin4in >> 24) & 0xFF;
-        }
-
-        internal_histo_data.x = min (internal_histo_data.x, (uint) 255);
-        internal_histo_data.y = min (internal_histo_data.y, (uint) 255);
-        internal_histo_data.z = min (internal_histo_data.z, (uint) 255);
-        internal_histo_data.w = min (internal_histo_data.w, (uint) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            internal_histo_data.x,
-            internal_histo_data.y,
-            internal_histo_data.z,
-            internal_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the upper region of global histogram */
-    for (unsigned int i = ((size_low_histo+size_mid_histo)/4) + start_offset; i < (histo_height*histo_width)/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_intermediates.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_intermediates.cl
deleted file mode 100644
index 509f8dfc05..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_intermediates.cl
+++ /dev/null
@@ -1,63 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-__kernel void calculateBin (
-        __const unsigned int bin,
-        __global uchar4 *sm_mapping)
-{
-        unsigned char offset  =  bin        %   4;
-        unsigned char indexlo = (bin >>  2) % 256;
-        unsigned char indexhi = (bin >> 10) %  KB;
-        unsigned char block   =  bin / BINS_PER_BLOCK;
-
-        offset *= 8;
-
-        uchar4 sm;
-        sm.x = block;
-        sm.y = indexhi;
-        sm.z = indexlo;
-        sm.w = offset;
-
-        *sm_mapping = sm;
-}
-
-__kernel void histo_intermediates_kernel (
-        __global uint2 *input,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        __global uchar4 *sm_mappings)
-{
-        int threadIdxx = get_local_id(0);
-        int blockDimx = get_local_size(0);
-        unsigned int line = UNROLL * (get_group_id(0));// 16 is the unroll factor;
-
-        __global uint2 *load_bin = input + line * input_pitch + threadIdxx;
-
-        unsigned int store = line * width + threadIdxx;
-        bool skip = (width % 2) && (threadIdxx == (blockDimx - 1));
-
-        #pragma unroll
-        for (int i = 0; i < UNROLL; i++)
-        {
-                uint2 bin_value = *load_bin;
-
-                calculateBin (
-                        bin_value.x,
-                        &sm_mappings[store]
-                );
-
-                if (!skip) calculateBin (
-                        bin_value.y,
-                        &sm_mappings[store + blockDimx]
-                );
-
-                load_bin += input_pitch;
-                store += width;
-        }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_main.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_main.cl
deleted file mode 100644
index 808daf56e6..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_main.cl
+++ /dev/null
@@ -1,189 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-
-void testIncrementGlobal (
-        __global unsigned int *global_histo,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are outside the central region of histogram */
-        if (range < sm_range_min || range > sm_range_max)
-        {
-                const unsigned int bin = range * BINS_PER_BLOCK + offset / 8 + (indexlo << 2) + (indexhi << 10);
-                const unsigned int bin_div2 = bin / 2;
-                const unsigned int bin_offset = (bin % 2 == 1) ? 16 : 0;
-
-                unsigned int old_val = global_histo[bin_div2];
-                unsigned short old_bin = (old_val >> bin_offset) & 0xFFFF;
-
-                if (old_bin < 255)
-                {
-                        atom_add (&global_histo[bin_div2], 1 << bin_offset);
-                }
-        }
-}
-
-void testIncrementLocal (
-        __global unsigned int *global_overflow,
-        __local unsigned int smem[KB][256],
-        const unsigned int myRange,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are inside the central region of histogram */
-        if (range == myRange)
-        {
-                /* Atomically increment shared memory */
-                unsigned int add = (unsigned int)(1 << offset);
-                unsigned int prev = atom_add (&smem[indexhi][indexlo], add);
-
-                /* Check if current bin overflowed */
-                unsigned int prev_bin_val = (prev >> offset) & 0x000000FF;
-
-                /* If there was an overflow, record it and record if it cascaded into other bins */
-                if (prev_bin_val == 0x000000FF)
-                {
-                        const unsigned int bin =
-                                range * BINS_PER_BLOCK +
-                                offset / 8 + (indexlo << 2) + (indexhi << 10);
-
-                        bool can_overflow_to_bin_plus_1 = (offset < 24) ? true : false;
-                        bool can_overflow_to_bin_plus_2 = (offset < 16) ? true : false;
-                        bool can_overflow_to_bin_plus_3 = (offset <  8) ? true : false;
-
-                        bool overflow_into_bin_plus_1 = false;
-                        bool overflow_into_bin_plus_2 = false;
-                        bool overflow_into_bin_plus_3 = false;
-
-                        unsigned int prev_bin_plus_1_val = (prev >> (offset +  8)) & 0x000000FF;
-                        unsigned int prev_bin_plus_2_val = (prev >> (offset + 16)) & 0x000000FF;
-                        unsigned int prev_bin_plus_3_val = (prev >> (offset + 24)) & 0x000000FF;
-
-                        if (can_overflow_to_bin_plus_1 &&        prev_bin_val == 0x000000FF) overflow_into_bin_plus_1 = true;
-                        if (can_overflow_to_bin_plus_2 && prev_bin_plus_1_val == 0x000000FF) overflow_into_bin_plus_2 = true;
-                        if (can_overflow_to_bin_plus_3 && prev_bin_plus_2_val == 0x000000FF) overflow_into_bin_plus_3 = true;
-
-                        unsigned int bin_plus_1_add;
-                        unsigned int bin_plus_2_add;
-                        unsigned int bin_plus_3_add;
-
-                        if (overflow_into_bin_plus_1) bin_plus_1_add = (prev_bin_plus_1_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_2) bin_plus_2_add = (prev_bin_plus_2_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_3) bin_plus_3_add = (prev_bin_plus_3_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-
-                                                      atom_add (&global_overflow[bin],   256);
-                        if (overflow_into_bin_plus_1) atom_add (&global_overflow[bin+1], bin_plus_1_add);
-                        if (overflow_into_bin_plus_2) atom_add (&global_overflow[bin+2], bin_plus_2_add);
-                        if (overflow_into_bin_plus_3) atom_add (&global_overflow[bin+3], bin_plus_3_add);
-                }
-        }
-}
-
-void clearMemory (__local unsigned int smem[KB][256])
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK / 4; i += blockDimx)
-        {
-                ((__local unsigned int*)smem)[i] = 0;
-        }
-}
-
-void copyMemory (__global unsigned int *dst, __local unsigned int src[KB][256])
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK/4; i += blockDimx)
-        {
-                dst[i] = ((__local unsigned int*)src)[i];
-        }
-}
-
-__kernel void histo_main_kernel (
-        __global uchar4 *sm_mappings,
-        unsigned int num_elements,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        __global unsigned int *global_subhisto,
-        __global unsigned int *global_histo,
-        __global unsigned int *global_overflow)
-{
-        /* Most optimal solution uses 24 * 1024 bins per threadblock */
-        __local unsigned int sub_histo[KB][256];
-
-        /* Each threadblock contributes to a specific 24KB range of histogram,
-         * and also scans every N-th line for interesting data.  N = gridDim.x
-         */
-        unsigned int blockDimx = get_local_size(0);
-        unsigned int gridDimx = get_num_groups(0);
-        unsigned int local_scan_range = sm_range_min + get_group_id(1);
-        unsigned int local_scan_load = get_group_id(0) * blockDimx + get_local_id(0);
-
-        clearMemory (sub_histo);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (get_group_id(1) == 0)
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                        testIncrementGlobal (
-                                global_histo,
-                                sm_range_min,
-                                sm_range_max,
-                                sm
-                        );
-                }
-        }
-        else
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                }
-        }
-
-        /* Store sub histogram to global memory */
-        unsigned int store_index = get_group_id(0) * (histo_height * histo_width / 4) + (local_scan_range * BINS_PER_BLOCK / 4);//(local_scan_range * BINS_PER_BLOCK);
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-        copyMemory (&(global_subhisto[store_index]), sub_histo);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_prescan.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_prescan.cl
deleted file mode 100644
index c1f85a5eec..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/histo_prescan.cl
+++ /dev/null
@@ -1,85 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
-
-__kernel void histo_prescan_kernel (__global unsigned int* input, int size, __global unsigned int* minmax)
-{
-
-    __local float Avg[PRESCAN_THREADS];
-    __local float StdDev[PRESCAN_THREADS];
-
-    int threadIdxx = get_local_id(0);
-    int blockDimx = get_local_size(0);
-    int blockIdxx = get_group_id(0);
-    int stride = size/(get_num_groups(0));
-    int addr = blockIdxx*stride+threadIdxx;
-    int end = blockIdxx*stride + stride/8; // Only sample 1/8th of the input data
-
-    // Compute the average per thread
-    float avg = 0.0;
-    unsigned int count = 0;
-    while (addr < end){
-        avg += input[addr];
-        count++;
-	addr += blockDimx;
-    }
-    avg /= count;
-    Avg[threadIdxx] = avg;
-
-    // Compute the standard deviation per thread
-    int addr2 = blockIdxx*stride+threadIdxx;
-    float stddev = 0;
-    while (addr2 < end){
-        stddev += (input[addr2]-avg)*(input[addr2]-avg);
-        addr2 += blockDimx;
-    }
-    stddev /= count;
-    StdDev[threadIdxx] = sqrt(stddev);
-
-#define SUM(stride__)\
-if(threadIdxx < stride__){\
-    Avg[threadIdxx] += Avg[threadIdxx+stride__];\
-    StdDev[threadIdxx] += StdDev[threadIdxx+stride__];\
-}
-
-    // Add all the averages and standard deviations from all the threads
-    // and take their arithmetic average (as a simplified approximation of the
-    // real average and standard deviation.
-#if (PRESCAN_THREADS >= 32)    
-    for (int stride = PRESCAN_THREADS/2; stride >= 32; stride = stride >> 1){
-	barrier(CLK_LOCAL_MEM_FENCE);
-	SUM(stride);
-    }
-#endif
-#if (PRESCAN_THREADS >= 16)
-    SUM(16);
-#endif
-#if (PRESCAN_THREADS >= 8)
-    SUM(8);
-#endif
-#if (PRESCAN_THREADS >= 4)
-    SUM(4);
-#endif
-#if (PRESCAN_THREADS >= 2)
-    SUM(2);
-#endif
-
-    if (threadIdxx == 0){
-        float avg = Avg[0]+Avg[1];
-	avg /= PRESCAN_THREADS;
-	float stddev = StdDev[0]+StdDev[1];
-	stddev /= PRESCAN_THREADS;
-
-        // Take the maximum and minimum range from all the blocks. This will
-        // be the final answer. The standard deviation is taken out to 10 sigma
-        // away from the average. The value 10 was obtained empirically.
-	    atom_min(minmax,((unsigned int)(avg-10*stddev))/(KB*1024));
-        atom_max(minmax+1,((unsigned int)(avg+10*stddev))/(KB*1024));
-    }  
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/kernel.cl b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/kernel.cl
deleted file mode 100644
index afcd77e192..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/kernel.cl
+++ /dev/null
@@ -1,456 +0,0 @@
-
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-
-__kernel void histo_prescan_kernel (__global unsigned int* input, long  b1,
-    int size,
-    __global unsigned int* minmax, long b2,
-    __local float* Avg, long b3,
-    __local float* StdDev, long b4)
-{
-
-    /*__local float Avg[PRESCAN_THREADS];*/
-    /*__local float StdDev[PRESCAN_THREADS];*/
-
-    int threadIdxx = get_local_id(0);
-    int blockDimx = get_local_size(0);
-    int blockIdxx = get_group_id(0);
-    int stride = size/(get_num_groups(0));
-    int addr = blockIdxx*stride+threadIdxx;
-    int end = blockIdxx*stride + stride/8; // Only sample 1/8th of the input data
-
-    // Compute the average per thread
-    float avg = 0.0;
-    unsigned int count = 0;
-    while (addr < end){
-        avg += input[addr];
-        count++;
-	addr += blockDimx;
-    }
-    avg /= count;
-    Avg[threadIdxx] = avg;
-
-    // Compute the standard deviation per thread
-    int addr2 = blockIdxx*stride+threadIdxx;
-    float stddev = 0;
-    while (addr2 < end){
-        stddev += (input[addr2]-avg)*(input[addr2]-avg);
-        addr2 += blockDimx;
-    }
-    stddev /= count;
-    StdDev[threadIdxx] = sqrt(stddev);
-
-#define SUM(stride__)\
-if(threadIdxx < stride__){\
-    Avg[threadIdxx] += Avg[threadIdxx+stride__];\
-    StdDev[threadIdxx] += StdDev[threadIdxx+stride__];\
-}
-
-    // Add all the averages and standard deviations from all the threads
-    // and take their arithmetic average (as a simplified approximation of the
-    // real average and standard deviation.
-#if (PRESCAN_THREADS >= 32)    
-    for (int stride = PRESCAN_THREADS/2; stride >= 32; stride = stride >> 1){
-	barrier(CLK_LOCAL_MEM_FENCE);
-	SUM(stride);
-    }
-#endif
-#if (PRESCAN_THREADS >= 16)
-    SUM(16);
-#endif
-#if (PRESCAN_THREADS >= 8)
-    SUM(8);
-#endif
-#if (PRESCAN_THREADS >= 4)
-    SUM(4);
-#endif
-#if (PRESCAN_THREADS >= 2)
-    SUM(2);
-#endif
-
-    if (threadIdxx == 0){
-        float avg = Avg[0]+Avg[1];
-	avg /= PRESCAN_THREADS;
-	float stddev = StdDev[0]+StdDev[1];
-	stddev /= PRESCAN_THREADS;
-
-        // Take the maximum and minimum range from all the blocks. This will
-        // be the final answer. The standard deviation is taken out to 10 sigma
-        // away from the average. The value 10 was obtained empirically.
-	    atom_min(minmax,((unsigned int)(avg-10*stddev))/(KB*1024));
-        atom_max(minmax+1,((unsigned int)(avg+10*stddev))/(KB*1024));
-    }  
-}
-
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-__kernel void calculateBin (
-        __const unsigned int bin,
-        __global uchar4 *sm_mapping)
-{
-        unsigned char offset  =  bin        %   4;
-        unsigned char indexlo = (bin >>  2) % 256;
-        unsigned char indexhi = (bin >> 10) %  KB;
-        unsigned char block   =  bin / BINS_PER_BLOCK;
-
-        offset *= 8;
-
-        uchar4 sm;
-        sm.x = block;
-        sm.y = indexhi;
-        sm.z = indexlo;
-        sm.w = offset;
-
-        *sm_mapping = sm;
-}
-
-__kernel void histo_intermediates_kernel (
-        __global uint2 *input, long b1,
-        unsigned int height,
-        unsigned int width,
-        unsigned int input_pitch,
-        __global uchar4 *sm_mappings, long b2)
-{
-        int threadIdxx = get_local_id(0);
-        int blockDimx = get_local_size(0);
-        unsigned int line = UNROLL * (get_group_id(0));// 16 is the unroll factor;
-
-        __global uint2 *load_bin = input + line * input_pitch + threadIdxx;
-
-        unsigned int store = line * width + threadIdxx;
-        bool skip = (width % 2) && (threadIdxx == (blockDimx - 1));
-
-        #pragma unroll
-        for (int i = 0; i < UNROLL; i++)
-        {
-                uint2 bin_value = *load_bin;
-
-                calculateBin (
-                        bin_value.x,
-                        &sm_mappings[store]
-                );
-
-                if (!skip) calculateBin (
-                        bin_value.y,
-                        &sm_mappings[store + blockDimx]
-                );
-
-                load_bin += input_pitch;
-                store += width;
-        }
-}
-
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-
-void testIncrementGlobal (
-        __global unsigned int *global_histo,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are outside the central region of histogram */
-        if (range < sm_range_min || range > sm_range_max)
-        {
-                const unsigned int bin = range * BINS_PER_BLOCK + offset / 8 + (indexlo << 2) + (indexhi << 10);
-                const unsigned int bin_div2 = bin / 2;
-                const unsigned int bin_offset = (bin % 2 == 1) ? 16 : 0;
-
-                unsigned int old_val = global_histo[bin_div2];
-                unsigned short old_bin = (old_val >> bin_offset) & 0xFFFF;
-
-                if (old_bin < 255)
-                {
-                        atom_add (&global_histo[bin_div2], 1 << bin_offset);
-                }
-        }
-}
-
-void testIncrementLocal (
-        __global unsigned int *global_overflow,
-        __local unsigned int smem[KB][256],
-        const unsigned int myRange,
-        const uchar4 sm)
-{
-        const unsigned int range = sm.x;
-        const unsigned int indexhi = sm.y;
-        const unsigned int indexlo = sm.z;
-        const unsigned int offset  = sm.w;
-
-        /* Scan for inputs that are inside the central region of histogram */
-        if (range == myRange)
-        {
-                /* Atomically increment shared memory */
-                unsigned int add = (unsigned int)(1 << offset);
-                unsigned int prev = atom_add (&smem[indexhi][indexlo], add);
-
-                /* Check if current bin overflowed */
-                unsigned int prev_bin_val = (prev >> offset) & 0x000000FF;
-
-                /* If there was an overflow, record it and record if it cascaded into other bins */
-                if (prev_bin_val == 0x000000FF)
-                {
-                        const unsigned int bin =
-                                range * BINS_PER_BLOCK +
-                                offset / 8 + (indexlo << 2) + (indexhi << 10);
-
-                        bool can_overflow_to_bin_plus_1 = (offset < 24) ? true : false;
-                        bool can_overflow_to_bin_plus_2 = (offset < 16) ? true : false;
-                        bool can_overflow_to_bin_plus_3 = (offset <  8) ? true : false;
-
-                        bool overflow_into_bin_plus_1 = false;
-                        bool overflow_into_bin_plus_2 = false;
-                        bool overflow_into_bin_plus_3 = false;
-
-                        unsigned int prev_bin_plus_1_val = (prev >> (offset +  8)) & 0x000000FF;
-                        unsigned int prev_bin_plus_2_val = (prev >> (offset + 16)) & 0x000000FF;
-                        unsigned int prev_bin_plus_3_val = (prev >> (offset + 24)) & 0x000000FF;
-
-                        if (can_overflow_to_bin_plus_1 &&        prev_bin_val == 0x000000FF) overflow_into_bin_plus_1 = true;
-                        if (can_overflow_to_bin_plus_2 && prev_bin_plus_1_val == 0x000000FF) overflow_into_bin_plus_2 = true;
-                        if (can_overflow_to_bin_plus_3 && prev_bin_plus_2_val == 0x000000FF) overflow_into_bin_plus_3 = true;
-
-                        unsigned int bin_plus_1_add;
-                        unsigned int bin_plus_2_add;
-                        unsigned int bin_plus_3_add;
-
-                        if (overflow_into_bin_plus_1) bin_plus_1_add = (prev_bin_plus_1_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_2) bin_plus_2_add = (prev_bin_plus_2_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-                        if (overflow_into_bin_plus_3) bin_plus_3_add = (prev_bin_plus_3_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-
-                                                      atom_add (&global_overflow[bin],   256);
-                        if (overflow_into_bin_plus_1) atom_add (&global_overflow[bin+1], bin_plus_1_add);
-                        if (overflow_into_bin_plus_2) atom_add (&global_overflow[bin+2], bin_plus_2_add);
-                        if (overflow_into_bin_plus_3) atom_add (&global_overflow[bin+3], bin_plus_3_add);
-                }
-        }
-}
-
-void clearMemory (__local unsigned int smem[KB][256])
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK / 4; i += blockDimx)
-        {
-                ((__local unsigned int*)smem)[i] = 0;
-        }
-}
-
-void copyMemory (__global unsigned int *dst, __local unsigned int src[KB][256])
-{
-        for (int i = get_local_id(0), blockDimx = get_local_size(0); i < BINS_PER_BLOCK/4; i += blockDimx)
-        {
-                dst[i] = ((__local unsigned int*)src)[i];
-        }
-}
-
-#define sub_histo(x,y) sub_histo[(x)*256+(y)]
-__kernel void histo_main_kernel (
-        __global uchar4 *sm_mappings, long b1,
-        unsigned int num_elements,
-        unsigned int sm_range_min,
-        unsigned int sm_range_max,
-        unsigned int histo_height,
-        unsigned int histo_width,
-        __global unsigned int *global_subhisto, long b3,
-        __global unsigned int *global_histo, long b4,
-        __global unsigned int *global_overflow, long b5,
-        __local unsigned int* sub_histo, long b6)
-{
-        /* Most optimal solution uses 24 * 1024 bins per threadblock */
-        /*__local unsigned int sub_histo[KB][256];*/
-
-        /* Each threadblock contributes to a specific 24KB range of histogram,
-         * and also scans every N-th line for interesting data.  N = gridDim.x
-         */
-        unsigned int blockDimx = get_local_size(0);
-        unsigned int gridDimx = get_num_groups(0);
-        unsigned int local_scan_range = sm_range_min + get_group_id(1);
-        unsigned int local_scan_load = get_group_id(0) * blockDimx + get_local_id(0);
-
-        clearMemory (sub_histo);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (get_group_id(1) == 0)
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                        testIncrementGlobal (
-                                global_histo,
-                                sm_range_min,
-                                sm_range_max,
-                                sm
-                        );
-                }
-        }
-        else
-        {
-                /* Loop through and scan the input */
-                while (local_scan_load < num_elements)
-                {
-                        /* Read buffer */
-                        uchar4 sm = sm_mappings[local_scan_load];
-                        local_scan_load += blockDimx * gridDimx;
-
-                        /* Check input */
-                        testIncrementLocal (
-                                global_overflow,
-                                sub_histo,
-                                local_scan_range,
-                                sm
-                        );
-                }
-        }
-
-        /* Store sub histogram to global memory */
-        unsigned int store_index = get_group_id(0) * (histo_height * histo_width / 4) + (local_scan_range * BINS_PER_BLOCK / 4);//(local_scan_range * BINS_PER_BLOCK);
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-        copyMemory (&(global_subhisto[store_index]), sub_histo);
-}
-
-
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-/* Combine all the sub-histogram results into one final histogram */
-__kernel void histo_final_kernel (
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    __global unsigned int *global_subhisto, long b1,
-    __global unsigned int *global_histo, long b2,
-    __global unsigned int *global_overflow, long b3,
-    __global unsigned int *final_histo, long b4) //final output
-{
-    unsigned int blockDimx = get_local_size(0);
-    unsigned int gridDimx = get_num_groups(0);
-    unsigned int start_offset = get_local_id(0) + get_group_id(0) * blockDimx;
-    const ushort4 zero_short  = {0, 0, 0, 0};
-    const uint4 zero_int      = {0, 0, 0, 0};
-
-    unsigned int size_low_histo = sm_range_min * BINS_PER_BLOCK;
-    unsigned int size_mid_histo = (sm_range_max - sm_range_min +1) * BINS_PER_BLOCK;
-
-    /* Clear lower region of global histogram */
-    for (unsigned int i = start_offset; i < size_low_histo/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            (unsigned char) global_histo_data.x,
-            (unsigned char) global_histo_data.y,
-            (unsigned char) global_histo_data.z,
-            (unsigned char) global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the middle region of the overflow buffer */
-    for (unsigned int i = (size_low_histo/4) + start_offset; i < (size_low_histo+size_mid_histo)/4; i += gridDimx * blockDimx)
-    {
-        uint4 global_histo_data = ((__global uint4*)global_overflow)[i];
-        ((__global uint4*)global_overflow)[i] = zero_int;
-
-        uint4 internal_histo_data = (uint4)(
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        #pragma unroll
-        for (int j = 0; j < BLOCK_X; j++)
-        {
-            unsigned int bin4in = ((__global unsigned int*)global_subhisto)[i + j * histo_height * histo_width / 4];
-            internal_histo_data.x += (bin4in >>  0) & 0xFF;
-            internal_histo_data.y += (bin4in >>  8) & 0xFF;
-            internal_histo_data.z += (bin4in >> 16) & 0xFF;
-            internal_histo_data.w += (bin4in >> 24) & 0xFF;
-        }
-
-        internal_histo_data.x = min (internal_histo_data.x, (uint) 255);
-        internal_histo_data.y = min (internal_histo_data.y, (uint) 255);
-        internal_histo_data.z = min (internal_histo_data.z, (uint) 255);
-        internal_histo_data.w = min (internal_histo_data.w, (uint) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            internal_histo_data.x,
-            internal_histo_data.y,
-            internal_histo_data.z,
-            internal_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the upper region of global histogram */
-    for (unsigned int i = ((size_low_histo+size_mid_histo)/4) + start_offset; i < (histo_height*histo_width)/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((__global ushort4*)global_histo)[i];
-        ((__global ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) (
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        );
-
-        ((__global uchar4*)final_histo)[i] = final_histo_data;
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/main.cpp b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/main.cpp
deleted file mode 100644
index 8aa8f3f3ca..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/main.cpp
+++ /dev/null
@@ -1,480 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <CL/cl.h>
-
-#include "util.h"
-#include "OpenCL_common.h"
-
-#define BLOCK_X         14
-
-#define PRESCAN_THREADS     512
-#define PRESCAN_BLOCKS_X    64
-
-#define UNROLL 16
-
-/******************************************************************************
-* Implementation: GPU
-* Details:
-* in the GPU implementation of histogram, we begin by computing the span of the
-* input values into the histogram. Then the histogramming computation is carried
-* out by a (BLOCK_X, BLOCK_Y) sized grid, where every group of Y (same X)
-* computes its own partial histogram for a part of the input, and every Y in the
-* group exclusively writes to a portion of the span computed in the beginning.
-* Finally, a reduction is performed to combine all the partial histograms into
-* the final result.
-******************************************************************************/
-
-int main(int argc, char* argv[]) {
-  struct pb_TimerSet timers;
-  struct pb_Parameters *parameters;
-
-  parameters = pb_ReadParameters(&argc, argv);
-  if (!parameters)
-    return -1;
-
-  if(!parameters->inpFiles[0]){
-    fputs("Input file expected\n", stderr);
-    return -1;
-  }
-
-  char oclOverhead[] = "OCL Overhead";
-  char prescans[] = "PreScanKernel";
-  char postpremems[] = "PostPreMems";
-  char intermediates[] = "IntermediatesKernel";
-  char mains[] = "MainKernel";
-  char finals[] = "FinalKernel";
-
- //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  int numIterations;
-  if (argc >= 2){
-    numIterations = atoi(argv[1]);
-  } else {
-    fputs("Expected at least one command line argument\n", stderr);
-    return -1;
-  }
-
-  unsigned int img_width, img_height;
-  unsigned int histo_width, histo_height;
-  unsigned int lmemKB;
-  unsigned int nThreads;
-  unsigned int bins_per_block;
-
-  FILE* f = fopen(parameters->inpFiles[0],"rb");
-  int result = 0;
-
-  result += fread(&img_width,    sizeof(unsigned int), 1, f);
-  result += fread(&img_height,   sizeof(unsigned int), 1, f);
-  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-  result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-  if (result != 4){
-    fputs("Error reading input and output dimensions from file\n", stderr);
-    return -1;
-  }
-
-  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-
-  result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-  fclose(f);
-
-  if (result != img_width*img_height){
-    fputs("Error reading input array from file\n", stderr);
-    return -1;
-  }
-
-  pb_InitializeTimerSet(&timers);
-
-  pb_AddSubTimer(&timers, oclOverhead, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, prescans, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, postpremems, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, intermediates, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, mains, visc_TimerID_COMPUTATION);
-  pb_AddSubTimer(&timers, finals, visc_TimerID_COMPUTATION);
-
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-
-  cl_int ciErrNum;
-  cl_platform_id clPlatform;
-  int deviceType = CL_DEVICE_TYPE_GPU;
-  cl_device_id clDevice;
-  cl_context clContext;
-  cl_command_queue clCommandQueue;
-
-  cl_program clProgram[4];
-  //cl_program clProgram;
-
-  cl_kernel histo_prescan_kernel;
-  cl_kernel histo_intermediates_kernel;
-  cl_kernel histo_main_kernel;
-  cl_kernel histo_final_kernel;
-
-  int even_width = ((img_width+1)/2)*2;
-
-  cl_mem input;
-  cl_mem ranges;
-  cl_mem sm_mappings;
-  cl_mem global_subhisto;
-  cl_mem global_histo;
-  cl_mem global_overflow;
-  cl_mem final_histo;
-
-
-  OCL_ERRCK_RETVAL ( clGetPlatformIDs(1, &clPlatform, NULL) );
-  OCL_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, deviceType, 1, &clDevice, NULL) );
-
-  cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0};
-  clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-  //pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-
-  long unsigned int lmemSize = 0;
-  OCL_ERRCK_RETVAL ( clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &lmemSize, NULL) );
-
-  // lmemKB = lmemSize / 1024; // Should be valid, but not taken into consideration for initial programming
-
-  if (lmemSize >= 48*1024) {
-    lmemKB = 48;
-  } else if (lmemSize >= 24*1024) {
-    lmemKB = 24;
-  } else {
-    lmemKB = 8;
-  }
-
-  lmemKB = 24;
-
-  bins_per_block = lmemKB * 1024;
-
-  switch (lmemKB) {
-    case 48: nThreads = 1024; break;
-    case 24: nThreads = 768; break;
-    default: nThreads = 512; break;
-  }
-
-
-
-  size_t program_length[4];
-  //size_t program_length;
-  const char *source_path[4] = { "src/opencl_nvidia/histo_prescan.cl",
-    "src/opencl_nvidia/histo_intermediates.cl", "src/opencl_nvidia/histo_main.cl","src/opencl_nvidia/histo_final.cl"};
-  //const char *source_path = { "src/opencl_nvidia/kernel.cl"};
-  char *source[4];
-  //char *source;
-
-  for (int i = 0; i < 4; ++i) {
-    //Dynamically allocate buffer for source
-    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
-    if(!source) {
-      fprintf(stderr, "Could not load program source\n"); exit(1);
-    }
-
-  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
-  	OCL_ERRCK_VAR(ciErrNum);
-
-  	free(source[i]);
-  }
-
-
-  char compileOptions[1024];
-  //                -cl-nv-verbose // Provides register info for NVIDIA devices
-  // Set all Macros referenced by kernels
-  sprintf(compileOptions, "\
-                -D PRESCAN_THREADS=%u\
-                -D KB=%u -D UNROLL=%u\
-                -D BINS_PER_BLOCK=%u -D BLOCK_X=%u",
-
-                PRESCAN_THREADS,
-                lmemKB, UNROLL,
-                bins_per_block, BLOCK_X
-            );
-
-  for (int i = 0; i < 4; ++i) {
-//fprintf(stderr, "Building Program #%d...\n", i);
-    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, compileOptions, NULL, NULL) );
-  // Get program binary
-  // Query binary (PTX file) size
-    //size_t bin_sz;
-    //ciErrNum = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bin_sz, NULL);
-    //OCL_ERRCK_VAR(ciErrNum);
- 
-    //Read binary (PTX file) to memory buffer
-    //unsigned char *bin = (unsigned char *)malloc(bin_sz);
-    //ciErrNum = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL);
-    //OCL_ERRCK_VAR(ciErrNum);
- 
-    //Save PTX to add_vectors_ocl.ptx
-    //FILE* fp = fopen("histo.nvptx.s", "wb");
-    //fwrite(bin, sizeof(char), bin_sz, fp);
-    //fclose(fp);
-    //free(bin); 
-          /*
-       char *build_log;
-       size_t ret_val_size;
-       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-       build_log = (char *)malloc(ret_val_size+1);
-       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-       	OCL_ERRCK_VAR(ciErrNum);
-
-
-       // to be carefully, terminate with \0
-       // there's no information in the reference whether the string is 0 terminated or not
-       build_log[ret_val_size] = '\0';
-
-       fprintf(stderr, "%s\n", build_log );
-       */
-  }
-
-  histo_prescan_kernel = clCreateKernel(clProgram[0], "histo_prescan_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_intermediates_kernel = clCreateKernel(clProgram[1], "histo_intermediates_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_main_kernel = clCreateKernel(clProgram[2], "histo_main_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  histo_final_kernel = clCreateKernel(clProgram[3], "histo_final_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-  input = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      even_width*(((img_height+UNROLL-1)/UNROLL)*UNROLL)*sizeof(unsigned int), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      2*sizeof(unsigned int), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  global_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*histo_height*sizeof(unsigned short), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
-      img_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-
-  // Must dynamically allocate. Too large for stack
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  unsigned int *zeroData;
-  zeroData = (unsigned int *) malloc(sizeof(unsigned int) *img_width*histo_height);
-  if (zeroData == NULL) {
-    fprintf(stderr, "Failed to allocate %ld bytes of memory!\n", sizeof(unsigned int) * img_width * histo_height);
-    exit(1);
-  }
-  memset(zeroData, 0, img_width*histo_height*sizeof(unsigned int));
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  for (int y=0; y < img_height; y++){
-    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_FALSE,
-                          y*even_width*sizeof(unsigned int), // Offset in bytes
-                          img_width*sizeof(unsigned int), // Size of data to write
-                          &img[y*img_width], // Host Source
-                          0, NULL, NULL) );
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  unsigned int img_dim = img_height*img_width;
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 0, sizeof(cl_mem), (void *)&input) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 1, sizeof(unsigned int), &img_dim) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 2, sizeof(cl_mem), (void *)&ranges) );
-
-  unsigned int half_width = (img_width+1)/2;
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(unsigned int), &img_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 3, sizeof(unsigned int), &half_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 4, sizeof(cl_mem), (void *)&sm_mappings) );
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 0, sizeof(cl_mem), (void *)&sm_mappings) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 1, sizeof(unsigned int), &img_dim) );
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 4, sizeof(unsigned int), &histo_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 5, sizeof(unsigned int), &histo_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 6, sizeof(cl_mem), (void *)&global_subhisto) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 7, sizeof(cl_mem), (void *)&global_histo) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 8, sizeof(cl_mem), (void *)&global_overflow) );
-
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(unsigned int), &histo_height) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(unsigned int), &histo_width) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 4, sizeof(cl_mem), (void *)&global_subhisto) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 5, sizeof(cl_mem), (void *)&global_histo) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 6, sizeof(cl_mem), (void *)&global_overflow) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 7, sizeof(cl_mem), (void *)&final_histo) );
-
-  size_t prescan_localWS[1] = {PRESCAN_THREADS};
-  size_t prescan_globalWS[1] = {PRESCAN_BLOCKS_X*prescan_localWS[0]};
-  size_t inter_localWS[1] = {(img_width+1)/2};
-  size_t inter_globalWS[1] = {((img_height + UNROLL-1)/UNROLL) * inter_localWS[0]};
-  size_t main_localWS[2] = {nThreads, 1};
-  size_t main_globalWS[2];  main_globalWS[0] = BLOCK_X * main_localWS[0];
-  size_t final_localWS[1] = {512};
-  size_t final_globalWS[1] = {BLOCK_X*3 * final_localWS[0]};
-
-
-  //pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-  //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-
-  for (int iter = 0; iter < numIterations; iter++) {
-    unsigned int ranges_h[2] = {UINT32_MAX, 0};
-
-    // how about something like
-    // __global__ unsigned int ranges[2];
-    // ...kernel
-    // __shared__ unsigned int s_ranges[2];
-    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
-    // __syncthreads();
-
-    // Although then removing the blocking cudaMemcpy's might cause something about
-    // concurrent kernel execution.
-    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE,
-                          0, // Offset in bytes
-                          2*sizeof(unsigned int), // Size of data to write
-                          ranges_h, // Host Source
-                          0, NULL, NULL) );
-
-  //pb_SwitchToSubTimer(&timers, prescans , pb_TimerID_KERNEL);
-  //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  pb_SwitchToSubTimer(&timers, prescans, visc_TimerID_COMPUTATION);
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_prescan_kernel, 1, 0,
-                            prescan_globalWS, prescan_localWS, 0, 0, 0) );
-
-  clFinish(clCommandQueue);
-  //pb_SwitchToSubTimer(&timers, postpremems , pb_TimerID_KERNEL);
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, ranges, CL_FALSE,
-                          0, // Offset in bytes
-                          2*sizeof(unsigned int), // Size of data to read
-                          ranges_h, // Host Source
-                          0, NULL, NULL) );
-
-  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE,
-                          0, // Offset in bytes
-                          img_width*histo_height*sizeof(unsigned int), // Size of data to write
-                          zeroData, // Host Source
-                          0, NULL, NULL) );
-
-  //pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
-  //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  pb_SwitchToSubTimer(&timers, intermediates, visc_TimerID_COMPUTATION);
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel, 1, 0,
-                            inter_globalWS, inter_localWS, 0, 0, 0) );
-
-  clFinish(clCommandQueue);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  main_globalWS[1] = ranges_h[1]-ranges_h[0]+1;
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 2, sizeof(unsigned int), &ranges_h[0]) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 3, sizeof(unsigned int), &ranges_h[1]) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &ranges_h[0]) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &ranges_h[1]) );
-
-  //pb_SwitchToSubTimer(&timers, mains, pb_TimerID_KERNEL);
-  //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  pb_SwitchToSubTimer(&timers, mains, visc_TimerID_COMPUTATION);
-
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_main_kernel, 2, 0,
-                            main_globalWS, main_localWS, 0, 0, 0) );
-
-  clFinish(clCommandQueue);
-  //pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);
-  pb_SwitchToSubTimer(&timers, finals, visc_TimerID_COMPUTATION);
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
-                            final_globalWS, final_localWS, 0, 0, 0) );
-  clFinish(clCommandQueue);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE,
-                          0, // Offset in bytes
-                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
-                          histo, // Host Source
-                          0, NULL, NULL) );
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_prescan_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_main_kernel) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
-  //OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[2]) );
-  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[3]) );
-
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_histo) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-
-
-  if (parameters->outFile) {
-    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
-  }
-
-
-
-
-  free(zeroData);
-  free(img);
-  free(histo);
-
-  //pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  printf("\n");
-  pb_FreeParameters(parameters);
-
-  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
-  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );
-
-  pb_DestroyTimerSet(&timers);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/util.cpp b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/util.cpp
deleted file mode 100644
index 266462c936..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/util.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#include "util.h"
-#include "bmp.h"
-
-// This function takes an HSV value and converts it to BMP.
-// We use this function to generate colored images with
-// Smooth spectrum traversal for the input and output images.
-RGB HSVtoRGB( float h, float s, float v )
-{
-    int i;
-    float f, p, q, t;
-    float r, g, b;
-    RGB value={0,0,0};
-
-    if( s == 0 ) {
-        r = g = b = v;
-        return value;
-    }
-    h /= 60;
-    i = floor( h );
-    f = h - i;
-    p = v * ( 1 - s );
-    q = v * ( 1 - s * f );
-    t = v * ( 1 - s * ( 1 - f ) );
-    switch( i ) {
-        case 0:
-            r = v; g = t; b = p;
-            break;
-        case 1:
-            r = q; g = v; b = p;
-            break;
-        case 2:
-            r = p; g = v; b = t;
-            break;
-        case 3:
-            r = p; g = q; b = v;
-            break;
-        case 4:
-            r = t; g = p; b = v;
-            break;
-        default:
-            r = v; g = p; b = q;
-            break;
-    }
-
-    unsigned int temp = r*255;
-    value.R = temp;
-    temp = g*255;
-    value.G = temp;
-    temp = b*255;
-    value.B = temp;
-
-    return value;
-}
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename)
-{
-    RGB* pixel_map = (RGB*) malloc (height*width*sizeof(RGB));
-
-    for (size_t y = 0; y < height; ++y)
-    {
-        for (size_t x = 0; x < width; ++x)
-        {
-            unsigned char value = histo[y * width + x];
-
-            if (value == 0){
-                pixel_map[y*width+x].R = 0;
-                pixel_map[y*width+x].G = 0;
-                pixel_map[y*width+x].B = 0;
-            } else {
-                pixel_map[y*width+x] = HSVtoRGB(0.0,1.0,cbrt(1+ 63.0*((float)value)/((float)UINT8_MAX))/4);
-            }
-        }
-    }
-    create_bmp(pixel_map, height, width, filename);
-    free(pixel_map);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/util.h b/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/util.h
deleted file mode 100644
index 8db501970c..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/opencl_nvidia/util.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef __HISTO_UTIL_H_
-#define __HISTO_UTIL_H_
-
-#define UINT8_MAX 255
-#define UINT32_MAX 4294967295
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/histo/src/threaded/FauxBlock.c b/hpvm/test/parboil/benchmarks/histo/src/threaded/FauxBlock.c
deleted file mode 100644
index 28c31df111..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/threaded/FauxBlock.c
+++ /dev/null
@@ -1,271 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <alloca.h>
-#include <assert.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <pthread.h>
-
-#include "FauxBlock.h"
-
-typedef void *(*pthrd_start_func) (void *);    // Used by pthreads as the thread func signature
-
-//=====================================================
-//=====================================================
-//=====================================================
-//=====================================================
-
-#include <stdlib.h>
-
-typedef void (*exec_block_func) (faux_block_t *);
-
-typedef struct async_runner
-{
-             pthread_t        t;
-             pthread_mutex_t  m;
-             pthread_cond_t   go;
-    volatile unsigned int     ready;
-
-             pthread_mutex_t  m2;
-             pthread_cond_t   done;
-
-    volatile exec_block_func   f;
-    volatile void *           d;
-} async_runner_t;
-
-
-
-#define MAX_CACHED_RUNNERS 12 
-static async_runner_t* runner_stack[MAX_CACHED_RUNNERS];
-static int next = -1;
-
-
-static void dispose_runner(async_runner_t *r)
-{
-    pthread_mutex_destroy (&(r->m));
-    pthread_mutex_destroy (&(r->m2));
-    pthread_cond_destroy  (&(r->go));
-    pthread_cond_destroy  (&(r->done));
-    pthread_detach((r->t));
-
-    free (r);
-}
-
-/**
- * This is the thread code that loops forever (more or less) in the
- * runner threads.
- */
-static void *thread_pool_thread(void *data)
-{
-    async_runner_t* r = (async_runner_t*)data;
-
-    exec_block_func  f;
-    void*            d;
-
-    pthread_mutex_lock (&(r->m));                   // Lock
-    do
-    {
-        // Only wait if we don't have ready signaled.  If we don't
-        // have the protective 'if', we can lose signals and hang.
-        if (r->ready == 0)
-            pthread_cond_wait(&(r->go), &(r->m));       // Wait ... mutex unlocked inside, locked before return ...
-
-        f = r->f;
-        if (f == 0)
-            goto done;
-
-        d = (void*)(r->d);
-
-        (*f)((faux_block_t*)d);
-
-        r->f = 0;
-        r->d = 0;
-        r->ready = 0;
-        // Signal that we are done.  The join() method cares ...
-        pthread_mutex_lock(&(r->m2));
-        pthread_cond_signal(&(r->done));
-        pthread_mutex_unlock(&(r->m2));
-    } while (1);
-
-done:
-    dispose_runner(r);
-
-    pthread_exit(0);
-    return 0;
-}
-
-static pthread_mutex_t runner_pool_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-/**
- * Block until the runner thread has finished handling its current task.
- */
-static void join (async_runner_t *runner)
-{
-    // Wait (if necessary) for the runner thread to tell us it is finished
-    // running its current task.
-
-    pthread_mutex_lock(&(runner->m2));
-    if (runner->f)
-        pthread_cond_wait(&(runner->done), &(runner->m2));
-    pthread_mutex_unlock(&(runner->m2));
-
-    // Get back the mutex from the worker...
-    pthread_mutex_lock (&(runner->m));
-
-    pthread_mutex_lock(&runner_pool_mutex);
-    // Return the runnable to the stack of available runnables.
-    if (next < (MAX_CACHED_RUNNERS - 1))
-    {
-        ++next;
-        runner_stack[next] = runner;
-        pthread_mutex_unlock(&runner_pool_mutex);
-    }
-    else
-    {
-        pthread_mutex_unlock(&runner_pool_mutex);
-
-        // "Run" this again, with func as null (which is how the thread left things).
-        // With a null func, the thread will exit, which is what we want since we
-        // can't access it after we fail to put it back on the stack.
-
-        pthread_cond_signal(&(runner->go));
-        pthread_mutex_unlock (&(runner->m));          // Allow the worker thread to run
-    }
-}
-
-
-
-
-
-// async_runner_t* run_async (pthrd_start_func func, void *data)
-static async_runner_t* run_async (exec_block_func func, void *data)
-{
-    async_runner_t *r = 0;
-
-    pthread_mutex_lock(&runner_pool_mutex);
-    if (next >= 0)
-    {
-        r = runner_stack[next];
-        --next;
-    }
-    pthread_mutex_unlock(&runner_pool_mutex);
-
-    if (r == 0)
-    {
-        r = (async_runner_t*)malloc (sizeof(async_runner_t));
-
-        pthread_attr_t attr;
-        pthread_attr_init(&attr);
-        pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-        pthread_mutex_init (&(r->m), 0);
-        pthread_cond_init  (&(r->go), 0);
-
-        pthread_mutex_init (&(r->m2), 0);
-        pthread_cond_init  (&(r->done), 0);
-        r->ready = 1;
-        pthread_mutex_lock (&(r->m));
-
-       pthread_create(&(r->t), &attr, thread_pool_thread, r);
-    }
-
-    r->f = func;
-    r->d = data;
-
-    // Already locked ... either from the block above, or from the join() method
-    pthread_cond_signal(&(r->go));
-    pthread_mutex_unlock (&(r->m));          // Allow the worker thread to run
-
-    return r;
-}
-//======================
-//======================
-//======================
-//======================
-
-
-/**
- *
- */
-void clear_runners()
-{
-    while (next >= 0)
-    {
-        async_runner_t *r = runner_stack[next];
-        pthread_cond_signal(&(r->go));
-        pthread_mutex_unlock (&(r->m));          // Allow the worker thread to run
-        --next;
-        // dispose_runner(r);
-    }
-}
-
-
-/**
- * Execute 'block' on this thread (so don't spin a worker thread)..
- */
-void exec_faux_block (faux_block_t *block)
-{
-    block->func((void*)(block->args));
-}
-
-//## TODO: Handle call when block still assigned to other thread
-
-
-/**
- * Execute 'len' of the passed in 'block' FauxBlocks, each on its own thread.
- */
-int exec_faux_block_deferred (faux_block_t block[], size_t len)
-{
-// printf("exec_faux_block_deferred(%p, %d)\n", block, len);
-
-    size_t i;
-
-    for (i = 0; i < len; ++i)
-    {
-        async_runner_t *r = run_async (exec_faux_block, block + i);
-        block[i].tid = r; // t;       // join will want this ...
-    }
-
-    //## TODO Handle errors ...
-    return 0;
-}
-
-
-
-void *thread_pool_thread(void *data);
-/**
- * Wait until 'len' of the passed in 'block' FauxBlocks are done executing..
- */
-void faux_block_exec_join(faux_block_t block[], size_t len)
-{
-    //## TODO: Handle join w/o tid set
-
-    size_t i;
-
-    for (i = 0; i < len; ++i)
-    {
-        join ((async_runner_t*) block[i].tid);
-        block[i].tid = 0;                  // Zero out the tid ... we aren't running in a thread anymore
-    }
-}
-
-
-void faux_block_run (faux_block_t block[], size_t len)
-{
-    exec_faux_block_deferred (block, len);
-    faux_block_exec_join     (block, len);
-}
-
-
-
-void not_enough_arg_space(void *arg)
-{
-    fprintf(stderr, "Arguments to function too large.\n");    //##TODO: It would be nice to pass the function name ...
-    exit(-50);\
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/threaded/FauxBlock.h b/hpvm/test/parboil/benchmarks/histo/src/threaded/FauxBlock.h
deleted file mode 100644
index eb24c73bd5..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/threaded/FauxBlock.h
+++ /dev/null
@@ -1,1233 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef FAUX_BLOCK_H__
-#define FAUX_BLOCK_H__
-
-#include <string.h>
-#include <stdlib.h>
-#include <pthread.h>
-
-typedef void (*void_func_t) (void  *);
-
-#define FAUX_BLOCK_MAX_INT_ARGS (64)
-
-#define FAUX_BLOCK_FUNC 
-
-typedef struct faux_block
-{
-    void_func_t func;
-    void*       tid;
-    double      args[FAUX_BLOCK_MAX_INT_ARGS/2];     // Use an array of doubles to ensure most restrictive memory alignment
-} faux_block_t;
-
-faux_block_t make_faux_block          (void_func_t func, const void *top, const void *bottom, size_t last_size);
-void         exec_faux_block          (faux_block_t *block);
-int          exec_faux_block_deferred (faux_block_t block[], size_t len);
-void         faux_block_exec_join     (faux_block_t block[], size_t len);
-void         faux_block_run           (faux_block_t block[], size_t len);
-
-void         clear_runners            ();
-
-void not_enough_arg_space(void *arg);
-
-#define MAKE_FUNC_HELPER(name__)\
-        if (sizeof(s_) > sizeof(blk.args))\
-        {\
-            blk.func = not_enough_arg_space;\
-            blk.tid  = (void*)-1;\
-        }\
-        else\
-        {\
-            blk.func = name__ ## _struct_func;  \
-            blk.tid  = 0;\
-            memcpy(blk.args, &s_, sizeof(s_));\
-        }
-
-
-
-#define MAKE_FUNC_0_ARGS(scope__, name__)\
-    static void FAUX_BLOCK_FUNC name__ ## _func();\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func();\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block ()\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func()
-
-
-
-#define MAKE_FUNC_1_ARGS(scope__, name__, t01_, v01_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_)
-
-
-
-#define MAKE_FUNC_2_ARGS(scope__, name__, t01_, v01_, t02_, v02_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_)
-
-
-#define MAKE_FUNC_3_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_)
-
-
-#define MAKE_FUNC_4_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_)
-
-
-
-#define MAKE_FUNC_5_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_)
-
-#define MAKE_FUNC_6_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_)
-
-
-#define MAKE_FUNC_7_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                          t07_, v07_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_)
-
-
-#define MAKE_FUNC_8_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                          t07_, v07_, t08_, v08_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-        t08_ v08_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_,\
-                        arg->v08_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_,\
-                                                                    t08_ v08_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_,\
-                                                 v08_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_)
-
-
-#define MAKE_FUNC_9_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                          t07_, v07_, t08_, v08_, t09_, v09_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-        t08_ v08_;\
-        t09_ v09_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_,\
-                        arg->v08_,\
-                        arg->v09_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_,\
-                                                                    t08_ v08_,\
-                                                                    t09_ v09_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_,\
-                                                 v08_,\
-                                                 v09_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_)
-
-
-#define MAKE_FUNC_10_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                           t07_, v07_, t08_, v08_, t09_, v09_, t10_, v10_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-        t08_ v08_;\
-        t09_ v09_;\
-        t10_ v10_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_,\
-                        arg->v08_,\
-                        arg->v09_,\
-                        arg->v10_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_,\
-                                                                    t08_ v08_,\
-                                                                    t09_ v09_,\
-                                                                    t10_ v10_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_,\
-                                                 v08_,\
-                                                 v09_,\
-                                                 v10_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_)
-
-
-#define MAKE_FUNC_11_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                           t07_, v07_, t08_, v08_, t09_, v09_, t10_, v10_, t11_, v11_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-        t08_ v08_;\
-        t09_ v09_;\
-        t10_ v10_;\
-        t11_ v11_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_,\
-                        arg->v08_,\
-                        arg->v09_,\
-                        arg->v10_,\
-                        arg->v11_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_,\
-                                                                    t08_ v08_,\
-                                                                    t09_ v09_,\
-                                                                    t10_ v10_,\
-                                                                    t11_ v11_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_,\
-                                                 v08_,\
-                                                 v09_,\
-                                                 v10_,\
-                                                 v11_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_)
-
-
-#define MAKE_FUNC_12_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                           t07_, v07_, t08_, v08_, t09_, v09_, t10_, v10_, t11_, v11_, t12_, v12_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-        t08_ v08_;\
-        t09_ v09_;\
-        t10_ v10_;\
-        t11_ v11_;\
-        t12_ v12_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_,\
-                        arg->v08_,\
-                        arg->v09_,\
-                        arg->v10_,\
-                        arg->v11_,\
-                        arg->v12_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_,\
-                                                                    t08_ v08_,\
-                                                                    t09_ v09_,\
-                                                                    t10_ v10_,\
-                                                                    t11_ v11_,\
-                                                                    t12_ v12_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_,\
-                                                 v08_,\
-                                                 v09_,\
-                                                 v10_,\
-                                                 v11_,\
-                                                 v12_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_)
-
-
-#define MAKE_FUNC_13_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                           t07_, v07_, t08_, v08_, t09_, v09_, t10_, v10_, t11_, v11_, t12_, v12_,\
-                                           t13_, v13_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_,\
-                                                t13_ v13_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-        t08_ v08_;\
-        t09_ v09_;\
-        t10_ v10_;\
-        t11_ v11_;\
-        t12_ v12_;\
-        t13_ v13_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_,\
-                        arg->v08_,\
-                        arg->v09_,\
-                        arg->v10_,\
-                        arg->v11_,\
-                        arg->v12_,\
-                        arg->v13_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_,\
-                                                                    t08_ v08_,\
-                                                                    t09_ v09_,\
-                                                                    t10_ v10_,\
-                                                                    t11_ v11_,\
-                                                                    t12_ v12_,\
-                                                                    t13_ v13_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_,\
-                                                 v08_,\
-                                                 v09_,\
-                                                 v10_,\
-                                                 v11_,\
-                                                 v12_,\
-                                                 v13_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_,\
-                                                t13_ v13_)
-
-#define MAKE_FUNC_14_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                           t07_, v07_, t08_, v08_, t09_, v09_, t10_, v10_, t11_, v11_, t12_, v12_,\
-                                           t13_, v13_, t14_, v14_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_,\
-                                                t13_ v13_,\
-                                                t14_ v14_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-        t08_ v08_;\
-        t09_ v09_;\
-        t10_ v10_;\
-        t11_ v11_;\
-        t12_ v12_;\
-        t13_ v13_;\
-        t14_ v14_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_,\
-                        arg->v08_,\
-                        arg->v09_,\
-                        arg->v10_,\
-                        arg->v11_,\
-                        arg->v12_,\
-                        arg->v13_,\
-                        arg->v14_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_,\
-                                                                    t08_ v08_,\
-                                                                    t09_ v09_,\
-                                                                    t10_ v10_,\
-                                                                    t11_ v11_,\
-                                                                    t12_ v12_,\
-                                                                    t13_ v13_,\
-                                                                    t14_ v14_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_,\
-                                                 v08_,\
-                                                 v09_,\
-                                                 v10_,\
-                                                 v11_,\
-                                                 v12_,\
-                                                 v13_,\
-                                                 v14_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_,\
-                                                t13_ v13_,\
-                                                t14_ v14_)
-
-
-
-#define MAKE_FUNC_15_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                           t07_, v07_, t08_, v08_, t09_, v09_, t10_, v10_, t11_, v11_, t12_, v12_,\
-                                           t13_, v13_, t14_, v14_, t15_, v15_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_,\
-                                                t13_ v13_,\
-                                                t14_ v14_,\
-                                                t15_ v15_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-        t08_ v08_;\
-        t09_ v09_;\
-        t10_ v10_;\
-        t11_ v11_;\
-        t12_ v12_;\
-        t13_ v13_;\
-        t14_ v14_;\
-        t15_ v15_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_,\
-                        arg->v08_,\
-                        arg->v09_,\
-                        arg->v10_,\
-                        arg->v11_,\
-                        arg->v12_,\
-                        arg->v13_,\
-                        arg->v14_,\
-                        arg->v15_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_,\
-                                                                    t08_ v08_,\
-                                                                    t09_ v09_,\
-                                                                    t10_ v10_,\
-                                                                    t11_ v11_,\
-                                                                    t12_ v12_,\
-                                                                    t13_ v13_,\
-                                                                    t14_ v14_,\
-                                                                    t15_ v15_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_,\
-                                                 v08_,\
-                                                 v09_,\
-                                                 v10_,\
-                                                 v11_,\
-                                                 v12_,\
-                                                 v13_,\
-                                                 v14_,\
-                                                 v15_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_,\
-                                                t13_ v13_,\
-                                                t14_ v14_,\
-                                                t15_ v15_)
-
-
-#define MAKE_FUNC_16_ARGS(scope__, name__, t01_, v01_, t02_, v02_, t03_, v03_, t04_, v04_, t05_, v05_, t06_, v06_,\
-                                           t07_, v07_, t08_, v08_, t09_, v09_, t10_, v10_, t11_, v11_, t12_, v12_,\
-                                           t13_, v13_, t14_, v14_, t15_, v15_, t16_, v16_)\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_,\
-                                                t13_ v13_,\
-                                                t14_ v14_,\
-                                                t15_ v15_,\
-                                                t16_ v16_);\
-    \
-    typedef struct fox_blocks_struct_ ## name__\
-    {\
-        t01_ v01_;\
-        t02_ v02_;\
-        t03_ v03_;\
-        t04_ v04_;\
-        t05_ v05_;\
-        t06_ v06_;\
-        t07_ v07_;\
-        t08_ v08_;\
-        t09_ v09_;\
-        t10_ v10_;\
-        t11_ v11_;\
-        t12_ v12_;\
-        t13_ v13_;\
-        t14_ v14_;\
-        t15_ v15_;\
-        t16_ v16_;\
-    } fox_blocks_struct_ ## name__ ## _t;\
-    \
-    static void name__ ## _struct_func(void *a)\
-    {\
-        fox_blocks_struct_ ## name__ ## _t *arg = (fox_blocks_struct_ ## name__ ## _t*)a;\
-        name__ ## _func(arg->v01_,\
-                        arg->v02_,\
-                        arg->v03_,\
-                        arg->v04_,\
-                        arg->v05_,\
-                        arg->v06_,\
-                        arg->v07_,\
-                        arg->v08_,\
-                        arg->v09_,\
-                        arg->v10_,\
-                        arg->v11_,\
-                        arg->v12_,\
-                        arg->v13_,\
-                        arg->v14_,\
-                        arg->v15_,\
-                        arg->v16_);\
-    }\
-    \
-    scope__ faux_block_t FAUX_BLOCK_FUNC make_ ## name__ ## _block (t01_ v01_,\
-                                                                    t02_ v02_,\
-                                                                    t03_ v03_,\
-                                                                    t04_ v04_,\
-                                                                    t05_ v05_,\
-                                                                    t06_ v06_,\
-                                                                    t07_ v07_,\
-                                                                    t08_ v08_,\
-                                                                    t09_ v09_,\
-                                                                    t10_ v10_,\
-                                                                    t11_ v11_,\
-                                                                    t12_ v12_,\
-                                                                    t13_ v13_,\
-                                                                    t14_ v14_,\
-                                                                    t15_ v15_,\
-                                                                    t16_ v16_)\
-    {\
-        faux_block_t blk;\
-        fox_blocks_struct_ ## name__ ## _t s_ = {v01_,\
-                                                 v02_,\
-                                                 v03_,\
-                                                 v04_,\
-                                                 v05_,\
-                                                 v06_,\
-                                                 v07_,\
-                                                 v08_,\
-                                                 v09_,\
-                                                 v10_,\
-                                                 v11_,\
-                                                 v12_,\
-                                                 v13_,\
-                                                 v14_,\
-                                                 v15_,\
-                                                 v16_};\
-        \
-        MAKE_FUNC_HELPER(name__)\
-        \
-        return blk;\
-    }\
-    static void FAUX_BLOCK_FUNC name__ ## _func(t01_ v01_,\
-                                                t02_ v02_,\
-                                                t03_ v03_,\
-                                                t04_ v04_,\
-                                                t05_ v05_,\
-                                                t06_ v06_,\
-                                                t07_ v07_,\
-                                                t08_ v08_,\
-                                                t09_ v09_,\
-                                                t10_ v10_,\
-                                                t11_ v11_,\
-                                                t12_ v12_,\
-                                                t13_ v13_,\
-                                                t14_ v14_,\
-                                                t15_ v15_,\
-                                                t16_ v16_)
-
-
-
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/histo/src/threaded/Makefile b/hpvm/test/parboil/benchmarks/histo/src/threaded/Makefile
deleted file mode 100644
index b9d26ad0e2..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/threaded/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=c
-SRCDIR_OBJS=main.o util.o FauxBlock.o
-APP_CFLAGS=-msse4.1
diff --git a/hpvm/test/parboil/benchmarks/histo/src/threaded/bmp.h b/hpvm/test/parboil/benchmarks/histo/src/threaded/bmp.h
deleted file mode 100644
index d1b7c1b562..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/threaded/bmp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "stdlib.h"
-
-typedef struct{
-  unsigned char B;
-  unsigned char G;
-  unsigned char R;
-} RGB;
-
-typedef struct {
-  unsigned int filesz;
-  unsigned short creator1;
-  unsigned short creator2;
-  unsigned int bmp_offset;
-} bmpfile_header_t;
-
-typedef struct {
-  unsigned int header_sz;
-  unsigned int width;
-  unsigned int height;
-  unsigned short nplanes;
-  unsigned short bitspp;
-  unsigned int compress_type;
-  unsigned int bmp_bytesz;
-  unsigned int hres;
-  unsigned int vres;
-  unsigned int ncolors;
-  unsigned int nimpcolors;
-} bmp_dib_header_t;
-
-typedef enum {
-  BI_RGB = 0,
-  BI_RLE8,
-  BI_RLE4,
-  BI_BITFIELDS,
-  BI_JPEG,
-  BI_PNG,
-} bmp_compression_method_t;
-
-typedef struct{
-  unsigned char magic[2];
-  bmpfile_header_t file_header;
-  bmp_dib_header_t dib_header;
-  unsigned int* palette;
-  void* pixel_map;
-} bmp_image;
-
-void create_bmp(RGB* bitmap, int height, int width, const char* filename){
-    bmp_image image;
-
-    int padded_width = 4*(((width*24)+31)/32);
-    padded_width -= width*sizeof(RGB);
-
-    char* pad = (char*) calloc (padded_width, sizeof(char));
-
-    image.magic[0]='B';
-    image.magic[1]='M';
-
-    image.file_header.filesz = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t) + height*width*sizeof(RGB);
-    image.file_header.creator1 = image.file_header.creator2 = 0;
-    image.file_header.bmp_offset = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t);
-
-    image.dib_header.header_sz = 40;//sizeof(bmp_dib_header_t);
-    image.dib_header.width = width;
-    image.dib_header.height = height;
-    image.dib_header.nplanes = 1;
-    image.dib_header.bitspp = 24;
-    image.dib_header.compress_type = 0;
-    image.dib_header.bmp_bytesz = width*height*sizeof(RGB);
-    image.dib_header.hres = 0;
-    image.dib_header.vres = 0;
-    image.dib_header.ncolors = 0;
-    image.dib_header.nimpcolors = 0;
-
-    FILE* out_file = fopen(filename,"wb");
-
-    fwrite(image.magic,sizeof(char),2,out_file);
-    fwrite(&(image.file_header),sizeof(char),sizeof(bmpfile_header_t),out_file);
-    fwrite(&(image.dib_header),sizeof(char),sizeof(bmp_dib_header_t),out_file);
-
-    int h;
-    for (h = height-1; h >= 0; h--){
-      fwrite(&bitmap[h*width],sizeof(RGB),width,out_file);
-      fwrite(pad,sizeof(char),padded_width,out_file);
-    }
-
-    fclose(out_file);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/threaded/main.c b/hpvm/test/parboil/benchmarks/histo/src/threaded/main.c
deleted file mode 100644
index dadf6c488a..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/threaded/main.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <smmintrin.h>
-#include <emmintrin.h>
-
-#include "FauxBlock.h"
-#include "util.h"
-
-/******************************************************************************
-* Implementation: Threaded
-* Details:
-* This implementations is a multi-threaded, SSE version of histogram. The
-* span of the data into the histogram is first determined, then every thread
-* takes an equal portion of the input and computes a partial histogram for it.
-* Finally all the partial copies are combined using SSE intrinsics to generate
-* the final histogram.
-******************************************************************************/
-
-MAKE_FUNC_7_ARGS(static, histo_scan2, int, rank, int, numThreads, int, img_height, int, img_width, unsigned int*, input, unsigned int*, Min, unsigned int*, Max)
-{
-    int stride = img_height/numThreads;
-    int start = rank*stride;
-    int end = (rank == (numThreads-1))? img_height-(numThreads-1)*stride : stride;
-
-    unsigned int minVar = UINT32_MAX;
-    unsigned int maxVar = 0;
-
-    int i, j;
-    for (j = start; j < start+end; ++j)
-    {
-        for (i = 0; i < img_width; ++i)
-        {
-            minVar = min(minVar,input[j*img_width+i]);
-            maxVar = max(maxVar,input[j*img_width+i]);
-        }
-    }
-
-    Min[rank] = minVar;
-    Max[rank] = maxVar;
-}
-
-MAKE_FUNC_10_ARGS(static, histo_thread2, int, rank, int, numThreads, int, img_height, int, img_width, unsigned int*, input, int, histo_height, int, histo_width, unsigned char*, bins, unsigned int*, Min, unsigned int*, Max)
-{
-    int stride     = img_height/numThreads;
-    unsigned int start = rank*stride;
-    unsigned int end   = (rank == (numThreads-1))? (start+img_height-(numThreads-1)*stride): (start+stride);
-
-    int min = Min[0]&~(15);
-    int max = (((Max[0]-min+(numThreads*16))/(numThreads*16))*(numThreads*16));
-
-    memset(bins+min, 0, max*sizeof(unsigned char));
-
-    int i, j; for (j = start; j < end; ++j)
-    {
-        for (i = 0; i < img_width; ++i)
-        {
-            const unsigned int value = input[j*img_width+i];
-            // Increment the appropriate bin, but do not roll-over the max value
-            if (bins[value] < UINT8_MAX){
-                ++bins[value];
-            }
-        }
-    }
-}
-
-MAKE_FUNC_7_ARGS(static, histo_merge2, int, rank, int, numThreads, int, outSize, __m128i*, out, unsigned char**, bins, unsigned int*, Min, unsigned int*, Max)
-{
-    int minVar = Min[0] & ~(15);
-    int stride = ((Max[0]-minVar+(numThreads*16))/(numThreads*16))*16;
-    int maxVar = minVar + stride*numThreads;
-
-    int topStride = minVar/numThreads;
-    int topStart = rank*topStride;
-    int topEnd;
-
-    int botStride = (outSize-maxVar)/numThreads;
-    int botStart = maxVar -1 + rank*botStride;
-    int botEnd;
-
-    int start = ((minVar+rank*stride)*sizeof(unsigned char))/sizeof(__m128i);
-    int end = start + (stride*sizeof(unsigned char))/sizeof(__m128i);
-
-    if (rank == numThreads-1){
-        topEnd = topStart + minVar - (numThreads-1)*topStride;
-        botEnd = botStart + (outSize-maxVar) - (numThreads-1)*botStride;
-    } else {
-        topEnd = topStart + topStride;
-        botEnd = botStart + botStride;
-    }
-
-    memset(((char*)out)+topStart, 0, (topEnd-topStart)*sizeof(char));
-    memset(((char*)out)+botStart, 0, (botEnd-botStart)*sizeof(char));
-
-    int i, j;
-    for (i= start; i < end; i++){
-        __m128i acc = _mm_load_si128(((__m128i*)(bins[0]))+i);
-        for (j=1; j<numThreads; j++){
-            __m128i b = _mm_load_si128(((__m128i*)(bins[j]))+i);
-            acc = _mm_adds_epu8(acc,b);
-        }
-        _mm_store_si128 (out+i, acc);
-    }
-}
-
-int main(int argc, char* argv[]) {
-  struct pb_TimerSet timers;
-  struct pb_Parameters *parameters;
-
-  parameters = pb_ReadParameters(&argc, argv);
-  if (!parameters)
-    return -1;
-
-  if(!parameters->inpFiles[0]){
-    fputs("Input file expected\n", stderr);
-    return -1;
-  }
-
-  pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  int numIterations;
-  int numThreads = 2;
-
-  if (argc >= 2){
-    numIterations = atoi(argv[1]);
-  } else {
-    fputs("Expected at least one command line argument\n", stderr);
-    return -1;
-  }
-
-  if (argc >= 3){
-    numThreads = atoi(argv[2]);
-    printf("Number of threads = %d\n", numThreads);
-  } else {
-    printf("Number of threads = %d (default)\n", numThreads);
-  }
-
-  unsigned int img_width, img_height;
-  unsigned int histo_width, histo_height;
-
-  FILE* f = fopen(parameters->inpFiles[0],"rb");
-  int result = 0;
-
-  result += fread(&img_width,    sizeof(unsigned int), 1, f);
-  result += fread(&img_height,   sizeof(unsigned int), 1, f);
-  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-  result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-  if (result != 4){
-    fputs("Error reading input and output dimensions from file\n", stderr);
-    return -1;
-  }
-
-  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-
-  result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-  fclose(f);
-
-  if (result != img_width*img_height){
-    fputs("Error reading input array from file\n", stderr);
-    return -1;
-  }
-
-  int new_size = (((histo_height*histo_width)+(numThreads*16-1))/(numThreads*16))*(numThreads*16);
-
-  unsigned char** bins = (unsigned char**) malloc (numThreads*sizeof(unsigned char*));
-  faux_block_t* blk  = (faux_block_t*) malloc (numThreads*sizeof(faux_block_t));
-  faux_block_t* blk1 = (faux_block_t*) malloc (numThreads*sizeof(faux_block_t));
-  faux_block_t* blk2 = (faux_block_t*) malloc (numThreads*sizeof(faux_block_t));
-  unsigned int* Min  = (unsigned int*) malloc (numThreads*sizeof(unsigned int));
-  unsigned int* Max  = (unsigned int*) malloc (numThreads*sizeof(unsigned int));
-  __m128i* out  = (__m128i*) calloc (new_size,sizeof(unsigned char));
-
-  int iter, i;
-  for (iter = 0; iter < 1000; iter++){
-    memset(histo,0,histo_height*histo_width*sizeof(unsigned char));
-    for (i = 0; i < img_width*img_height; ++i) {
-      const unsigned int value = img[i];
-      if (histo[value] < UINT8_MAX) {
-        ++histo[value];
-      }
-    }
-  }
-
-  for (i=0; i< numThreads;i++){
-    bins[i] = (unsigned char*) calloc (histo_height*histo_width,sizeof(unsigned char));
-    blk[i]  = make_histo_scan2_block(i, numThreads, img_height, img_width, img, Min, Max);
-    blk1[i] = make_histo_thread2_block(i,numThreads,img_height,img_width,img,histo_height,histo_width,bins[i],Min,Max);
-    blk2[i] = make_histo_merge2_block(i,numThreads,new_size,out,bins,Min,Max);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  for (iter = 0; iter < 1000; iter++) {
-    memset(out, 0, new_size);
-
-    faux_block_run (blk,numThreads);
-
-    for (i=1;i<numThreads;i++){
-      Min[0] = min(Min[0],Min[i]);
-      Max[0] = max(Max[0],Max[i]);
-    }
-
-    faux_block_run (blk1,numThreads);
-    faux_block_run (blk2,numThreads);  
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  for (i=0; i < histo_height*histo_width; i++){
-    histo[i] = ((unsigned char*)out)[i];
-  }
-
-  for (i=0; i<numThreads;i++){
-    free(bins[i]);
-  }
-  free(bins);
-  free(blk);
-  free(blk1);
-  free(blk2);
-  free(Min);
-  free(Max);
-
-  if (parameters->outFile) {
-    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  free(img);
-  free(histo);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(parameters);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/threaded/util.c b/hpvm/test/parboil/benchmarks/histo/src/threaded/util.c
deleted file mode 100644
index 7a43056dae..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/threaded/util.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#include "bmp.h"
-
-// This function takes an HSV value and converts it to BMP.
-// We use this function to generate colored images with
-// Smooth spectrum traversal for the input and output images.
-RGB HSVtoRGB( float h, float s, float v )
-{
-    int i;
-    float f, p, q, t;
-    float r, g, b;
-    RGB value={0,0,0};
-
-    if( s == 0 ) {
-        r = g = b = v;
-        return value;
-    }
-    h /= 60;
-    i = floor( h );
-    f = h - i;
-    p = v * ( 1 - s );
-    q = v * ( 1 - s * f );
-    t = v * ( 1 - s * ( 1 - f ) );
-    switch( i ) {
-        case 0:
-            r = v; g = t; b = p;
-            break;
-        case 1:
-            r = q; g = v; b = p;
-            break;
-        case 2:
-            r = p; g = v; b = t;
-            break;
-        case 3:
-            r = p; g = q; b = v;
-            break;
-        case 4:
-            r = t; g = p; b = v;
-            break;
-        default:
-            r = v; g = p; b = q;
-            break;
-    }
-
-    unsigned int temp = r*255;
-    value.R = temp;
-    temp = g*255;
-    value.G = temp;
-    temp = b*255;
-    value.B = temp;
-
-    return value;
-}
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename)
-{
-    RGB* pixel_map = (RGB*) malloc (height*width*sizeof(RGB));
-
-    size_t x, y;
-    for (y = 0; y < height; ++y)
-    {
-        for (x = 0; x < width; ++x)
-        {
-            unsigned char value = histo[y * width + x];
-
-            if (value == 0){
-                pixel_map[y*width+x].R = 0;
-                pixel_map[y*width+x].G = 0;
-                pixel_map[y*width+x].B = 0;
-            } else {
-                pixel_map[y*width+x] = HSVtoRGB(0.0,1.0,cbrt(1+ 63.0*((float)value)/((float)UINT8_MAX))/4);
-            }
-        }
-    }
-    create_bmp(pixel_map, height, width, filename);
-    free(pixel_map);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/threaded/util.h b/hpvm/test/parboil/benchmarks/histo/src/threaded/util.h
deleted file mode 100644
index ee4e2ce397..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/threaded/util.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#define UINT32_MAX 4294967295
-#define UINT8_MAX 255
-#define NUM_PROCS 8
-
-#define min(x,y) ((x<y)?x:y)
-#define max(x,y) ((x>y)?x:y)
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename);
diff --git a/hpvm/test/parboil/benchmarks/histo/src/visc/Makefile b/hpvm/test/parboil/benchmarks/histo/src/visc/Makefile
deleted file mode 100644
index bf10f546f0..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/visc/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=util.ll
-VISC_OBJS = main.visc.ll
-APP_CFLAGS = -O3
-APP_CXXFLAGS = -O3
-
diff --git a/hpvm/test/parboil/benchmarks/histo/src/visc/bmp.h b/hpvm/test/parboil/benchmarks/histo/src/visc/bmp.h
deleted file mode 100644
index d1b7c1b562..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/visc/bmp.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "stdlib.h"
-
-typedef struct{
-  unsigned char B;
-  unsigned char G;
-  unsigned char R;
-} RGB;
-
-typedef struct {
-  unsigned int filesz;
-  unsigned short creator1;
-  unsigned short creator2;
-  unsigned int bmp_offset;
-} bmpfile_header_t;
-
-typedef struct {
-  unsigned int header_sz;
-  unsigned int width;
-  unsigned int height;
-  unsigned short nplanes;
-  unsigned short bitspp;
-  unsigned int compress_type;
-  unsigned int bmp_bytesz;
-  unsigned int hres;
-  unsigned int vres;
-  unsigned int ncolors;
-  unsigned int nimpcolors;
-} bmp_dib_header_t;
-
-typedef enum {
-  BI_RGB = 0,
-  BI_RLE8,
-  BI_RLE4,
-  BI_BITFIELDS,
-  BI_JPEG,
-  BI_PNG,
-} bmp_compression_method_t;
-
-typedef struct{
-  unsigned char magic[2];
-  bmpfile_header_t file_header;
-  bmp_dib_header_t dib_header;
-  unsigned int* palette;
-  void* pixel_map;
-} bmp_image;
-
-void create_bmp(RGB* bitmap, int height, int width, const char* filename){
-    bmp_image image;
-
-    int padded_width = 4*(((width*24)+31)/32);
-    padded_width -= width*sizeof(RGB);
-
-    char* pad = (char*) calloc (padded_width, sizeof(char));
-
-    image.magic[0]='B';
-    image.magic[1]='M';
-
-    image.file_header.filesz = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t) + height*width*sizeof(RGB);
-    image.file_header.creator1 = image.file_header.creator2 = 0;
-    image.file_header.bmp_offset = 2*sizeof(char) + sizeof(bmpfile_header_t) + sizeof(bmp_dib_header_t);
-
-    image.dib_header.header_sz = 40;//sizeof(bmp_dib_header_t);
-    image.dib_header.width = width;
-    image.dib_header.height = height;
-    image.dib_header.nplanes = 1;
-    image.dib_header.bitspp = 24;
-    image.dib_header.compress_type = 0;
-    image.dib_header.bmp_bytesz = width*height*sizeof(RGB);
-    image.dib_header.hres = 0;
-    image.dib_header.vres = 0;
-    image.dib_header.ncolors = 0;
-    image.dib_header.nimpcolors = 0;
-
-    FILE* out_file = fopen(filename,"wb");
-
-    fwrite(image.magic,sizeof(char),2,out_file);
-    fwrite(&(image.file_header),sizeof(char),sizeof(bmpfile_header_t),out_file);
-    fwrite(&(image.dib_header),sizeof(char),sizeof(bmp_dib_header_t),out_file);
-
-    int h;
-    for (h = height-1; h >= 0; h--){
-      fwrite(&bitmap[h*width],sizeof(RGB),width,out_file);
-      fwrite(pad,sizeof(char),padded_width,out_file);
-    }
-
-    fclose(out_file);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/histo/src/visc/main.cpp
deleted file mode 100644
index 516efc9d13..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/visc/main.cpp
+++ /dev/null
@@ -1,1275 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <visc.h>
-#include "util.h"
-
-#define BLOCK_X             14
-
-#define PRESCAN_THREADS     512
-#define PRESCAN_BLOCKS_X    64
-
-#define UNROLL              16
-
-#define KB                  24
-#define BINS_PER_BLOCK      ((KB)*1024)
-
-#define min(x,y)            ((x) < (y))? (x) : (y)
-
-#define debug
-//#undef debug
-
-/******************************************************************************
-* Implementation: GPU
-* Details:
-* in the GPU implementation of histogram, we begin by computing the span of the
-* input values into the histogram. Then the histogramming computation is carried
-* out by a (BLOCK_X, BLOCK_Y) sized grid, where every group of Y (same X)
-* computes its own partial histogram for a part of the input, and every Y in the
-* group exclusively writes to a portion of the span computed in the beginning.
-* Finally, a reduction is performed to combine all the partial histograms into
-* the final result.
-******************************************************************************/
-// Prescan
-
-void PrescanAllocation (int block) {
-    // Memory shared between threadblocks
-    size_t bytes_Avg = sizeof(float)*PRESCAN_THREADS;
-    void* Avg  = __visc__malloc(bytes_Avg);
-
-    size_t bytes_StdDev = sizeof(float)*PRESCAN_THREADS;
-    void* StdDev = __visc__malloc(bytes_StdDev);
-    __visc__return(Avg, bytes_Avg, StdDev, bytes_StdDev);
-}
-
-void PrescanLeaf (unsigned int* input, size_t bytes_input,
-                  int size,
-                  unsigned int* minmax, size_t bytes_minmax,
-                  // local memory arguments
-                  float* Avg, size_t bytes_Avg,
-                  float* StdDev, size_t bytes_StdDev) {
-
-
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(2, input, minmax, 1, minmax);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    int gdimx = __visc__getNumNodeInstances_x(parentNode);
-
-    int threadIdxx = lx;
-    int blockDimx = dimx;
-    int blockIdxx = gx;
-    int stride = size/gdimx;
-    int addr = blockIdxx*stride+threadIdxx;
-    int end = blockIdxx*stride + stride/8; // Only sample 1/8th of the input data
-
-    // Compute the average per thread
-    float avg = 0.0;
-    unsigned int count = 0;
-    while (addr < end) {
-        avg += input[addr];
-        count++;
-        addr += blockDimx;
-    }
-    avg /= count;
-    Avg[threadIdxx] = avg;
-
-    // Compute the standard deviation per thread
-    int addr2 = blockIdxx*stride+threadIdxx;
-    float stddev = 0;
-    while (addr2 < end) {
-        stddev += (input[addr2]-avg)*(input[addr2]-avg);
-        addr2 += blockDimx;
-    }
-    stddev /= count;
-    StdDev[threadIdxx] = __visc__sqrt(stddev);
-
-#define SUM(stride__)\
-if(threadIdxx < stride__){\
-    Avg[threadIdxx] += Avg[threadIdxx+stride__];\
-    StdDev[threadIdxx] += StdDev[threadIdxx+stride__];\
-}
-
-    // Add all the averages and standard deviations from all the threads
-    // and take their arithmetic average (as a simplified approximation of the
-    // real average and standard deviation.
-#if (PRESCAN_THREADS >= 32)
-    for (int stride = PRESCAN_THREADS/2; stride >= 32; stride = stride >> 1) {
-        __visc__barrier();
-        SUM(stride);
-    }
-#endif
-#if (PRESCAN_THREADS >= 16)
-    SUM(16);
-#endif
-#if (PRESCAN_THREADS >= 8)
-    SUM(8);
-#endif
-#if (PRESCAN_THREADS >= 4)
-    SUM(4);
-#endif
-#if (PRESCAN_THREADS >= 2)
-    SUM(2);
-#endif
-
-    if (threadIdxx == 0) {
-        float avg = Avg[0]+Avg[1];
-        avg /= PRESCAN_THREADS;
-        float stddev = StdDev[0]+StdDev[1];
-        stddev /= PRESCAN_THREADS;
-
-        // Take the maximum and minimum range from all the blocks. This will
-        // be the final answer. The standard deviation is taken out to 10 sigma
-        // away from the average. The value 10 was obtained empirically.
-        __visc__atomic_umin((int*)minmax,((unsigned int)(avg-10*stddev))/(KB*1024));
-        __visc__atomic_umax((int*)minmax+1,((unsigned int)(avg+10*stddev))/(KB*1024));
-    }
-}
-
-void PrescanBlock (unsigned int* input, size_t bytes_input,
-                   int size,
-                   unsigned int* minmax, size_t bytes_minmax,
-                   int block) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(2, input, minmax, 1, minmax);
-
-    void* AllocationNode = __visc__createNode(PrescanAllocation);
-    void* PrescanLeafNode = __visc__createNode1D(PrescanLeaf, block);
-
-    // Bind Inputs
-    __visc__bindIn(AllocationNode, 5, 0, 0);  // Bind block
-
-    __visc__bindIn(PrescanLeafNode, 0, 0, 0); // Bind input
-    __visc__bindIn(PrescanLeafNode, 1, 1, 0); // Bind bytes_input
-    __visc__bindIn(PrescanLeafNode, 2, 2, 0); // Bind size
-    __visc__bindIn(PrescanLeafNode, 3, 3, 0); // Bind minmax
-    __visc__bindIn(PrescanLeafNode, 4, 4, 0); // Bind bytes_minmax
-
-    // Create Edges
-    __visc__edge(AllocationNode, PrescanLeafNode, 1, 0, 5, 0); // Edge Avg
-    __visc__edge(AllocationNode, PrescanLeafNode, 1, 1, 6, 0); // Edge bytes_Avg
-    __visc__edge(AllocationNode, PrescanLeafNode, 1, 2, 7, 0); // Edge StdDev
-    __visc__edge(AllocationNode, PrescanLeafNode, 1, 3, 8, 0); // Edge bytes_StdDev
-
-}
-
-void PrescanRoot (unsigned int* input, size_t bytes_input,
-                  int size,
-                  unsigned int* minmax, size_t bytes_minmax,
-                  int block,
-                  int grid) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(2, input, minmax, 1, minmax);
-    //__visc__attributes(1, input, 1, minmax);
-
-    void* PrescanBlockNode = __visc__createNode1D(PrescanBlock, grid);
-
-    // Bind Inputs
-    __visc__bindIn(PrescanBlockNode, 0, 0, 0); // Bind input
-    __visc__bindIn(PrescanBlockNode, 1, 1, 0); // Bind bytes_input
-    __visc__bindIn(PrescanBlockNode, 2, 2, 0); // Bind size
-    __visc__bindIn(PrescanBlockNode, 3, 3, 0); // Bind minmax
-    __visc__bindIn(PrescanBlockNode, 4, 4, 0); // Bind bytes_minmax
-    __visc__bindIn(PrescanBlockNode, 5, 5, 0); // Bind block
-
-}
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int* input;
-    size_t bytes_input;
-    int size;
-    unsigned int* minmax;
-    size_t bytes_minmax;
-    int block;
-    int grid;
-}
-PrescanRootIn;
-
-void PrescanPackData( PrescanRootIn* args,
-                      unsigned int* input, size_t bytes_input,
-                      int size,
-                      unsigned int* minmax, size_t bytes_minmax,
-                      int block,
-                      int grid
-                    ) {
-    args->input = input;
-    args->bytes_input = bytes_input;
-    args->size = size;
-    args->minmax = minmax;
-    args->bytes_minmax = bytes_minmax;
-    args->block = block;
-    args->grid = grid;
-}
-
-// Intermediates
-typedef struct __attribute__((__packed__)) {
-    unsigned char x;
-    unsigned char y;
-    unsigned char z;
-    unsigned char w;
-}
-uchar4;
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int x;
-    unsigned int y;
-}
-uint2;
-
-__attribute__((always_inline))
-static inline void calculateBin (unsigned int bin, uchar4 *sm_mapping) {
-
-    unsigned char offset  =  bin        %   4;
-    unsigned char indexlo = (bin >>  2) % 256;
-    unsigned char indexhi = (bin >> 10) %  KB;
-    unsigned char block   =  bin / BINS_PER_BLOCK;
-
-    offset *= 8;
-
-    uchar4 sm;
-    sm.x = block;
-    sm.y = indexhi;
-    sm.z = indexlo;
-    sm.w = offset;
-
-    *sm_mapping = sm;
-}
-
-void IntermediatesLeaf( uint2 *input, size_t bytes_input,
-                        unsigned int height,
-                        unsigned int width,
-                        unsigned int input_pitch,
-                        uchar4 *sm_mappings, size_t bytes_sm_mappings) {
-
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(1, input, 1, sm_mappings);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-
-    int threadIdxx = lx;
-    int blockDimx = dimx;
-    unsigned int line = UNROLL * (gx);// 16 is the unroll factor;
-
-    uint2 *load_bin = input + line * input_pitch + threadIdxx;
-
-    unsigned int store = line * width + threadIdxx;
-    bool skip = (width % 2) && (threadIdxx == (blockDimx - 1));
-
-#pragma unroll
-    for (int i = 0; i < UNROLL; i++)
-    {
-        uint2 bin_value = *load_bin;
-
-        calculateBin (
-            bin_value.x,
-            &sm_mappings[store]
-        );
-
-        if (!skip) calculateBin (
-                bin_value.y,
-                &sm_mappings[store + blockDimx]
-            );
-
-        load_bin += input_pitch;
-        store += width;
-    }
-}
-
-void IntermediatesBlock(uint2 *input, size_t bytes_input,
-                        unsigned int height,
-                        unsigned int width,
-                        unsigned int input_pitch,
-                        uchar4 *sm_mappings, size_t bytes_sm_mappings,
-                        int block) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(1, input, 1, sm_mappings);
-
-    void* IntermediatesLeafNode = __visc__createNode1D(IntermediatesLeaf, block);
-
-    // Bind Inputs
-    __visc__bindIn(IntermediatesLeafNode, 0, 0, 0); // Bind Input
-    __visc__bindIn(IntermediatesLeafNode, 1, 1, 0); // Bind bytes_input
-    __visc__bindIn(IntermediatesLeafNode, 2, 2, 0); // Bind height
-    __visc__bindIn(IntermediatesLeafNode, 3, 3, 0); // Bind width
-    __visc__bindIn(IntermediatesLeafNode, 4, 4, 0); // Bind input_pitch
-    __visc__bindIn(IntermediatesLeafNode, 5, 5, 0); // Bind sm_mappings
-    __visc__bindIn(IntermediatesLeafNode, 6, 6, 0); // Bind bytes_sm_mappings
-}
-
-void IntermediatesRoot( uint2 *input, size_t bytes_input,
-                        unsigned int height,
-                        unsigned int width,
-                        unsigned int input_pitch,
-                        uchar4 *sm_mappings, size_t bytes_sm_mappings,
-                        int block,
-                        int grid) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(1, input, 1, sm_mappings);
-
-    void* IntermediatesBlockNode = __visc__createNode1D(IntermediatesBlock, grid);
-
-    // Bind Inputs
-    __visc__bindIn(IntermediatesBlockNode, 0, 0, 0); // Bind Input
-    __visc__bindIn(IntermediatesBlockNode, 1, 1, 0); // Bind bytes_input
-    __visc__bindIn(IntermediatesBlockNode, 2, 2, 0); // Bind height
-    __visc__bindIn(IntermediatesBlockNode, 3, 3, 0); // Bind width
-    __visc__bindIn(IntermediatesBlockNode, 4, 4, 0); // Bind input_pitch
-    __visc__bindIn(IntermediatesBlockNode, 5, 5, 0); // Bind sm_mappings
-    __visc__bindIn(IntermediatesBlockNode, 6, 6, 0); // Bind bytes_sm_mappings
-    __visc__bindIn(IntermediatesBlockNode, 7, 7, 0); // Bind block
-}
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int* input;
-    size_t bytes_input;
-    unsigned height;
-    unsigned width;
-    unsigned input_pitch;
-    unsigned char* sm_mappings;
-    size_t bytes_sm_mappings;
-    int block;
-    int grid;
-}
-IntermediatesRootIn;
-
-void IntermediatesPackData( IntermediatesRootIn* args,
-                            unsigned int* input, size_t bytes_input,
-                            unsigned height,
-                            unsigned width,
-                            unsigned input_pitch,
-                            unsigned char* sm_mappings,
-                            size_t bytes_sm_mappings,
-                            int block,
-                            int grid
-                          ) {
-    args->input = input;
-    args->bytes_input = bytes_input;
-    args->height = height;
-    args->width = width;
-    args->input_pitch = input_pitch;
-    args->sm_mappings = sm_mappings;
-    args->bytes_sm_mappings = bytes_sm_mappings;
-    args->block = block;
-    args->grid = grid;
-}
-
-// Histo Main
-
-__attribute__((always_inline))
-static inline void testIncrementGlobal (
-    unsigned int *global_histo,
-    unsigned int sm_range_min,
-    unsigned int sm_range_max,
-    const uchar4 sm)
-{
-    const unsigned int range = sm.x;
-    const unsigned int indexhi = sm.y;
-    const unsigned int indexlo = sm.z;
-    const unsigned int offset  = sm.w;
-
-    /* Scan for inputs that are outside the central region of histogram */
-    if (range < sm_range_min || range > sm_range_max)
-    {
-        const unsigned int bin = range * BINS_PER_BLOCK + offset / 8 + (indexlo << 2) + (indexhi << 10);
-        const unsigned int bin_div2 = bin / 2;
-        const unsigned int bin_offset = (bin % 2 == 1) ? 16 : 0;
-
-        unsigned int old_val = global_histo[bin_div2];
-        unsigned short old_bin = (old_val >> bin_offset) & 0xFFFF;
-
-        if (old_bin < 255)
-        {
-            __visc__atomic_add ((int*)&global_histo[bin_div2], 1 << bin_offset);
-        }
-    }
-}
-
-#define smem(x,y) smem[(x)*256+(y)]
-
-__attribute__((always_inline))
-static inline void testIncrementLocal (
-    unsigned int *global_overflow,
-    unsigned int* smem,
-    const unsigned int myRange,
-    const uchar4 sm)
-{
-    const unsigned int range = sm.x;
-    const unsigned int indexhi = sm.y;
-    const unsigned int indexlo = sm.z;
-    const unsigned int offset  = sm.w;
-
-    /* Scan for inputs that are inside the central region of histogram */
-    if (range == myRange)
-    {
-        /* Atomically increment shared memory */
-        unsigned int add = (unsigned int)(1 << offset);
-        unsigned int prev = __visc__atomic_add ((int*)&smem(indexhi, indexlo), add);
-
-        /* Check if current bin overflowed */
-        unsigned int prev_bin_val = (prev >> offset) & 0x000000FF;
-
-        /* If there was an overflow, record it and record if it cascaded into other bins */
-        if (prev_bin_val == 0x000000FF)
-        {
-            const unsigned int bin =
-                range * BINS_PER_BLOCK +
-                offset / 8 + (indexlo << 2) + (indexhi << 10);
-
-            bool can_overflow_to_bin_plus_1 = (offset < 24) ? true : false;
-            bool can_overflow_to_bin_plus_2 = (offset < 16) ? true : false;
-            bool can_overflow_to_bin_plus_3 = (offset <  8) ? true : false;
-
-            bool overflow_into_bin_plus_1 = false;
-            bool overflow_into_bin_plus_2 = false;
-            bool overflow_into_bin_plus_3 = false;
-
-            unsigned int prev_bin_plus_1_val = (prev >> (offset +  8)) & 0x000000FF;
-            unsigned int prev_bin_plus_2_val = (prev >> (offset + 16)) & 0x000000FF;
-            unsigned int prev_bin_plus_3_val = (prev >> (offset + 24)) & 0x000000FF;
-
-            if (can_overflow_to_bin_plus_1 &&        prev_bin_val == 0x000000FF) overflow_into_bin_plus_1 = true;
-            if (can_overflow_to_bin_plus_2 && prev_bin_plus_1_val == 0x000000FF) overflow_into_bin_plus_2 = true;
-            if (can_overflow_to_bin_plus_3 && prev_bin_plus_2_val == 0x000000FF) overflow_into_bin_plus_3 = true;
-
-            unsigned int bin_plus_1_add;
-            unsigned int bin_plus_2_add;
-            unsigned int bin_plus_3_add;
-
-            if (overflow_into_bin_plus_1) bin_plus_1_add = (prev_bin_plus_1_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-            if (overflow_into_bin_plus_2) bin_plus_2_add = (prev_bin_plus_2_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-            if (overflow_into_bin_plus_3) bin_plus_3_add = (prev_bin_plus_3_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-
-            __visc__atomic_add ((int*)&global_overflow[bin],   256);
-            if (overflow_into_bin_plus_1) __visc__atomic_add ((int*)&global_overflow[bin+1], bin_plus_1_add);
-            if (overflow_into_bin_plus_2) __visc__atomic_add ((int*)&global_overflow[bin+2], bin_plus_2_add);
-            if (overflow_into_bin_plus_3) __visc__atomic_add ((int*)&global_overflow[bin+3], bin_plus_3_add);
-        }
-    }
-}
-
-__attribute__((always_inline))
-static inline void clearMemory (unsigned int* smem, int lx, int dimx)
-{
-    for (int i = lx, blockDimx = dimx; i < BINS_PER_BLOCK / 4; i += blockDimx)
-    {
-        ((unsigned int*)smem)[i] = 0;
-    }
-}
-
-__attribute__((always_inline))
-static inline void copyMemory (unsigned int *dst, unsigned int* src, int lx, int dimx)
-{
-    for (int i = lx, blockDimx = dimx; i < BINS_PER_BLOCK/4; i += blockDimx)
-    {
-        dst[i] = ((unsigned int*)src)[i];
-    }
-}
-
-void MainAllocation (int block) {
-    // Memory shared between threadblocks
-    size_t bytes_sub_histo = sizeof(unsigned int) * KB * 256;
-    void* sub_histo  = __visc__malloc(bytes_sub_histo);
-
-    __visc__return(sub_histo, bytes_sub_histo);
-}
-
-void MainLeaf ( uchar4* sm_mappings, size_t bytes_sm_mappings,
-                unsigned int num_elements,
-                unsigned int sm_range_min,
-                unsigned int sm_range_max,
-                unsigned int histo_height,
-                unsigned int histo_width,
-                unsigned int* global_subhisto, size_t bytes_global_subhisto,
-                unsigned int* global_histo, size_t bytes_global_histo,
-                unsigned int* global_overflow, size_t bytes_global_overflow,
-                // local memory
-                unsigned int* sub_histo, size_t bytes_sub_histo) {
-    /* Most optimal solution uses 24 * 1024 bins per threadblock */
-    //__local unsigned int sub_histo[KB][256];
-
-    /* Each threadblock contributes to a specific 24KB range of histogram,
-     * and also scans every N-th line for interesting data.  N = gridDim.x
-     */
-
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(1, sm_mappings, 3, global_subhisto, global_histo, global_overflow);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    int gdimx = __visc__getNumNodeInstances_x(parentNode);
-
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    unsigned int blockDimx = dimx;
-    unsigned int gridDimx = gdimx;
-    unsigned int local_scan_range = sm_range_min + gy;
-    unsigned int local_scan_load = gx * blockDimx + lx;
-
-    clearMemory (sub_histo, lx, dimx);
-    __visc__barrier();
-
-    if (gy == 0)
-    {
-        /* Loop through and scan the input */
-        while (local_scan_load < num_elements)
-        {
-            /* Read buffer */
-            uchar4 sm = sm_mappings[local_scan_load];
-            local_scan_load += blockDimx * gridDimx;
-
-            /* Check input */
-            testIncrementLocal (
-                global_overflow,
-                sub_histo,
-                local_scan_range,
-                sm
-            );
-            testIncrementGlobal (
-                global_histo,
-                sm_range_min,
-                sm_range_max,
-                sm
-            );
-        }
-    }
-    else
-    {
-        /* Loop through and scan the input */
-        while (local_scan_load < num_elements)
-        {
-            /* Read buffer */
-            uchar4 sm = sm_mappings[local_scan_load];
-            local_scan_load += blockDimx * gridDimx;
-
-            /* Check input */
-            testIncrementLocal (
-                global_overflow,
-                sub_histo,
-                local_scan_range,
-                sm
-            );
-        }
-    }
-
-    /* Store sub histogram to global memory */
-    unsigned int store_index = gx * (histo_height * histo_width / 4) + (local_scan_range * BINS_PER_BLOCK / 4);//(local_scan_range * BINS_PER_BLOCK);
-
-    __visc__barrier();
-    copyMemory (&(global_subhisto[store_index]), sub_histo, lx, dimx);
-}
-
-void MainBlock (uchar4* sm_mappings, size_t bytes_sm_mappings,
-                unsigned int num_elements,
-                unsigned int sm_range_min,
-                unsigned int sm_range_max,
-                unsigned int histo_height,
-                unsigned int histo_width,
-                unsigned int* global_subhisto, size_t bytes_global_subhisto,
-                unsigned int* global_histo, size_t bytes_global_histo,
-                unsigned int* global_overflow, size_t bytes_global_overflow,
-                int blockx, int blocky,
-                int gridx, int gridy) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(1, sm_mappings, 3, global_subhisto, global_histo, global_overflow);
-
-    void* AllocationNode = __visc__createNode(MainAllocation);
-    void* MainLeafNode = __visc__createNode2D(MainLeaf, blockx, blocky);
-
-    // Bind Inputs
-    __visc__bindIn(AllocationNode, 5, 0, 0);  // Bind block
-
-    __visc__bindIn(MainLeafNode, 0, 0, 0); // Bind sm_mappings
-    __visc__bindIn(MainLeafNode, 1, 1, 0); // Bind bytes_sm_mappings
-    __visc__bindIn(MainLeafNode, 2, 2, 0); // Bind num_elements
-    __visc__bindIn(MainLeafNode, 3, 3, 0); // Bind sm_range_min
-    __visc__bindIn(MainLeafNode, 4, 4, 0); // Bind sm_range_max
-    __visc__bindIn(MainLeafNode, 5, 5, 0); // Bind histo_height
-    __visc__bindIn(MainLeafNode, 6, 6, 0); // Bind histo_width
-    __visc__bindIn(MainLeafNode, 7, 7, 0); // Bind global_subhisto
-    __visc__bindIn(MainLeafNode, 8, 8, 0); // Bind bytes_global_subhisto
-    __visc__bindIn(MainLeafNode, 9, 9, 0); // Bind global_histo
-    __visc__bindIn(MainLeafNode, 10, 10, 0); // Bind bytes_global_histo
-    __visc__bindIn(MainLeafNode, 11, 11, 0); // Bind overflow
-    __visc__bindIn(MainLeafNode, 12, 12, 0); // Bind bytes_overflow
-
-    // Create Edges
-    __visc__edge(AllocationNode, MainLeafNode, 1, 0, 13, 0); // Edge sub_histo
-    __visc__edge(AllocationNode, MainLeafNode, 1, 1, 14, 0); // Edge bytes_sub_histo
-
-}
-
-void MainRoot (uchar4* sm_mappings, size_t bytes_sm_mappings,
-               unsigned int num_elements,
-               unsigned int sm_range_min,
-               unsigned int sm_range_max,
-               unsigned int histo_height,
-               unsigned int histo_width,
-               unsigned int* global_subhisto, size_t bytes_global_subhisto,
-               unsigned int* global_histo, size_t bytes_global_histo,
-               unsigned int* global_overflow, size_t bytes_global_overflow,
-               int blockx, int blocky,
-               int gridx, int gridy) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(1, sm_mappings, 3, global_subhisto, global_histo, global_overflow);
-
-    void* MainBlockNode = __visc__createNode2D(MainBlock, gridx, gridy);
-
-    // Bind Inputs
-    __visc__bindIn(MainBlockNode, 0, 0, 0); // Bind sm_mappings
-    __visc__bindIn(MainBlockNode, 1, 1, 0); // Bind bytes_sm_mappings
-    __visc__bindIn(MainBlockNode, 2, 2, 0); // Bind num_elements
-    __visc__bindIn(MainBlockNode, 3, 3, 0); // Bind sm_range_min
-    __visc__bindIn(MainBlockNode, 4, 4, 0); // Bind sm_range_max
-    __visc__bindIn(MainBlockNode, 5, 5, 0); // Bind histo_height
-    __visc__bindIn(MainBlockNode, 6, 6, 0); // Bind histo_width
-    __visc__bindIn(MainBlockNode, 7, 7, 0); // Bind global_subhisto
-    __visc__bindIn(MainBlockNode, 8, 8, 0); // Bind bytes_global_subhisto
-    __visc__bindIn(MainBlockNode, 9, 9, 0); // Bind global_histo
-    __visc__bindIn(MainBlockNode, 10, 10, 0); // Bind bytes_global_histo
-    __visc__bindIn(MainBlockNode, 11, 11, 0); // Bind overflow
-    __visc__bindIn(MainBlockNode, 12, 12, 0); // Bind bytes_overflow
-    __visc__bindIn(MainBlockNode, 13, 13, 0); // Bind blockx
-    __visc__bindIn(MainBlockNode, 14, 14, 0); // Bind blocky
-}
-
-typedef struct __attribute__((__packed__)) {
-    unsigned char* sm_mappings;
-    size_t bytes_sm_mappings;
-    unsigned int num_elements;
-    unsigned int sm_range_min;
-    unsigned int sm_range_max;
-    unsigned int histo_height;
-    unsigned int histo_width;
-    unsigned int* global_subhisto;
-    size_t bytes_global_subhisto;
-    unsigned int* global_histo;
-    size_t bytes_global_histo;
-    unsigned int* global_overflow;
-    size_t bytes_global_overflow;
-    int blockx;
-    int blocky;
-    int gridx;
-    int gridy;
-}
-MainRootIn;
-
-
-void MainPackData(MainRootIn* args,
-                  unsigned char* sm_mappings, size_t bytes_sm_mappings,
-                  unsigned int num_elements,
-                  unsigned int sm_range_min,
-                  unsigned int sm_range_max,
-                  unsigned int histo_height,
-                  unsigned int histo_width,
-                  unsigned int* global_subhisto, size_t bytes_global_subhisto,
-                  unsigned int* global_histo, size_t bytes_global_histo,
-                  unsigned int* global_overflow, size_t bytes_global_overflow,
-                  int blockx, int blocky,
-                  int gridx, int gridy
-                 ) {
-    args->sm_mappings = sm_mappings;
-    args->bytes_sm_mappings = bytes_sm_mappings;
-    args->num_elements = num_elements;
-    args->sm_range_min = sm_range_min;
-    args->sm_range_max = sm_range_max;
-    args->histo_height = histo_height;
-    args->histo_width = histo_width;
-    args->global_subhisto = global_subhisto;
-    args->bytes_global_subhisto = bytes_global_subhisto;
-    args->global_histo = global_histo;
-    args->bytes_global_histo = bytes_global_histo;
-    args->global_overflow = global_overflow;
-    args->bytes_global_overflow = bytes_global_overflow;
-    args->blockx = blockx;
-    args->blocky = blocky;
-    args->gridx = gridx;
-    args->gridy = gridy;
-}
-
-// Final
-typedef struct __attribute__((__packed__)) {
-    unsigned short x;
-    unsigned short y;
-    unsigned short z;
-    unsigned short w;
-}
-ushort4;
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int x;
-    unsigned int y;
-    unsigned int z;
-    unsigned int w;
-}
-uint4;
-/* Combine all the sub-histogram results into one final histogram */
-void FinalLeaf (
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto, size_t bytes_global_subhisto,
-    unsigned int *global_histo, size_t bytes_global_histo,
-    unsigned int *global_overflow, size_t bytes_global_overflow,
-    unsigned int *final_histo, size_t bytes_final_histo) //final output
-{
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(3, global_subhisto, global_histo, global_overflow, 1, final_histo);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    int gdimx = __visc__getNumNodeInstances_x(parentNode);
-
-
-  
-    unsigned int blockDimx = dimx;
-    unsigned int gridDimx = gdimx;
-    unsigned int start_offset = lx + gx * blockDimx;
-    const ushort4 zero_short  = {0, 0, 0, 0};
-    const uint4 zero_int      = {0, 0, 0, 0};
-
-    unsigned int size_low_histo = sm_range_min * BINS_PER_BLOCK;
-    unsigned int size_mid_histo = (sm_range_max - sm_range_min +1) * BINS_PER_BLOCK;
-
-    /* Clear lower region of global histogram */
-    for (unsigned int i = start_offset; i < size_low_histo/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((ushort4*)global_histo)[i];
-        ((ushort4*)global_histo)[i] = zero_short;
-        
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) {
-            (unsigned char) global_histo_data.x,
-            (unsigned char) global_histo_data.y,
-            (unsigned char) global_histo_data.z,
-            (unsigned char) global_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the middle region of the overflow buffer */
-    for (unsigned int i = (size_low_histo/4) + start_offset; i < (size_low_histo+size_mid_histo)/4; i += gridDimx * blockDimx)
-    {
-        uint4 global_histo_data = ((uint4*)global_overflow)[i];
-        //((uint4*)global_overflow)[i] = zero_int;
-        global_overflow[i*4] = 0;
-        global_overflow[i*4+1] = 0;
-        global_overflow[i*4+2] = 0;
-        global_overflow[i*4+3] = 0;
-
-        uint4 internal_histo_data = (uint4){
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        };
-
-        #pragma unroll
-        for (int j = 0; j < BLOCK_X; j++)
-        {
-            unsigned int bin4in = ((unsigned int*)global_subhisto)[i + j * histo_height * histo_width / 4];
-            internal_histo_data.x += (bin4in >>  0) & 0xFF;
-            internal_histo_data.y += (bin4in >>  8) & 0xFF;
-            internal_histo_data.z += (bin4in >> 16) & 0xFF;
-            internal_histo_data.w += (bin4in >> 24) & 0xFF;
-        }
-
-        internal_histo_data.x = min (internal_histo_data.x, (uint) 255);
-        internal_histo_data.y = min (internal_histo_data.y, (uint) 255);
-        internal_histo_data.z = min (internal_histo_data.z, (uint) 255);
-        internal_histo_data.w = min (internal_histo_data.w, (uint) 255);
-
-        uchar4 final_histo_data = (uchar4) {
-            (unsigned char) internal_histo_data.x,
-            (unsigned char) internal_histo_data.y,
-            (unsigned char) internal_histo_data.z,
-            (unsigned char) internal_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the upper region of global histogram */
-    for (unsigned int i = ((size_low_histo+size_mid_histo)/4) + start_offset; i < (histo_height*histo_width)/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((ushort4*)global_histo)[i];
-        ((ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) {
-            (unsigned char) global_histo_data.x,
-            (unsigned char) global_histo_data.y,
-            (unsigned char) global_histo_data.z,
-            (unsigned char) global_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-}
-
-void FinalBlock(
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto, size_t bytes_global_subhisto,
-    unsigned int *global_histo, size_t bytes_global_histo,
-    unsigned int *global_overflow, size_t bytes_global_overflow,
-    unsigned int *final_histo, size_t bytes_final_histo, //final output
-    int block) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(3, global_subhisto, global_histo, global_overflow, 1, final_histo);
-    
-    void* FinalLeafNode = __visc__createNode1D(FinalLeaf, block);
-
-    // Bind Inputs
-    __visc__bindIn(FinalLeafNode, 0, 0, 0); // Bind sm_range_min
-    __visc__bindIn(FinalLeafNode, 1, 1, 0); // Bind sm_range_max
-    __visc__bindIn(FinalLeafNode, 2, 2, 0); // Bind histo_height
-    __visc__bindIn(FinalLeafNode, 3, 3, 0); // Bind histo_width
-    __visc__bindIn(FinalLeafNode, 4, 4, 0); // Bind global_subhisto
-    __visc__bindIn(FinalLeafNode, 5, 5, 0); // Bind bytes_global_subhisto
-    __visc__bindIn(FinalLeafNode, 6, 6, 0); // Bind global_histo
-    __visc__bindIn(FinalLeafNode, 7, 7, 0); // Bind bytes_global_histo
-    __visc__bindIn(FinalLeafNode, 8, 8, 0); // Bind global_overflow
-    __visc__bindIn(FinalLeafNode, 9, 9, 0); // Bind bytes_global_overflow
-    __visc__bindIn(FinalLeafNode, 10, 10, 0); // Bind final_histo
-    __visc__bindIn(FinalLeafNode, 11, 11, 0); // Bind bytes_final_histo
-}
-
-void FinalRoot(
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto, size_t bytes_global_subhisto,
-    unsigned int *global_histo, size_t bytes_global_histo,
-    unsigned int *global_overflow, size_t bytes_global_overflow,
-    unsigned int *final_histo, size_t bytes_final_histo, //final output
-    int block,
-    int grid) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(3, global_subhisto, global_histo, global_overflow, 1, final_histo);
-    
-    void* FinalBlockNode = __visc__createNode1D(FinalBlock, grid);
-
-    // Bind Inputs
-    __visc__bindIn(FinalBlockNode, 0, 0, 0); // Bind sm_range_min
-    __visc__bindIn(FinalBlockNode, 1, 1, 0); // Bind sm_range_max
-    __visc__bindIn(FinalBlockNode, 2, 2, 0); // Bind histo_height
-    __visc__bindIn(FinalBlockNode, 3, 3, 0); // Bind histo_width
-    __visc__bindIn(FinalBlockNode, 4, 4, 0); // Bind global_subhisto
-    __visc__bindIn(FinalBlockNode, 5, 5, 0); // Bind bytes_global_subhisto
-    __visc__bindIn(FinalBlockNode, 6, 6, 0); // Bind global_histo
-    __visc__bindIn(FinalBlockNode, 7, 7, 0); // Bind bytes_global_histo
-    __visc__bindIn(FinalBlockNode, 8, 8, 0); // Bind global_overflow
-    __visc__bindIn(FinalBlockNode, 9, 9, 0); // Bind bytes_global_overflow
-    __visc__bindIn(FinalBlockNode, 10, 10, 0); // Bind final_histo
-    __visc__bindIn(FinalBlockNode, 11, 11, 0); // Bind bytes_final_histo
-    __visc__bindIn(FinalBlockNode, 12, 12, 0); // Bind block
-}
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int sm_range_min; 
-    unsigned int sm_range_max;
-    unsigned int histo_height; 
-    unsigned int histo_width;
-    unsigned int *global_subhisto;
-    size_t bytes_global_subhisto;
-    unsigned int *global_histo;
-    size_t bytes_global_histo;
-    unsigned int *global_overflow;
-    size_t bytes_global_overflow;
-    unsigned int *final_histo;
-    size_t bytes_final_histo; //final output
-    int block;
-    int grid;
-}
-FinalRootIn;
-
-void FinalPackData( FinalRootIn* args,
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto, size_t bytes_global_subhisto,
-    unsigned int *global_histo, size_t bytes_global_histo,
-    unsigned int *global_overflow, size_t bytes_global_overflow,
-    unsigned int *final_histo, size_t bytes_final_histo, //final output
-    int block,
-    int grid) {
-  args->sm_range_min = sm_range_min; 
-  args->sm_range_max = sm_range_max;
-  args->histo_height = histo_height; 
-  args->histo_width = histo_width;
-  args->global_subhisto = global_subhisto;
-  args->bytes_global_subhisto = bytes_global_subhisto;
-  args->global_histo = global_histo;
-  args->bytes_global_histo = bytes_global_histo;
-  args->global_overflow = global_overflow;
-  args->bytes_global_overflow = bytes_global_overflow;
-  args->final_histo = final_histo;
-  args->bytes_final_histo = bytes_final_histo; //final output
-  args->block = block;
-  args->grid = grid;
-}
-
-
-int main(int argc, char* argv[]) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    parameters = pb_ReadParameters(&argc, argv);
-    if (!parameters)
-        return -1;
-
-    if(!parameters->inpFiles[0]) {
-        fputs("Input file expected\n", stderr);
-        return -1;
-    }
-
-    char viscOverhead[] = "viscOverhead";
-    char prescans[] = "PreScanKernel";
-    char postpremems[] = "PostPreMems";
-    char memsets[] = "Memsets";
-    char intermediates[] = "IntermediatesKernel";
-    char mains[] = "MainKernel";
-    char finals[] = "FinalKernel";
-
-    int numIterations;
-    if (argc >= 2) {
-        numIterations = atoi(argv[1]);
-    } else {
-        fputs("Expected at least one command line argument\n", stderr);
-        return -1;
-    }
-
-    unsigned int img_width, img_height;
-    unsigned int histo_width, histo_height;
-    unsigned int nThreads;
-
-    FILE* f = fopen(parameters->inpFiles[0],"rb");
-    int result = 0;
-
-    result += fread(&img_width,    sizeof(unsigned int), 1, f);
-    result += fread(&img_height,   sizeof(unsigned int), 1, f);
-    result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-    result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-    if (result != 4) {
-        fputs("Error reading input and output dimensions from file\n", stderr);
-        return -1;
-    }
-
-    unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-    unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-
-    result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-    fclose(f);
-
-    if (result != img_width*img_height) {
-        fputs("Error reading input array from file\n", stderr);
-        return -1;
-    }
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-    pb_AddSubTimer(&timers, viscOverhead, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, prescans, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, postpremems, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, memsets, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, intermediates, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, mains, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, finals, visc_TimerID_COMPUTATION);
-
-
-    void* histo_prescan_dfg;
-    void* histo_intermediates_dfg;
-    void* histo_main_dfg;
-    void* histo_final_kernel;
-
-    int even_width = ((img_width+1)/2)*2;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    size_t bytes_input = even_width*(((img_height+UNROLL-1)/UNROLL)*UNROLL)*sizeof(unsigned int);
-    size_t bytes_ranges = 2*sizeof(unsigned int);
-    size_t bytes_sm_mappings = img_width*img_height*4*sizeof(unsigned char);
-    size_t bytes_global_subhisto = img_width*histo_height*sizeof(unsigned int);
-    size_t bytes_global_histo = img_width*histo_height*sizeof(unsigned short);
-    size_t bytes_global_overflow = img_width*histo_height*sizeof(unsigned int);
-    size_t bytes_final_histo = img_width*histo_height*sizeof(unsigned char);
-
-    unsigned* input = (unsigned*) malloc(bytes_input);
-    unsigned* ranges = (unsigned*) malloc(bytes_ranges);
-    unsigned char* sm_mappings = (unsigned char*) malloc(bytes_sm_mappings);
-    unsigned int* global_subhisto = (unsigned int*) malloc(bytes_global_subhisto);
-    unsigned short* global_histo = (unsigned short*) malloc(bytes_global_histo);
-    unsigned int* global_overflow = (unsigned int*) malloc(bytes_global_overflow);
-    unsigned char* final_histo = (unsigned char*) malloc(bytes_final_histo);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-    llvm_visc_track_mem(input, bytes_input);
-    llvm_visc_track_mem(ranges, bytes_ranges);
-    llvm_visc_track_mem(sm_mappings, bytes_sm_mappings);
-    llvm_visc_track_mem(global_histo, bytes_global_histo);
-    llvm_visc_track_mem(global_overflow, bytes_global_overflow);
-    llvm_visc_track_mem(final_histo, bytes_final_histo);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    switch (KB) {
-    case 48:
-        nThreads = 1024;
-        break;
-    case 24:
-        nThreads = 768;
-        break;
-    default:
-        nThreads = 512;
-        break;
-    }
-
-
-    // FIXME: How to do padding without these many copies
-    for (int y=0; y < img_height; y++) {
-        memcpy(&input[y*even_width], &img[y*img_width], img_width*sizeof(unsigned int));
-    }
-
-    int prescan_localWS = PRESCAN_THREADS;
-    int prescan_globalWS = PRESCAN_BLOCKS_X;
-    int inter_localWS = (img_width+1)/2;
-    int inter_globalWS = ((img_height + UNROLL-1)/UNROLL);
-    int main_localWS[2] = {(int)nThreads, 1};
-    int main_globalWS[2];
-    main_globalWS[0] = BLOCK_X;
-    int final_localWS = 512;
-    int final_globalWS = BLOCK_X*3;
-
-    pb_SwitchToTimer(&timers, visc_TimerID_ARG_PACK);
-
-    unsigned int img_dim = img_height*img_width;
-    PrescanRootIn* argsP = (PrescanRootIn*) malloc(sizeof(PrescanRootIn));
-    PrescanPackData(argsP,
-                    input, bytes_input,
-                    img_dim,
-                    ranges, bytes_ranges,
-                    prescan_localWS, prescan_globalWS);
-
-    unsigned int half_width = (img_width+1)/2;
-
-    IntermediatesRootIn* argsI = (IntermediatesRootIn*) malloc(sizeof(IntermediatesRootIn));
-    IntermediatesPackData(argsI,
-                          input, bytes_input,
-                          img_height,
-                          img_width,
-                          half_width,
-                          sm_mappings, bytes_sm_mappings,
-                          inter_localWS, inter_globalWS);
-
-    MainRootIn* argsM = (MainRootIn*) malloc (sizeof(MainRootIn));
-    MainPackData( argsM,
-                  sm_mappings, bytes_sm_mappings,
-                  img_dim,
-                  ranges[0], // This would actually be set inside loop
-                  ranges[1], // This would actually be set inside loop
-                  histo_height,
-                  histo_width,
-                  global_subhisto, bytes_global_subhisto,
-                  (unsigned int*)global_histo, bytes_global_histo,
-                  global_overflow, bytes_global_overflow,
-                  main_localWS[0], main_localWS[1],
-                  main_globalWS[0], main_globalWS[1]
-                  );
-
-
-    FinalRootIn* argsF = (FinalRootIn*) malloc(sizeof(FinalRootIn));
-    FinalPackData(argsF,
-                  ranges[0], // This would actually be set inside loop
-                  ranges[1], // This would actually be set inside loop
-                  histo_height,
-                  histo_width,
-                  global_subhisto, bytes_global_subhisto,
-                  (unsigned int*)global_histo, bytes_global_histo,
-                  global_overflow, bytes_global_overflow,
-                  (unsigned int*)final_histo, bytes_final_histo,
-                  final_localWS, final_globalWS
-                  );
-
-
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-
-#ifndef debug
-    char inputf[] = "input-visc";
-    char sm_mappingsf[] = "sm_mappings-visc";
-    char global_subhistof[] = "global_subhisto-visc";
-    char global_histof[] = "global_histo-visc";
-    char global_overflowf[] = "global_overflow-visc";
-    numIterations = 1;
-#endif
-
-        //memset(global_subhisto, 0, img_width*histo_height*sizeof(unsigned int));
-    for (int iter = 0; iter < numIterations; iter++) {
-        llvm_visc_track_mem(global_subhisto, bytes_global_subhisto);
-        pb_SwitchToSubTimer(&timers, postpremems , visc_TimerID_COMPUTATION);
-        //llvm_visc_request_mem(ranges, bytes_ranges);
-        //llvm_visc_request_mem(input, bytes_input);
-        ranges[0] = UINT32_MAX;
-        ranges[1] = 0;
-
-        // how about something like
-        // __global__ unsigned int ranges[2];
-        // ...kernel
-        // __shared__ unsigned int s_ranges[2];
-        // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
-        // __syncthreads();
-
-        // Although then removing the blocking cudaMemcpy's might cause something about
-        // concurrent kernel execution.
-        // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?
-
-#ifndef debug        
-        dump_histo_img((unsigned char*)input, 1, bytes_input, inputf);
-#endif
-        pb_SwitchToSubTimer(&timers, prescans , visc_TimerID_COMPUTATION);
-        void* PrescanDFG = __visc__launch(0, PrescanRoot, (void*)argsP);
-        __visc__wait(PrescanDFG);
-
-        pb_SwitchToSubTimer(&timers, postpremems , visc_TimerID_COMPUTATION);
-        
-        llvm_visc_request_mem(ranges, bytes_ranges);
-
-        //printf("Range: (%d, %d)\n", ranges[0], ranges[1]);
-#ifndef debug
-        printf("Range: (%d, %d)\n", ranges[0], ranges[1]);
-#endif
-        // Requesting only so that we can write it to zero. IT should not be
-        // copied from device to host
-        llvm_visc_request_mem(global_subhisto, bytes_global_subhisto);
-        // Set global_subhisto to zero
-        memset(global_subhisto, 0, bytes_global_subhisto);
-        //pb_SwitchToSubTimer(&timers, memsets , visc_TimerID_COMPUTATION);
-
-        pb_SwitchToSubTimer(&timers, intermediates, visc_TimerID_COMPUTATION);
-
-        void* IntermediatesDFG = __visc__launch(0, IntermediatesRoot, (void*)argsI);
-        __visc__wait(IntermediatesDFG);
-
-        pb_SwitchToSubTimer(&timers, viscOverhead, visc_TimerID_COMPUTATION);
-#ifndef debug
-        llvm_visc_request_mem(sm_mappings, bytes_sm_mappings);
-        dump_histo_img(sm_mappings, 1, bytes_sm_mappings, sm_mappingsf);
-#endif
-        argsM->gridy = ranges[1]-ranges[0]+1;
-
-        argsM->sm_range_min = ranges[0];
-        argsM->sm_range_max = ranges[1];
-        argsF->sm_range_min = ranges[0];
-        argsF->sm_range_max = ranges[1];
-        
-        pb_SwitchToSubTimer(&timers, mains, visc_TimerID_COMPUTATION);
-
-        void* MainDFG = __visc__launch(0, MainRoot, (void*) argsM);
-        __visc__wait(MainDFG);
-#ifndef debug
-        llvm_visc_request_mem(global_subhisto, bytes_global_subhisto);
-        dump_histo_img((unsigned char*)global_subhisto, 1, bytes_global_subhisto, global_subhistof);
-
-        llvm_visc_request_mem(global_histo, bytes_global_histo);
-        dump_histo_img((unsigned char*)global_histo, 1, bytes_global_histo, global_histof);
-
-        llvm_visc_request_mem(global_overflow, bytes_global_overflow);
-        dump_histo_img((unsigned char*)global_overflow, 1, bytes_global_overflow, global_overflowf);
-#endif
-        pb_SwitchToSubTimer(&timers, finals, visc_TimerID_COMPUTATION);
-
-        void* FinalDFG = __visc__launch(0, FinalRoot, (void*) argsF);
-        __visc__wait(FinalDFG);
-
-        pb_SwitchToSubTimer(&timers, viscOverhead, visc_TimerID_COMPUTATION);
-        llvm_visc_untrack_mem(global_subhisto);
-    }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    llvm_visc_request_mem(final_histo, bytes_final_histo);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    llvm_visc_untrack_mem(input);
-    llvm_visc_untrack_mem(ranges);
-    llvm_visc_untrack_mem(sm_mappings);
-    llvm_visc_untrack_mem(global_subhisto);
-    llvm_visc_untrack_mem(global_histo);
-    llvm_visc_untrack_mem(global_overflow);
-    llvm_visc_untrack_mem(final_histo);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (parameters->outFile) {
-        dump_histo_img(final_histo, histo_height, histo_width, parameters->outFile);
-    }
-
-    //pb_SwitchToTimer(&timers, visc_TimerID_MEM_FREE);
-
-    free(img);
-    free(input);
-    free(ranges);
-    free(sm_mappings);
-    free(global_subhisto);
-    free(global_histo);
-    free(global_overflow);
-    free(final_histo);
-
-    //pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    printf("\n");
-    pb_FreeParameters(parameters);
-
-    pb_DestroyTimerSet(&timers);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/visc/util.cpp b/hpvm/test/parboil/benchmarks/histo/src/visc/util.cpp
deleted file mode 100644
index 266462c936..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/visc/util.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#include "util.h"
-#include "bmp.h"
-
-// This function takes an HSV value and converts it to BMP.
-// We use this function to generate colored images with
-// Smooth spectrum traversal for the input and output images.
-RGB HSVtoRGB( float h, float s, float v )
-{
-    int i;
-    float f, p, q, t;
-    float r, g, b;
-    RGB value={0,0,0};
-
-    if( s == 0 ) {
-        r = g = b = v;
-        return value;
-    }
-    h /= 60;
-    i = floor( h );
-    f = h - i;
-    p = v * ( 1 - s );
-    q = v * ( 1 - s * f );
-    t = v * ( 1 - s * ( 1 - f ) );
-    switch( i ) {
-        case 0:
-            r = v; g = t; b = p;
-            break;
-        case 1:
-            r = q; g = v; b = p;
-            break;
-        case 2:
-            r = p; g = v; b = t;
-            break;
-        case 3:
-            r = p; g = q; b = v;
-            break;
-        case 4:
-            r = t; g = p; b = v;
-            break;
-        default:
-            r = v; g = p; b = q;
-            break;
-    }
-
-    unsigned int temp = r*255;
-    value.R = temp;
-    temp = g*255;
-    value.G = temp;
-    temp = b*255;
-    value.B = temp;
-
-    return value;
-}
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename)
-{
-    RGB* pixel_map = (RGB*) malloc (height*width*sizeof(RGB));
-
-    for (size_t y = 0; y < height; ++y)
-    {
-        for (size_t x = 0; x < width; ++x)
-        {
-            unsigned char value = histo[y * width + x];
-
-            if (value == 0){
-                pixel_map[y*width+x].R = 0;
-                pixel_map[y*width+x].G = 0;
-                pixel_map[y*width+x].B = 0;
-            } else {
-                pixel_map[y*width+x] = HSVtoRGB(0.0,1.0,cbrt(1+ 63.0*((float)value)/((float)UINT8_MAX))/4);
-            }
-        }
-    }
-    create_bmp(pixel_map, height, width, filename);
-    free(pixel_map);
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/src/visc/util.h b/hpvm/test/parboil/benchmarks/histo/src/visc/util.h
deleted file mode 100644
index 8db501970c..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/visc/util.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef __HISTO_UTIL_H_
-#define __HISTO_UTIL_H_
-
-#define UINT8_MAX 255
-#define UINT32_MAX 4294967295
-
-void dump_histo_img(unsigned char* histo, unsigned int height, unsigned int width, const char *filename);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/histo/src/visc_one/main.cpp b/hpvm/test/parboil/benchmarks/histo/src/visc_one/main.cpp
deleted file mode 100755
index cf74b38390..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/src/visc_one/main.cpp
+++ /dev/null
@@ -1,1272 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <visc.h>
-#include "util.h"
-
-#define BLOCK_X             14
-
-#define PRESCAN_THREADS     512
-#define PRESCAN_BLOCKS_X    64
-
-#define UNROLL              16
-
-#define KB                  24
-#define BINS_PER_BLOCK      ((KB)*1024)
-
-#define min(x,y)            ((x) < (y))? (x) : (y)
-
-#define debug
-//#undef debug
-
-/******************************************************************************
-* Implementation: GPU
-* Details:
-* in the GPU implementation of histogram, we begin by computing the span of the
-* input values into the histogram. Then the histogramming computation is carried
-* out by a (BLOCK_X, BLOCK_Y) sized grid, where every group of Y (same X)
-* computes its own partial histogram for a part of the input, and every Y in the
-* group exclusively writes to a portion of the span computed in the beginning.
-* Finally, a reduction is performed to combine all the partial histograms into
-* the final result.
-******************************************************************************/
-// Prescan
-
-void PrescanAllocation (int block) {
-    // Memory shared between threadblocks
-    size_t bytes_Avg = sizeof(float)*PRESCAN_THREADS;
-    void* Avg  = __visc__malloc(bytes_Avg);
-
-    size_t bytes_StdDev = sizeof(float)*PRESCAN_THREADS;
-    void* StdDev = __visc__malloc(bytes_StdDev);
-    __visc__return(Avg, bytes_Avg, StdDev, bytes_StdDev);
-}
-
-void PrescanLeaf (unsigned int* input, size_t bytes_input,
-                  int size,
-                  unsigned int* minmax, size_t bytes_minmax,
-                  // local memory arguments
-                  float* Avg, size_t bytes_Avg,
-                  float* StdDev, size_t bytes_StdDev) {
-
-
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(2, input, minmax, 1, minmax);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    int gdimx = __visc__getNumNodeInstances_x(parentNode);
-
-    int threadIdxx = lx;
-    int blockDimx = dimx;
-    int blockIdxx = gx;
-    int stride = size/gdimx;
-    int addr = blockIdxx*stride+threadIdxx;
-    int end = blockIdxx*stride + stride/8; // Only sample 1/8th of the input data
-
-    // Compute the average per thread
-    float avg = 0.0;
-    unsigned int count = 0;
-    while (addr < end) {
-        avg += input[addr];
-        count++;
-        addr += blockDimx;
-    }
-    avg /= count;
-    Avg[threadIdxx] = avg;
-
-    // Compute the standard deviation per thread
-    int addr2 = blockIdxx*stride+threadIdxx;
-    float stddev = 0;
-    while (addr2 < end) {
-        stddev += (input[addr2]-avg)*(input[addr2]-avg);
-        addr2 += blockDimx;
-    }
-    stddev /= count;
-    StdDev[threadIdxx] = __visc__sqrt(stddev);
-
-#define SUM(stride__)\
-if(threadIdxx < stride__){\
-    Avg[threadIdxx] += Avg[threadIdxx+stride__];\
-    StdDev[threadIdxx] += StdDev[threadIdxx+stride__];\
-}
-
-    // Add all the averages and standard deviations from all the threads
-    // and take their arithmetic average (as a simplified approximation of the
-    // real average and standard deviation.
-#if (PRESCAN_THREADS >= 32)
-    for (int stride = PRESCAN_THREADS/2; stride >= 32; stride = stride >> 1) {
-        __visc__barrier();
-        SUM(stride);
-    }
-#endif
-#if (PRESCAN_THREADS >= 16)
-    SUM(16);
-#endif
-#if (PRESCAN_THREADS >= 8)
-    SUM(8);
-#endif
-#if (PRESCAN_THREADS >= 4)
-    SUM(4);
-#endif
-#if (PRESCAN_THREADS >= 2)
-    SUM(2);
-#endif
-
-    if (threadIdxx == 0) {
-        float avg = Avg[0]+Avg[1];
-        avg /= PRESCAN_THREADS;
-        float stddev = StdDev[0]+StdDev[1];
-        stddev /= PRESCAN_THREADS;
-
-        // Take the maximum and minimum range from all the blocks. This will
-        // be the final answer. The standard deviation is taken out to 10 sigma
-        // away from the average. The value 10 was obtained empirically.
-        __visc__atomic_umin((int*)minmax,((unsigned int)(avg-10*stddev))/(KB*1024));
-        __visc__atomic_umax((int*)minmax+1,((unsigned int)(avg+10*stddev))/(KB*1024));
-    }
-}
-
-void PrescanBlock (unsigned int* input, size_t bytes_input,
-                   int size,
-                   unsigned int* minmax, size_t bytes_minmax,
-                   int block) {
-
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(2, input, minmax, 1, minmax);
-
-    void* AllocationNode = __visc__createNode(PrescanAllocation);
-    void* PrescanLeafNode = __visc__createNode1D(PrescanLeaf, block);
-
-    // Bind Inputs
-    __visc__bindIn(AllocationNode, 5, 0, 0);  // Bind block
-
-    __visc__bindIn(PrescanLeafNode, 0, 0, 0); // Bind input
-    __visc__bindIn(PrescanLeafNode, 1, 1, 0); // Bind bytes_input
-    __visc__bindIn(PrescanLeafNode, 2, 2, 0); // Bind size
-    __visc__bindIn(PrescanLeafNode, 3, 3, 0); // Bind minmax
-    __visc__bindIn(PrescanLeafNode, 4, 4, 0); // Bind bytes_minmax
-
-    // Create Edges
-    __visc__edge(AllocationNode, PrescanLeafNode, 1, 0, 5, 0); // Edge Avg
-    __visc__edge(AllocationNode, PrescanLeafNode, 1, 1, 6, 0); // Edge bytes_Avg
-    __visc__edge(AllocationNode, PrescanLeafNode, 1, 2, 7, 0); // Edge StdDev
-    __visc__edge(AllocationNode, PrescanLeafNode, 1, 3, 8, 0); // Edge bytes_StdDev
-
-}
-
-void PrescanRoot (unsigned int* input, size_t bytes_input,
-                  int size,
-                  unsigned int* minmax, size_t bytes_minmax,
-                  int block,
-                  int grid) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(2, input, minmax, 1, minmax);
-    //__visc__attributes(1, input, 1, minmax);
-
-    void* PrescanBlockNode = __visc__createNode1D(PrescanBlock, grid);
-
-    // Bind Inputs
-    __visc__bindIn(PrescanBlockNode, 0, 0, 0); // Bind input
-    __visc__bindIn(PrescanBlockNode, 1, 1, 0); // Bind bytes_input
-    __visc__bindIn(PrescanBlockNode, 2, 2, 0); // Bind size
-    __visc__bindIn(PrescanBlockNode, 3, 3, 0); // Bind minmax
-    __visc__bindIn(PrescanBlockNode, 4, 4, 0); // Bind bytes_minmax
-    __visc__bindIn(PrescanBlockNode, 5, 5, 0); // Bind block
-
-}
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int* input;
-    size_t bytes_input;
-    int size;
-    unsigned int* minmax;
-    size_t bytes_minmax;
-    int block;
-    int grid;
-}
-PrescanRootIn;
-
-void PrescanPackData( PrescanRootIn* args,
-                      unsigned int* input, size_t bytes_input,
-                      int size,
-                      unsigned int* minmax, size_t bytes_minmax,
-                      int block,
-                      int grid
-                    ) {
-    args->input = input;
-    args->bytes_input = bytes_input;
-    args->size = size;
-    args->minmax = minmax;
-    args->bytes_minmax = bytes_minmax;
-    args->block = block;
-    args->grid = grid;
-}
-
-// Intermediates
-typedef struct __attribute__((__packed__)) {
-    unsigned char x;
-    unsigned char y;
-    unsigned char z;
-    unsigned char w;
-}
-uchar4;
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int x;
-    unsigned int y;
-}
-uint2;
-
-__attribute__((always_inline))
-static inline void calculateBin (unsigned int bin, uchar4 *sm_mapping) {
-
-    unsigned char offset  =  bin        %   4;
-    unsigned char indexlo = (bin >>  2) % 256;
-    unsigned char indexhi = (bin >> 10) %  KB;
-    unsigned char block   =  bin / BINS_PER_BLOCK;
-
-    offset *= 8;
-
-    uchar4 sm;
-    sm.x = block;
-    sm.y = indexhi;
-    sm.z = indexlo;
-    sm.w = offset;
-
-    *sm_mapping = sm;
-}
-
-void IntermediatesLeaf( uint2 *input, size_t bytes_input,
-                        unsigned int height,
-                        unsigned int width,
-                        unsigned int input_pitch,
-                        uchar4 *sm_mappings, size_t bytes_sm_mappings) {
-
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(1, input, 1, sm_mappings);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-
-    int threadIdxx = lx;
-    int blockDimx = dimx;
-    unsigned int line = UNROLL * (gx);// 16 is the unroll factor;
-
-    uint2 *load_bin = input + line * input_pitch + threadIdxx;
-
-    unsigned int store = line * width + threadIdxx;
-    bool skip = (width % 2) && (threadIdxx == (blockDimx - 1));
-
-#pragma unroll
-    for (int i = 0; i < UNROLL; i++)
-    {
-        uint2 bin_value = *load_bin;
-
-        calculateBin (
-            bin_value.x,
-            &sm_mappings[store]
-        );
-
-        if (!skip) calculateBin (
-                bin_value.y,
-                &sm_mappings[store + blockDimx]
-            );
-
-        load_bin += input_pitch;
-        store += width;
-    }
-}
-
-void IntermediatesBlock(uint2 *input, size_t bytes_input,
-                        unsigned int height,
-                        unsigned int width,
-                        unsigned int input_pitch,
-                        uchar4 *sm_mappings, size_t bytes_sm_mappings,
-                        int block) {
-
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(1, input, 1, sm_mappings);
-
-    void* IntermediatesLeafNode = __visc__createNode1D(IntermediatesLeaf, block);
-
-    // Bind Inputs
-    __visc__bindIn(IntermediatesLeafNode, 0, 0, 0); // Bind Input
-    __visc__bindIn(IntermediatesLeafNode, 1, 1, 0); // Bind bytes_input
-    __visc__bindIn(IntermediatesLeafNode, 2, 2, 0); // Bind height
-    __visc__bindIn(IntermediatesLeafNode, 3, 3, 0); // Bind width
-    __visc__bindIn(IntermediatesLeafNode, 4, 4, 0); // Bind input_pitch
-    __visc__bindIn(IntermediatesLeafNode, 5, 5, 0); // Bind sm_mappings
-    __visc__bindIn(IntermediatesLeafNode, 6, 6, 0); // Bind bytes_sm_mappings
-}
-
-void IntermediatesRoot( uint2 *input, size_t bytes_input,
-                        unsigned int height,
-                        unsigned int width,
-                        unsigned int input_pitch,
-                        uchar4 *sm_mappings, size_t bytes_sm_mappings,
-                        int block,
-                        int grid) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(1, input, 1, sm_mappings);
-
-    void* IntermediatesBlockNode = __visc__createNode1D(IntermediatesBlock, grid);
-
-    // Bind Inputs
-    __visc__bindIn(IntermediatesBlockNode, 0, 0, 0); // Bind Input
-    __visc__bindIn(IntermediatesBlockNode, 1, 1, 0); // Bind bytes_input
-    __visc__bindIn(IntermediatesBlockNode, 2, 2, 0); // Bind height
-    __visc__bindIn(IntermediatesBlockNode, 3, 3, 0); // Bind width
-    __visc__bindIn(IntermediatesBlockNode, 4, 4, 0); // Bind input_pitch
-    __visc__bindIn(IntermediatesBlockNode, 5, 5, 0); // Bind sm_mappings
-    __visc__bindIn(IntermediatesBlockNode, 6, 6, 0); // Bind bytes_sm_mappings
-    __visc__bindIn(IntermediatesBlockNode, 7, 7, 0); // Bind block
-}
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int* input;
-    size_t bytes_input;
-    unsigned height;
-    unsigned width;
-    unsigned input_pitch;
-    unsigned char* sm_mappings;
-    size_t bytes_sm_mappings;
-    int block;
-    int grid;
-}
-IntermediatesRootIn;
-
-void IntermediatesPackData( IntermediatesRootIn* args,
-                            unsigned int* input, size_t bytes_input,
-                            unsigned height,
-                            unsigned width,
-                            unsigned input_pitch,
-                            unsigned char* sm_mappings,
-                            size_t bytes_sm_mappings,
-                            int block,
-                            int grid
-                          ) {
-    args->input = input;
-    args->bytes_input = bytes_input;
-    args->height = height;
-    args->width = width;
-    args->input_pitch = input_pitch;
-    args->sm_mappings = sm_mappings;
-    args->bytes_sm_mappings = bytes_sm_mappings;
-    args->block = block;
-    args->grid = grid;
-}
-
-// Histo Main
-
-__attribute__((always_inline))
-static inline void testIncrementGlobal (
-    unsigned int *global_histo,
-    unsigned int sm_range_min,
-    unsigned int sm_range_max,
-    const uchar4 sm)
-{
-    const unsigned int range = sm.x;
-    const unsigned int indexhi = sm.y;
-    const unsigned int indexlo = sm.z;
-    const unsigned int offset  = sm.w;
-
-    /* Scan for inputs that are outside the central region of histogram */
-    if (range < sm_range_min || range > sm_range_max)
-    {
-        const unsigned int bin = range * BINS_PER_BLOCK + offset / 8 + (indexlo << 2) + (indexhi << 10);
-        const unsigned int bin_div2 = bin / 2;
-        const unsigned int bin_offset = (bin % 2 == 1) ? 16 : 0;
-
-        unsigned int old_val = global_histo[bin_div2];
-        unsigned short old_bin = (old_val >> bin_offset) & 0xFFFF;
-
-        if (old_bin < 255)
-        {
-            __visc__atomic_add ((int*)&global_histo[bin_div2], 1 << bin_offset);
-        }
-    }
-}
-
-#define smem(x,y) smem[(x)*256+(y)]
-
-__attribute__((always_inline))
-static inline void testIncrementLocal (
-    unsigned int *global_overflow,
-    unsigned int* smem,
-    const unsigned int myRange,
-    const uchar4 sm)
-{
-    const unsigned int range = sm.x;
-    const unsigned int indexhi = sm.y;
-    const unsigned int indexlo = sm.z;
-    const unsigned int offset  = sm.w;
-
-    /* Scan for inputs that are inside the central region of histogram */
-    if (range == myRange)
-    {
-        /* Atomically increment shared memory */
-        unsigned int add = (unsigned int)(1 << offset);
-        unsigned int prev = __visc__atomic_add ((int*)&smem(indexhi, indexlo), add);
-
-        /* Check if current bin overflowed */
-        unsigned int prev_bin_val = (prev >> offset) & 0x000000FF;
-
-        /* If there was an overflow, record it and record if it cascaded into other bins */
-        if (prev_bin_val == 0x000000FF)
-        {
-            const unsigned int bin =
-                range * BINS_PER_BLOCK +
-                offset / 8 + (indexlo << 2) + (indexhi << 10);
-
-            bool can_overflow_to_bin_plus_1 = (offset < 24) ? true : false;
-            bool can_overflow_to_bin_plus_2 = (offset < 16) ? true : false;
-            bool can_overflow_to_bin_plus_3 = (offset <  8) ? true : false;
-
-            bool overflow_into_bin_plus_1 = false;
-            bool overflow_into_bin_plus_2 = false;
-            bool overflow_into_bin_plus_3 = false;
-
-            unsigned int prev_bin_plus_1_val = (prev >> (offset +  8)) & 0x000000FF;
-            unsigned int prev_bin_plus_2_val = (prev >> (offset + 16)) & 0x000000FF;
-            unsigned int prev_bin_plus_3_val = (prev >> (offset + 24)) & 0x000000FF;
-
-            if (can_overflow_to_bin_plus_1 &&        prev_bin_val == 0x000000FF) overflow_into_bin_plus_1 = true;
-            if (can_overflow_to_bin_plus_2 && prev_bin_plus_1_val == 0x000000FF) overflow_into_bin_plus_2 = true;
-            if (can_overflow_to_bin_plus_3 && prev_bin_plus_2_val == 0x000000FF) overflow_into_bin_plus_3 = true;
-
-            unsigned int bin_plus_1_add;
-            unsigned int bin_plus_2_add;
-            unsigned int bin_plus_3_add;
-
-            if (overflow_into_bin_plus_1) bin_plus_1_add = (prev_bin_plus_1_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-            if (overflow_into_bin_plus_2) bin_plus_2_add = (prev_bin_plus_2_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-            if (overflow_into_bin_plus_3) bin_plus_3_add = (prev_bin_plus_3_val < 0x000000FF) ? 0xFFFFFFFF : 0x000000FF;
-
-            __visc__atomic_add ((int*)&global_overflow[bin],   256);
-            if (overflow_into_bin_plus_1) __visc__atomic_add ((int*)&global_overflow[bin+1], bin_plus_1_add);
-            if (overflow_into_bin_plus_2) __visc__atomic_add ((int*)&global_overflow[bin+2], bin_plus_2_add);
-            if (overflow_into_bin_plus_3) __visc__atomic_add ((int*)&global_overflow[bin+3], bin_plus_3_add);
-        }
-    }
-}
-
-__attribute__((always_inline))
-static inline void clearMemory (unsigned int* smem, int lx, int dimx)
-{
-    for (int i = lx, blockDimx = dimx; i < BINS_PER_BLOCK / 4; i += blockDimx)
-    {
-        ((unsigned int*)smem)[i] = 0;
-    }
-}
-
-__attribute__((always_inline))
-static inline void copyMemory (unsigned int *dst, unsigned int* src, int lx, int dimx)
-{
-    for (int i = lx, blockDimx = dimx; i < BINS_PER_BLOCK/4; i += blockDimx)
-    {
-        dst[i] = ((unsigned int*)src)[i];
-    }
-}
-
-void MainAllocation (int block) {
-    // Memory shared between threadblocks
-    size_t bytes_sub_histo = sizeof(unsigned int) * KB * 256;
-    void* sub_histo  = __visc__malloc(bytes_sub_histo);
-
-    __visc__return(sub_histo, bytes_sub_histo);
-}
-
-void MainLeaf ( uchar4* sm_mappings, size_t bytes_sm_mappings,
-                unsigned int num_elements,
-                unsigned int sm_range_min,
-                unsigned int sm_range_max,
-                unsigned int histo_height,
-                unsigned int histo_width,
-                unsigned int* global_subhisto, size_t bytes_global_subhisto,
-                unsigned int* global_histo, size_t bytes_global_histo,
-                unsigned int* global_overflow, size_t bytes_global_overflow,
-                // local memory
-                unsigned int* sub_histo, size_t bytes_sub_histo) {
-    /* Most optimal solution uses 24 * 1024 bins per threadblock */
-    //__local unsigned int sub_histo[KB][256];
-
-    /* Each threadblock contributes to a specific 24KB range of histogram,
-     * and also scans every N-th line for interesting data.  N = gridDim.x
-     */
-
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(1, sm_mappings, 3, global_subhisto, global_histo, global_overflow);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    int gdimx = __visc__getNumNodeInstances_x(parentNode);
-
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    unsigned int blockDimx = dimx;
-    unsigned int gridDimx = gdimx;
-    unsigned int local_scan_range = sm_range_min + gy;
-    unsigned int local_scan_load = gx * blockDimx + lx;
-
-    clearMemory (sub_histo, lx, dimx);
-    __visc__barrier();
-
-    if (gy == 0)
-    {
-        /* Loop through and scan the input */
-        while (local_scan_load < num_elements)
-        {
-            /* Read buffer */
-            uchar4 sm = sm_mappings[local_scan_load];
-            local_scan_load += blockDimx * gridDimx;
-
-            /* Check input */
-            testIncrementLocal (
-                global_overflow,
-                sub_histo,
-                local_scan_range,
-                sm
-            );
-            testIncrementGlobal (
-                global_histo,
-                sm_range_min,
-                sm_range_max,
-                sm
-            );
-        }
-    }
-    else
-    {
-        /* Loop through and scan the input */
-        while (local_scan_load < num_elements)
-        {
-            /* Read buffer */
-            uchar4 sm = sm_mappings[local_scan_load];
-            local_scan_load += blockDimx * gridDimx;
-
-            /* Check input */
-            testIncrementLocal (
-                global_overflow,
-                sub_histo,
-                local_scan_range,
-                sm
-            );
-        }
-    }
-
-    /* Store sub histogram to global memory */
-    unsigned int store_index = gx * (histo_height * histo_width / 4) + (local_scan_range * BINS_PER_BLOCK / 4);//(local_scan_range * BINS_PER_BLOCK);
-
-    __visc__barrier();
-    copyMemory (&(global_subhisto[store_index]), sub_histo, lx, dimx);
-}
-
-void MainBlock (uchar4* sm_mappings, size_t bytes_sm_mappings,
-                unsigned int num_elements,
-                unsigned int sm_range_min,
-                unsigned int sm_range_max,
-                unsigned int histo_height,
-                unsigned int histo_width,
-                unsigned int* global_subhisto, size_t bytes_global_subhisto,
-                unsigned int* global_histo, size_t bytes_global_histo,
-                unsigned int* global_overflow, size_t bytes_global_overflow,
-                int blockx, int blocky,
-                int gridx, int gridy) {
-
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(1, sm_mappings, 3, global_subhisto, global_histo, global_overflow);
-
-    void* AllocationNode = __visc__createNode(MainAllocation);
-    void* MainLeafNode = __visc__createNode2D(MainLeaf, blockx, blocky);
-
-    // Bind Inputs
-    __visc__bindIn(AllocationNode, 5, 0, 0);  // Bind block
-
-    __visc__bindIn(MainLeafNode, 0, 0, 0); // Bind sm_mappings
-    __visc__bindIn(MainLeafNode, 1, 1, 0); // Bind bytes_sm_mappings
-    __visc__bindIn(MainLeafNode, 2, 2, 0); // Bind num_elements
-    __visc__bindIn(MainLeafNode, 3, 3, 0); // Bind sm_range_min
-    __visc__bindIn(MainLeafNode, 4, 4, 0); // Bind sm_range_max
-    __visc__bindIn(MainLeafNode, 5, 5, 0); // Bind histo_height
-    __visc__bindIn(MainLeafNode, 6, 6, 0); // Bind histo_width
-    __visc__bindIn(MainLeafNode, 7, 7, 0); // Bind global_subhisto
-    __visc__bindIn(MainLeafNode, 8, 8, 0); // Bind bytes_global_subhisto
-    __visc__bindIn(MainLeafNode, 9, 9, 0); // Bind global_histo
-    __visc__bindIn(MainLeafNode, 10, 10, 0); // Bind bytes_global_histo
-    __visc__bindIn(MainLeafNode, 11, 11, 0); // Bind overflow
-    __visc__bindIn(MainLeafNode, 12, 12, 0); // Bind bytes_overflow
-
-    // Create Edges
-    __visc__edge(AllocationNode, MainLeafNode, 1, 0, 13, 0); // Edge sub_histo
-    __visc__edge(AllocationNode, MainLeafNode, 1, 1, 14, 0); // Edge bytes_sub_histo
-
-}
-
-void MainRoot (uchar4* sm_mappings, size_t bytes_sm_mappings,
-               unsigned int num_elements,
-               unsigned int sm_range_min,
-               unsigned int sm_range_max,
-               unsigned int histo_height,
-               unsigned int histo_width,
-               unsigned int* global_subhisto, size_t bytes_global_subhisto,
-               unsigned int* global_histo, size_t bytes_global_histo,
-               unsigned int* global_overflow, size_t bytes_global_overflow,
-               int blockx, int blocky,
-               int gridx, int gridy) {
-
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(1, sm_mappings, 3, global_subhisto, global_histo, global_overflow);
-
-    void* MainBlockNode = __visc__createNode2D(MainBlock, gridx, gridy);
-
-    // Bind Inputs
-    __visc__bindIn(MainBlockNode, 0, 0, 0); // Bind sm_mappings
-    __visc__bindIn(MainBlockNode, 1, 1, 0); // Bind bytes_sm_mappings
-    __visc__bindIn(MainBlockNode, 2, 2, 0); // Bind num_elements
-    __visc__bindIn(MainBlockNode, 3, 3, 0); // Bind sm_range_min
-    __visc__bindIn(MainBlockNode, 4, 4, 0); // Bind sm_range_max
-    __visc__bindIn(MainBlockNode, 5, 5, 0); // Bind histo_height
-    __visc__bindIn(MainBlockNode, 6, 6, 0); // Bind histo_width
-    __visc__bindIn(MainBlockNode, 7, 7, 0); // Bind global_subhisto
-    __visc__bindIn(MainBlockNode, 8, 8, 0); // Bind bytes_global_subhisto
-    __visc__bindIn(MainBlockNode, 9, 9, 0); // Bind global_histo
-    __visc__bindIn(MainBlockNode, 10, 10, 0); // Bind bytes_global_histo
-    __visc__bindIn(MainBlockNode, 11, 11, 0); // Bind overflow
-    __visc__bindIn(MainBlockNode, 12, 12, 0); // Bind bytes_overflow
-    __visc__bindIn(MainBlockNode, 13, 13, 0); // Bind blockx
-    __visc__bindIn(MainBlockNode, 14, 14, 0); // Bind blocky
-}
-
-typedef struct __attribute__((__packed__)) {
-    unsigned char* sm_mappings;
-    size_t bytes_sm_mappings;
-    unsigned int num_elements;
-    unsigned int sm_range_min;
-    unsigned int sm_range_max;
-    unsigned int histo_height;
-    unsigned int histo_width;
-    unsigned int* global_subhisto;
-    size_t bytes_global_subhisto;
-    unsigned int* global_histo;
-    size_t bytes_global_histo;
-    unsigned int* global_overflow;
-    size_t bytes_global_overflow;
-    int blockx;
-    int blocky;
-    int gridx;
-    int gridy;
-}
-MainRootIn;
-
-
-void MainPackData(MainRootIn* args,
-                  unsigned char* sm_mappings, size_t bytes_sm_mappings,
-                  unsigned int num_elements,
-                  unsigned int sm_range_min,
-                  unsigned int sm_range_max,
-                  unsigned int histo_height,
-                  unsigned int histo_width,
-                  unsigned int* global_subhisto, size_t bytes_global_subhisto,
-                  unsigned int* global_histo, size_t bytes_global_histo,
-                  unsigned int* global_overflow, size_t bytes_global_overflow,
-                  int blockx, int blocky,
-                  int gridx, int gridy
-                 ) {
-    args->sm_mappings = sm_mappings;
-    args->bytes_sm_mappings = bytes_sm_mappings;
-    args->num_elements = num_elements;
-    args->sm_range_min = sm_range_min;
-    args->sm_range_max = sm_range_max;
-    args->histo_height = histo_height;
-    args->histo_width = histo_width;
-    args->global_subhisto = global_subhisto;
-    args->bytes_global_subhisto = bytes_global_subhisto;
-    args->global_histo = global_histo;
-    args->bytes_global_histo = bytes_global_histo;
-    args->global_overflow = global_overflow;
-    args->bytes_global_overflow = bytes_global_overflow;
-    args->blockx = blockx;
-    args->blocky = blocky;
-    args->gridx = gridx;
-    args->gridy = gridy;
-}
-
-// Final
-typedef struct __attribute__((__packed__)) {
-    unsigned short x;
-    unsigned short y;
-    unsigned short z;
-    unsigned short w;
-}
-ushort4;
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int x;
-    unsigned int y;
-    unsigned int z;
-    unsigned int w;
-}
-uint4;
-/* Combine all the sub-histogram results into one final histogram */
-void FinalLeaf (
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto, size_t bytes_global_subhisto,
-    unsigned int *global_histo, size_t bytes_global_histo,
-    unsigned int *global_overflow, size_t bytes_global_overflow,
-    unsigned int *final_histo, size_t bytes_final_histo) //final output
-{
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(3, global_subhisto, global_histo, global_overflow, 1, final_histo);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    int gdimx = __visc__getNumNodeInstances_x(parentNode);
-
-
-  
-    unsigned int blockDimx = dimx;
-    unsigned int gridDimx = gdimx;
-    unsigned int start_offset = lx + gx * blockDimx;
-    const ushort4 zero_short  = {0, 0, 0, 0};
-    const uint4 zero_int      = {0, 0, 0, 0};
-
-    unsigned int size_low_histo = sm_range_min * BINS_PER_BLOCK;
-    unsigned int size_mid_histo = (sm_range_max - sm_range_min +1) * BINS_PER_BLOCK;
-
-    /* Clear lower region of global histogram */
-    for (unsigned int i = start_offset; i < size_low_histo/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((ushort4*)global_histo)[i];
-        ((ushort4*)global_histo)[i] = zero_short;
-        
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) {
-            (unsigned char) global_histo_data.x,
-            (unsigned char) global_histo_data.y,
-            (unsigned char) global_histo_data.z,
-            (unsigned char) global_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the middle region of the overflow buffer */
-    for (unsigned int i = (size_low_histo/4) + start_offset; i < (size_low_histo+size_mid_histo)/4; i += gridDimx * blockDimx)
-    {
-        uint4 global_histo_data = ((uint4*)global_overflow)[i];
-        //((uint4*)global_overflow)[i] = zero_int;
-        global_overflow[i*4] = 0;
-        global_overflow[i*4+1] = 0;
-        global_overflow[i*4+2] = 0;
-        global_overflow[i*4+3] = 0;
-
-        uint4 internal_histo_data = (uint4){
-            global_histo_data.x,
-            global_histo_data.y,
-            global_histo_data.z,
-            global_histo_data.w
-        };
-
-        #pragma unroll
-        for (int j = 0; j < BLOCK_X; j++)
-        {
-            unsigned int bin4in = ((unsigned int*)global_subhisto)[i + j * histo_height * histo_width / 4];
-            internal_histo_data.x += (bin4in >>  0) & 0xFF;
-            internal_histo_data.y += (bin4in >>  8) & 0xFF;
-            internal_histo_data.z += (bin4in >> 16) & 0xFF;
-            internal_histo_data.w += (bin4in >> 24) & 0xFF;
-        }
-
-        internal_histo_data.x = min (internal_histo_data.x, (uint) 255);
-        internal_histo_data.y = min (internal_histo_data.y, (uint) 255);
-        internal_histo_data.z = min (internal_histo_data.z, (uint) 255);
-        internal_histo_data.w = min (internal_histo_data.w, (uint) 255);
-
-        uchar4 final_histo_data = (uchar4) {
-            (unsigned char) internal_histo_data.x,
-            (unsigned char) internal_histo_data.y,
-            (unsigned char) internal_histo_data.z,
-            (unsigned char) internal_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-
-    /* Clear the upper region of global histogram */
-    for (unsigned int i = ((size_low_histo+size_mid_histo)/4) + start_offset; i < (histo_height*histo_width)/4; i += gridDimx * blockDimx)
-    {
-        ushort4 global_histo_data = ((ushort4*)global_histo)[i];
-        ((ushort4*)global_histo)[i] = zero_short;
-
-        global_histo_data.x = min (global_histo_data.x, (ushort) 255);
-        global_histo_data.y = min (global_histo_data.y, (ushort) 255);
-        global_histo_data.z = min (global_histo_data.z, (ushort) 255);
-        global_histo_data.w = min (global_histo_data.w, (ushort) 255);
-
-        uchar4 final_histo_data = (uchar4) {
-            (unsigned char) global_histo_data.x,
-            (unsigned char) global_histo_data.y,
-            (unsigned char) global_histo_data.z,
-            (unsigned char) global_histo_data.w
-        };
-
-        ((uchar4*)final_histo)[i] = final_histo_data;
-    }
-}
-
-void FinalBlock(
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto, size_t bytes_global_subhisto,
-    unsigned int *global_histo, size_t bytes_global_histo,
-    unsigned int *global_overflow, size_t bytes_global_overflow,
-    unsigned int *final_histo, size_t bytes_final_histo, //final output
-    int block) {
-
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(3, global_subhisto, global_histo, global_overflow, 1, final_histo);
-    
-    void* FinalLeafNode = __visc__createNode1D(FinalLeaf, block);
-
-    // Bind Inputs
-    __visc__bindIn(FinalLeafNode, 0, 0, 0); // Bind sm_range_min
-    __visc__bindIn(FinalLeafNode, 1, 1, 0); // Bind sm_range_max
-    __visc__bindIn(FinalLeafNode, 2, 2, 0); // Bind histo_height
-    __visc__bindIn(FinalLeafNode, 3, 3, 0); // Bind histo_width
-    __visc__bindIn(FinalLeafNode, 4, 4, 0); // Bind global_subhisto
-    __visc__bindIn(FinalLeafNode, 5, 5, 0); // Bind bytes_global_subhisto
-    __visc__bindIn(FinalLeafNode, 6, 6, 0); // Bind global_histo
-    __visc__bindIn(FinalLeafNode, 7, 7, 0); // Bind bytes_global_histo
-    __visc__bindIn(FinalLeafNode, 8, 8, 0); // Bind global_overflow
-    __visc__bindIn(FinalLeafNode, 9, 9, 0); // Bind bytes_global_overflow
-    __visc__bindIn(FinalLeafNode, 10, 10, 0); // Bind final_histo
-    __visc__bindIn(FinalLeafNode, 11, 11, 0); // Bind bytes_final_histo
-}
-
-void FinalRoot(
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto, size_t bytes_global_subhisto,
-    unsigned int *global_histo, size_t bytes_global_histo,
-    unsigned int *global_overflow, size_t bytes_global_overflow,
-    unsigned int *final_histo, size_t bytes_final_histo, //final output
-    int block,
-    int grid) {
-
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(3, global_subhisto, global_histo, global_overflow, 1, final_histo);
-    
-    void* FinalBlockNode = __visc__createNode1D(FinalBlock, grid);
-
-    // Bind Inputs
-    __visc__bindIn(FinalBlockNode, 0, 0, 0); // Bind sm_range_min
-    __visc__bindIn(FinalBlockNode, 1, 1, 0); // Bind sm_range_max
-    __visc__bindIn(FinalBlockNode, 2, 2, 0); // Bind histo_height
-    __visc__bindIn(FinalBlockNode, 3, 3, 0); // Bind histo_width
-    __visc__bindIn(FinalBlockNode, 4, 4, 0); // Bind global_subhisto
-    __visc__bindIn(FinalBlockNode, 5, 5, 0); // Bind bytes_global_subhisto
-    __visc__bindIn(FinalBlockNode, 6, 6, 0); // Bind global_histo
-    __visc__bindIn(FinalBlockNode, 7, 7, 0); // Bind bytes_global_histo
-    __visc__bindIn(FinalBlockNode, 8, 8, 0); // Bind global_overflow
-    __visc__bindIn(FinalBlockNode, 9, 9, 0); // Bind bytes_global_overflow
-    __visc__bindIn(FinalBlockNode, 10, 10, 0); // Bind final_histo
-    __visc__bindIn(FinalBlockNode, 11, 11, 0); // Bind bytes_final_histo
-    __visc__bindIn(FinalBlockNode, 12, 12, 0); // Bind block
-}
-
-typedef struct __attribute__((__packed__)) {
-    unsigned int sm_range_min; 
-    unsigned int sm_range_max;
-    unsigned int histo_height; 
-    unsigned int histo_width;
-    unsigned int *global_subhisto;
-    size_t bytes_global_subhisto;
-    unsigned int *global_histo;
-    size_t bytes_global_histo;
-    unsigned int *global_overflow;
-    size_t bytes_global_overflow;
-    unsigned int *final_histo;
-    size_t bytes_final_histo; //final output
-    int block;
-    int grid;
-}
-FinalRootIn;
-
-void FinalPackData( FinalRootIn* args,
-    unsigned int sm_range_min, 
-    unsigned int sm_range_max,
-    unsigned int histo_height, 
-    unsigned int histo_width,
-    unsigned int *global_subhisto, size_t bytes_global_subhisto,
-    unsigned int *global_histo, size_t bytes_global_histo,
-    unsigned int *global_overflow, size_t bytes_global_overflow,
-    unsigned int *final_histo, size_t bytes_final_histo, //final output
-    int block,
-    int grid) {
-  args->sm_range_min = sm_range_min; 
-  args->sm_range_max = sm_range_max;
-  args->histo_height = histo_height; 
-  args->histo_width = histo_width;
-  args->global_subhisto = global_subhisto;
-  args->bytes_global_subhisto = bytes_global_subhisto;
-  args->global_histo = global_histo;
-  args->bytes_global_histo = bytes_global_histo;
-  args->global_overflow = global_overflow;
-  args->bytes_global_overflow = bytes_global_overflow;
-  args->final_histo = final_histo;
-  args->bytes_final_histo = bytes_final_histo; //final output
-  args->block = block;
-  args->grid = grid;
-}
-
-
-void TopNode () {
-}
-
-int main(int argc, char* argv[]) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    parameters = pb_ReadParameters(&argc, argv);
-    if (!parameters)
-        return -1;
-
-    if(!parameters->inpFiles[0]) {
-        fputs("Input file expected\n", stderr);
-        return -1;
-    }
-
-    char viscOverhead[] = "viscOverhead";
-    char prescans[] = "PreScanKernel";
-    char postpremems[] = "PostPreMems";
-    char intermediates[] = "IntermediatesKernel";
-    char mains[] = "MainKernel";
-    char finals[] = "FinalKernel";
-
-    int numIterations;
-    if (argc >= 2) {
-        numIterations = atoi(argv[1]);
-    } else {
-        fputs("Expected at least one command line argument\n", stderr);
-        return -1;
-    }
-
-    unsigned int img_width, img_height;
-    unsigned int histo_width, histo_height;
-    unsigned int nThreads;
-
-    FILE* f = fopen(parameters->inpFiles[0],"rb");
-    int result = 0;
-
-    result += fread(&img_width,    sizeof(unsigned int), 1, f);
-    result += fread(&img_height,   sizeof(unsigned int), 1, f);
-    result += fread(&histo_width,  sizeof(unsigned int), 1, f);
-    result += fread(&histo_height, sizeof(unsigned int), 1, f);
-
-    if (result != 4) {
-        fputs("Error reading input and output dimensions from file\n", stderr);
-        return -1;
-    }
-
-    unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
-    unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));
-
-    result = fread(img, sizeof(unsigned int), img_width*img_height, f);
-
-    fclose(f);
-
-    if (result != img_width*img_height) {
-        fputs("Error reading input array from file\n", stderr);
-        return -1;
-    }
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-    pb_AddSubTimer(&timers, viscOverhead, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, prescans, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, postpremems, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, intermediates, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, mains, visc_TimerID_COMPUTATION);
-    pb_AddSubTimer(&timers, finals, visc_TimerID_COMPUTATION);
-
-
-    void* histo_prescan_dfg;
-    void* histo_intermediates_dfg;
-    void* histo_main_dfg;
-    void* histo_final_kernel;
-
-    int even_width = ((img_width+1)/2)*2;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    size_t bytes_input = even_width*(((img_height+UNROLL-1)/UNROLL)*UNROLL)*sizeof(unsigned int);
-    size_t bytes_ranges = 2*sizeof(unsigned int);
-    size_t bytes_sm_mappings = img_width*img_height*4*sizeof(unsigned char);
-    size_t bytes_global_subhisto = img_width*histo_height*sizeof(unsigned int);
-    size_t bytes_global_histo = img_width*histo_height*sizeof(unsigned short);
-    size_t bytes_global_overflow = img_width*histo_height*sizeof(unsigned int);
-    size_t bytes_final_histo = img_width*histo_height*sizeof(unsigned char);
-
-    unsigned* input = (unsigned*) malloc(bytes_input);
-    unsigned* ranges = (unsigned*) malloc(bytes_ranges);
-    unsigned char* sm_mappings = (unsigned char*) malloc(bytes_sm_mappings);
-    unsigned int* global_subhisto = (unsigned int*) malloc(bytes_global_subhisto);
-    unsigned short* global_histo = (unsigned short*) malloc(bytes_global_histo);
-    unsigned int* global_overflow = (unsigned int*) malloc(bytes_global_overflow);
-    unsigned char* final_histo = (unsigned char*) malloc(bytes_final_histo);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-    llvm_visc_track_mem(input, bytes_input);
-    llvm_visc_track_mem(ranges, bytes_ranges);
-    llvm_visc_track_mem(sm_mappings, bytes_sm_mappings);
-    llvm_visc_track_mem(global_subhisto, bytes_global_subhisto);
-    llvm_visc_track_mem(global_histo, bytes_global_histo);
-    llvm_visc_track_mem(global_overflow, bytes_global_overflow);
-    llvm_visc_track_mem(final_histo, bytes_final_histo);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    switch (KB) {
-    case 48:
-        nThreads = 1024;
-        break;
-    case 24:
-        nThreads = 768;
-        break;
-    default:
-        nThreads = 512;
-        break;
-    }
-
-
-    // FIXME: How to do padding without these many copies
-    for (int y=0; y < img_height; y++) {
-        memcpy(&input[y*even_width], &img[y*img_width], img_width*sizeof(unsigned int));
-    }
-
-    int prescan_localWS = PRESCAN_THREADS;
-    int prescan_globalWS = PRESCAN_BLOCKS_X;
-    int inter_localWS = (img_width+1)/2;
-    int inter_globalWS = ((img_height + UNROLL-1)/UNROLL);
-    int main_localWS[2] = {(int)nThreads, 1};
-    int main_globalWS[2];
-    main_globalWS[0] = BLOCK_X;
-    int final_localWS = 512;
-    int final_globalWS = BLOCK_X*3;
-
-    pb_SwitchToTimer(&timers, visc_TimerID_ARG_PACK);
-
-    unsigned int img_dim = img_height*img_width;
-    PrescanRootIn* argsP = (PrescanRootIn*) malloc(sizeof(PrescanRootIn));
-    PrescanPackData(argsP,
-                    input, bytes_input,
-                    img_dim,
-                    ranges, bytes_ranges,
-                    prescan_localWS, prescan_globalWS);
-
-    unsigned int half_width = (img_width+1)/2;
-
-    IntermediatesRootIn* argsI = (IntermediatesRootIn*) malloc(sizeof(IntermediatesRootIn));
-    IntermediatesPackData(argsI,
-                          input, bytes_input,
-                          img_height,
-                          img_width,
-                          half_width,
-                          sm_mappings, bytes_sm_mappings,
-                          inter_localWS, inter_globalWS);
-
-    MainRootIn* argsM = (MainRootIn*) malloc (sizeof(MainRootIn));
-    MainPackData( argsM,
-                  sm_mappings, bytes_sm_mappings,
-                  img_dim,
-                  ranges[0], // This would actually be set inside loop
-                  ranges[1], // This would actually be set inside loop
-                  histo_height,
-                  histo_width,
-                  global_subhisto, bytes_global_subhisto,
-                  (unsigned int*)global_histo, bytes_global_histo,
-                  global_overflow, bytes_global_overflow,
-                  main_localWS[0], main_localWS[1],
-                  main_globalWS[0], main_globalWS[1]
-                  );
-
-
-    FinalRootIn* argsF = (FinalRootIn*) malloc(sizeof(FinalRootIn));
-    FinalPackData(argsF,
-                  ranges[0], // This would actually be set inside loop
-                  ranges[1], // This would actually be set inside loop
-                  histo_height,
-                  histo_width,
-                  global_subhisto, bytes_global_subhisto,
-                  (unsigned int*)global_histo, bytes_global_histo,
-                  global_overflow, bytes_global_overflow,
-                  (unsigned int*)final_histo, bytes_final_histo,
-                  final_localWS, final_globalWS
-                  );
-
-
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-
-#ifndef debug
-    char inputf[] = "input-visc";
-    char sm_mappingsf[] = "sm_mappings-visc";
-    char global_subhistof[] = "global_subhisto-visc";
-    char global_histof[] = "global_histo-visc";
-    char global_overflowf[] = "global_overflow-visc";
-    numIterations = 1;
-#endif
-
-    for (int iter = 0; iter < numIterations; iter++) {
-        llvm_visc_request_mem(ranges, bytes_ranges);
-        //llvm_visc_request_mem(input, bytes_input);
-        ranges[0] = UINT32_MAX;
-        ranges[1] = 0;
-
-        // how about something like
-        // __global__ unsigned int ranges[2];
-        // ...kernel
-        // __shared__ unsigned int s_ranges[2];
-        // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
-        // __syncthreads();
-
-        // Although then removing the blocking cudaMemcpy's might cause something about
-        // concurrent kernel execution.
-        // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?
-
-#ifndef debug        
-        dump_histo_img((unsigned char*)input, 1, bytes_input, inputf);
-#endif
-        pb_SwitchToSubTimer(&timers, prescans , visc_TimerID_COMPUTATION);
-        void* PrescanDFG = __visc__launch(0, PrescanRoot, (void*)argsP);
-        __visc__wait(PrescanDFG);
-
-        pb_SwitchToSubTimer(&timers, postpremems , visc_TimerID_COMPUTATION);
-        
-        llvm_visc_request_mem(ranges, bytes_ranges);
-
-        //printf("Range: (%d, %d)\n", ranges[0], ranges[1]);
-#ifndef debug
-        printf("Range: (%d, %d)\n", ranges[0], ranges[1]);
-#endif
-        // Requesting only so that we can write it to zero. IT should not be
-        // copied from device to host
-        llvm_visc_request_mem(global_subhisto, bytes_global_subhisto);
-        // Set global_subhisto to zero
-        memset(global_subhisto, 0, img_width*histo_height*sizeof(unsigned int));
-
-        pb_SwitchToSubTimer(&timers, intermediates, visc_TimerID_COMPUTATION);
-
-        void* IntermediatesDFG = __visc__launch(0, IntermediatesRoot, (void*)argsI);
-        __visc__wait(IntermediatesDFG);
-
-        pb_SwitchToSubTimer(&timers, viscOverhead, visc_TimerID_COMPUTATION);
-#ifndef debug
-        llvm_visc_request_mem(sm_mappings, bytes_sm_mappings);
-        dump_histo_img(sm_mappings, 1, bytes_sm_mappings, sm_mappingsf);
-#endif
-        argsM->gridy = ranges[1]-ranges[0]+1;
-
-        argsM->sm_range_min = ranges[0];
-        argsM->sm_range_max = ranges[1];
-        argsF->sm_range_min = ranges[0];
-        argsF->sm_range_max = ranges[1];
-        
-        pb_SwitchToSubTimer(&timers, mains, visc_TimerID_COMPUTATION);
-
-        void* MainDFG = __visc__launch(0, MainRoot, (void*) argsM);
-        __visc__wait(MainDFG);
-#ifndef debug
-        llvm_visc_request_mem(global_subhisto, bytes_global_subhisto);
-        dump_histo_img((unsigned char*)global_subhisto, 1, bytes_global_subhisto, global_subhistof);
-
-        llvm_visc_request_mem(global_histo, bytes_global_histo);
-        dump_histo_img((unsigned char*)global_histo, 1, bytes_global_histo, global_histof);
-
-        llvm_visc_request_mem(global_overflow, bytes_global_overflow);
-        dump_histo_img((unsigned char*)global_overflow, 1, bytes_global_overflow, global_overflowf);
-#endif
-        pb_SwitchToSubTimer(&timers, finals, visc_TimerID_COMPUTATION);
-
-        void* FinalDFG = __visc__launch(0, FinalRoot, (void*) argsF);
-        __visc__wait(FinalDFG);
-
-        pb_SwitchToSubTimer(&timers, viscOverhead, visc_TimerID_COMPUTATION);
-    }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    llvm_visc_request_mem(final_histo, bytes_final_histo);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    llvm_visc_untrack_mem(input);
-    llvm_visc_untrack_mem(ranges);
-    llvm_visc_untrack_mem(sm_mappings);
-    llvm_visc_untrack_mem(global_subhisto);
-    llvm_visc_untrack_mem(global_histo);
-    llvm_visc_untrack_mem(global_overflow);
-    llvm_visc_untrack_mem(final_histo);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (parameters->outFile) {
-        dump_histo_img(final_histo, histo_height, histo_width, parameters->outFile);
-    }
-
-    //pb_SwitchToTimer(&timers, visc_TimerID_MEM_FREE);
-
-    free(img);
-    free(input);
-    free(ranges);
-    free(sm_mappings);
-    free(global_subhisto);
-    free(global_histo);
-    free(global_overflow);
-    free(final_histo);
-
-    //pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    printf("\n");
-    pb_FreeParameters(parameters);
-
-    pb_DestroyTimerSet(&timers);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/histo/tools/compare-output b/hpvm/test/parboil/benchmarks/histo/tools/compare-output
deleted file mode 100755
index 05c7127f33..0000000000
--- a/hpvm/test/parboil/benchmarks/histo/tools/compare-output
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-# (c) Copyright 2010 The Board of Trustees of the University of Illinois.
-
-cmp $1 $2 &> /dev/null
-
-if [ $? -eq 0 ]         # Test exit status of "cmp" command.
-then
-  echo "Pass"
-else  
-  echo "Mismatch"
-  exit 1
-fi
diff --git a/hpvm/test/parboil/benchmarks/kmeans/Makefile b/hpvm/test/parboil/benchmarks/kmeans/Makefile
deleted file mode 100644
index dae329b017..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/Makefile
+++ /dev/null
@@ -1,42 +0,0 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil
-APP = kmeans
-
-# Default compile visc
-ifeq ($(VERSION),)
-  VERSION = visc
-endif
-
-# Default use small test case
-ifeq ($(TEST),)
-  TEST=0
-endif
-
-ifeq ($(TEST),0)
-  FILE=100
-else ifeq ($(TEST),1)
-  FILE=204800
-else ifeq ($(TEST),2)
-  FILE=819200
-else ifeq ($(TEST),3)
-  FILE=kdd_cup
-endif
-
-ifeq ($(PLATFORM),)
-PLATFORM=default
-endif
-
-BIN = $(addsuffix -$(VERSION), $(APP))
-
-SRCDIR = src/$(VERSION)
-BUILDDIR = build/$(VERSION)_$(PLATFORM)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
-
-INPUT = $(DATASET_DIR)/$(TEST)/input/$(FILE)
-REF_OUTPUT = $(DATASET_DIR)/$(TEST)/output/$(FILE).out
-RUNDIR = run/$(VERSION)/$(TEST)
-OUTPUT = $(RUNDIR)/$(FILE).out
-
-ARGS = -i $(INPUT) -o $(OUTPUT)
-TOOL = tools/compare-output
-
-include $(PARBOIL_ROOT)/common/mk/Makefile
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/Makefile b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/Makefile
deleted file mode 100755
index 255f7328c8..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=kmeans.o cluster.o getopt.o read_input.o kmeans_clustering.o rmse.o #compute_gold.o
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS= -O2
-APP_CXXFLAGS= -O2
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/README b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/README
deleted file mode 100755
index 05c5443196..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/README
+++ /dev/null
@@ -1,9 +0,0 @@
-NOTE: The current Kmeans implementation doesn't use texture/constant memories, and is different from the CUDA implementation.
-
-******Adjustable work group size*****
-RD_WG_SIZE_0 or RD_WG_SIZE_0_0 for kernel_swap
-RD_WG_SIZE_1 or RD_WG_SIZE_1_0 for kernel_kmeans 
-
-USAGE:
-make clean
-make KERNEL_DIM="-DRD_WG_SIZE_0=128 -DRD_WG_SIZE_1=512"
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/cluster.c b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/cluster.c
deleted file mode 100644
index 49c105b1e0..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/cluster.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*****************************************************************************/
-/*IMPORTANT:  READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.         */
-/*By downloading, copying, installing or using the software you agree        */
-/*to this license.  If you do not agree to this license, do not download,    */
-/*install, copy or use the software.                                         */
-/*                                                                           */
-/*                                                                           */
-/*Copyright (c) 2005 Northwestern University                                 */
-/*All rights reserved.                                                       */
-
-/*Redistribution of the software in source and binary forms,                 */
-/*with or without modification, is permitted provided that the               */
-/*following conditions are met:                                              */
-/*                                                                           */
-/*1       Redistributions of source code must retain the above copyright     */
-/*        notice, this list of conditions and the following disclaimer.      */
-/*                                                                           */
-/*2       Redistributions in binary form must reproduce the above copyright   */
-/*        notice, this list of conditions and the following disclaimer in the */
-/*        documentation and/or other materials provided with the distribution.*/
-/*                                                                            */
-/*3       Neither the name of Northwestern University nor the names of its    */
-/*        contributors may be used to endorse or promote products derived     */
-/*        from this software without specific prior written permission.       */
-/*                                                                            */
-/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS    */
-/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED      */
-/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND         */
-/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL          */
-/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,       */
-/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
-/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR          */
-/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)          */
-/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,         */
-/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN    */
-/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
-/*POSSIBILITY OF SUCH DAMAGE.                                                 */
-/******************************************************************************/
-
-/*************************************************************************/
-/**   File:         cluster.c                                           **/
-/**   Description:  Takes as input a file, containing 1 data point per  **/
-/**                 per line, and performs a fuzzy c-means clustering   **/
-/**                 on the data. Fuzzy clustering is performed using    **/
-/**                 min to max clusters and the clustering that gets    **/
-/**                 the best score according to a compactness and       **/
-/**                 separation criterion are returned.                  **/
-/**   Author:  Brendan McCane                                           **/
-/**            James Cook University of North Queensland.               **/
-/**            Australia. email: mccane@cs.jcu.edu.au                   **/
-/**                                                                     **/
-/**   Edited by: Jay Pisharath, Wei-keng Liao                           **/
-/**              Northwestern University.                               **/
-/**																		**/
-/**   ================================================================  **/
-/**																		**/
-/**   Edited by: Shuai Che, David Tarjan, Sang-Ha Lee					**/
-/**				 University of Virginia									**/
-/**																		**/
-/**   Description:	No longer supports fuzzy c-means clustering;	 	**/
-/**					only regular k-means clustering.					**/
-/**					No longer performs "validity" function to analyze	**/
-/**					compactness and separation crietria; instead		**/
-/**					calculate root mean squared error.					**/
-/**                                                                     **/
-/*************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <math.h>
-#include <float.h>
-
-#include "kmeans.h"
-
-float	min_rmse_ref = FLT_MAX;
-extern double wtime(void);
-/* reference min_rmse value */
-
-/*---< cluster() >-----------------------------------------------------------*/
-int cluster(int      npoints,				/* number of data points */
-            int      nfeatures,				/* number of attributes for each point */
-            float  **features,			/* array: [npoints][nfeatures] */
-            int      min_nclusters,			/* range of min to max number of clusters */
-            int		 max_nclusters,
-            float    threshold,				/* loop terminating factor */
-            int     *best_nclusters,		/* out: number between min and max with lowest RMSE */
-            float ***cluster_centres,		/* out: [best_nclusters][nfeatures] */
-            float	*min_rmse,				/* out: minimum RMSE */
-            int		 isRMSE,				/* calculate RMSE */
-            int		 nloops					/* number of iteration for each number of clusters */
-           )
-{
-    int		nclusters;						/* number of clusters k */
-    int		index =0;						/* number of iteration to reach the best RMSE */
-    int		rmse;							/* RMSE for each clustering */
-    int    *membership;						/* which cluster a data point belongs to */
-    float **tmp_cluster_centres;			/* hold coordinates of cluster centers */
-    int		i;
-
-    /* allocate memory for membership */
-    membership = (int*) malloc(npoints * sizeof(int));
-
-    /* sweep k from min to max_nclusters to find the best number of clusters */
-    for(nclusters = min_nclusters; nclusters <= max_nclusters; nclusters++)
-    {
-        if (nclusters > npoints) break;	/* cannot have more clusters than points */
-
-        /* allocate device memory, invert data array (@ kmeans_cuda.cu) */
-        allocate(npoints, nfeatures, nclusters, features);
-
-        /* iterate nloops times for each number of clusters */
-        for(i = 0; i < nloops; i++)
-        {
-            /* initialize initial cluster centers, CUDA calls (@ kmeans_cuda.cu) */
-            tmp_cluster_centres = kmeans_clustering(features,
-                                                    nfeatures,
-                                                    npoints,
-                                                    nclusters,
-                                                    threshold,
-                                                    membership);
-
-            if (*cluster_centres) {
-                free((*cluster_centres)[0]);
-                free(*cluster_centres);
-            }
-            *cluster_centres = tmp_cluster_centres;
-
-
-            /* find the number of clusters with the best RMSE */
-            if(isRMSE)
-            {
-                rmse = rms_err(features,
-                               nfeatures,
-                               npoints,
-                               tmp_cluster_centres,
-                               nclusters);
-
-                if(rmse < min_rmse_ref) {
-                    min_rmse_ref = rmse;			//update reference min RMSE
-                    *min_rmse = min_rmse_ref;		//update return min RMSE
-                    *best_nclusters = nclusters;	//update optimum number of clusters
-                    index = i;						//update number of iteration to reach best RMSE
-                }
-            }
-        }
-
-        deallocateMemory();							/* free device memory (@ kmeans_cuda.cu) */
-    }
-
-    free(membership);
-
-    return index;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/getopt.c b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/getopt.c
deleted file mode 100644
index 23262ae282..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/getopt.c
+++ /dev/null
@@ -1,1184 +0,0 @@
-/* Getopt for GNU.
-   NOTE: getopt is now part of the C library, so if you don't know what
-   "Keep this file name-space clean" means, talk to drepper@gnu.org
-   before changing it!
-   Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001
-        Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>.
-   Ditto for AIX 3.2 and <stdlib.h>.  */
-#ifndef _NO_PROTO
-# define _NO_PROTO
-#endif
-
-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-#if !defined __STDC__ || !__STDC__
-/* This is a separate conditional since some stdc systems
-   reject `defined (const)'.  */
-# ifndef const
-#  define const
-# endif
-#endif
-
-#include <stdio.h>
-
-/* Comment out all this code if we are using the GNU C Library, and are not
-   actually compiling the library itself.  This code is part of the GNU C
-   Library, but also included in many other GNU distributions.  Compiling
-   and linking in this code is a waste when using the GNU C library
-   (especially if it is a shared library).  Rather than having every GNU
-   program understand `configure --with-gnu-libc' and omit the object files,
-   it is simpler to just do this in the source for each such file.  */
-
-#define GETOPT_INTERFACE_VERSION 2
-#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2
-# include <gnu-versions.h>
-# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION
-#  define ELIDE_CODE
-# endif
-#endif
-
-#ifndef ELIDE_CODE
-
-
-/* This needs to come after some library #include
-   to get __GNU_LIBRARY__ defined.  */
-#ifdef  __GNU_LIBRARY__
-/* Don't include stdlib.h for non-GNU C libraries because some of them
-   contain conflicting prototypes for getopt.  */
-# include <stdlib.h>
-# include <unistd.h>
-#endif  /* GNU C library.  */
-
-#ifdef VMS
-# include <unixlib.h>
-# if HAVE_STRING_H - 0
-#  include <string.h>
-# endif
-#endif
-
-#ifndef _
-/* This is for other GNU distributions with internationalized messages.  */
-# if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
-#  include <libintl.h>
-#  ifndef _
-#   define _(msgid)     gettext (msgid)
-#  endif
-# else
-#  define _(msgid)      (msgid)
-# endif
-# if defined _LIBC && defined USE_IN_LIBIO
-#  include <wchar.h>
-# endif
-#endif
-
-/* This version of `getopt' appears to the caller like standard Unix `getopt'
-   but it behaves differently for the user, since it allows the user
-   to intersperse the options with the other arguments.
-
-   As `getopt' works, it permutes the elements of ARGV so that,
-   when it is done, all the options precede everything else.  Thus
-   all application programs are extended to handle flexible argument order.
-
-   Setting the environment variable POSIXLY_CORRECT disables permutation.
-   Then the behavior is completely standard.
-
-   GNU application programs can use a third alternative mode in which
-   they can distinguish the relative order of options and other arguments.  */
-
-#include "getopt.h"
-
-/* For communication from `getopt' to the caller.
-   When `getopt' finds an option that takes an argument,
-   the argument value is returned here.
-   Also, when `ordering' is RETURN_IN_ORDER,
-   each non-option ARGV-element is returned here.  */
-
-char *optarg;
-
-/* Index in ARGV of the next element to be scanned.
-   This is used for communication to and from the caller
-   and for communication between successive calls to `getopt'.
-
-   On entry to `getopt', zero means this is the first call; initialize.
-
-   When `getopt' returns -1, this is the index of the first of the
-   non-option elements that the caller should itself scan.
-
-   Otherwise, `optind' communicates from one call to the next
-   how much of ARGV has been scanned so far.  */
-
-/* 1003.2 says this must be 1 before any call.  */
-int optind = 1;
-
-/* Formerly, initialization of getopt depended on optind==0, which
-   causes problems with re-calling getopt as programs generally don't
-   know that. */
-
-int __getopt_initialized;
-
-/* The next char to be scanned in the option-element
-   in which the last option character we returned was found.
-   This allows us to pick up the scan where we left off.
-
-   If this is zero, or a null string, it means resume the scan
-   by advancing to the next ARGV-element.  */
-
-static char *nextchar;
-
-/* Callers store zero here to inhibit the error message
-   for unrecognized options.  */
-
-int opterr = 1;
-
-/* Set to an option character which was unrecognized.
-   This must be initialized on some systems to avoid linking in the
-   system's own getopt implementation.  */
-
-int optopt = '?';
-
-/* Describe how to deal with options that follow non-option ARGV-elements.
-
-   If the caller did not specify anything,
-   the default is REQUIRE_ORDER if the environment variable
-   POSIXLY_CORRECT is defined, PERMUTE otherwise.
-
-   REQUIRE_ORDER means don't recognize them as options;
-   stop option processing when the first non-option is seen.
-   This is what Unix does.
-   This mode of operation is selected by either setting the environment
-   variable POSIXLY_CORRECT, or using `+' as the first character
-   of the list of option characters.
-
-   PERMUTE is the default.  We permute the contents of ARGV as we scan,
-   so that eventually all the non-options are at the end.  This allows options
-   to be given in any order, even with programs that were not written to
-   expect this.
-
-   RETURN_IN_ORDER is an option available to programs that were written
-   to expect options and other ARGV-elements in any order and that care about
-   the ordering of the two.  We describe each non-option ARGV-element
-   as if it were the argument of an option with character code 1.
-   Using `-' as the first character of the list of option characters
-   selects this mode of operation.
-
-   The special argument `--' forces an end of option-scanning regardless
-   of the value of `ordering'.  In the case of RETURN_IN_ORDER, only
-   `--' can cause `getopt' to return -1 with `optind' != ARGC.  */
-
-static enum
-{
-    REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER
-} ordering;
-
-/* Value of POSIXLY_CORRECT environment variable.  */
-static char *posixly_correct;
-
-#ifdef  __GNU_LIBRARY__
-/* We want to avoid inclusion of string.h with non-GNU libraries
-   because there are many ways it can cause trouble.
-   On some systems, it contains special magic macros that don't work
-   in GCC.  */
-# include <string.h>
-# define my_index       strchr
-#else
-
-//# if HAVE_STRING_H || WIN32 /* Pete Wilson mod 7/28/02 */
-#  include <string.h>
-//# else
-//#  include <strings.h>
-//# endif
-
-/* Avoid depending on library functions or files
-   whose names are inconsistent.  */
-
-#ifndef getenv
-extern char *getenv ();
-#endif
-
-static char *
-my_index (str, chr)
-const char *str;
-int chr;
-{
-    while (*str)
-    {
-        if (*str == chr)
-            return (char *) str;
-        str++;
-    }
-    return 0;
-}
-
-/* If using GCC, we can safely declare strlen this way.
-   If not using GCC, it is ok not to declare it.  */
-#ifdef __GNUC__
-/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h.
-   That was relevant to code that was here before.  */
-# if (!defined __STDC__ || !__STDC__) && !defined strlen
-/* gcc with -traditional declares the built-in strlen to return int,
-   and has done so at least since version 2.4.5. -- rms.  */
-extern int strlen (const char *);
-# endif /* not __STDC__ */
-#endif /* __GNUC__ */
-
-#endif /* not __GNU_LIBRARY__ */
-
-/* Handle permutation of arguments.  */
-
-/* Describe the part of ARGV that contains non-options that have
-   been skipped.  `first_nonopt' is the index in ARGV of the first of them;
-   `last_nonopt' is the index after the last of them.  */
-
-static int first_nonopt;
-static int last_nonopt;
-
-#ifdef _LIBC
-/* Stored original parameters.
-   XXX This is no good solution.  We should rather copy the args so
-   that we can compare them later.  But we must not use malloc(3).  */
-extern int __libc_argc;
-extern char **__libc_argv;
-
-/* Bash 2.0 gives us an environment variable containing flags
-   indicating ARGV elements that should not be considered arguments.  */
-
-# ifdef USE_NONOPTION_FLAGS
-/* Defined in getopt_init.c  */
-extern char *__getopt_nonoption_flags;
-
-static int nonoption_flags_max_len;
-static int nonoption_flags_len;
-# endif
-
-# ifdef USE_NONOPTION_FLAGS
-#  define SWAP_FLAGS(ch1, ch2) \
-  if (nonoption_flags_len > 0)                                                \
-    {                                                                         \
-      char __tmp = __getopt_nonoption_flags[ch1];                             \
-      __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2];          \
-      __getopt_nonoption_flags[ch2] = __tmp;                                  \
-    }
-# else
-#  define SWAP_FLAGS(ch1, ch2)
-# endif
-#else   /* !_LIBC */
-# define SWAP_FLAGS(ch1, ch2)
-#endif  /* _LIBC */
-
-/* Exchange two adjacent subsequences of ARGV.
-   One subsequence is elements [first_nonopt,last_nonopt)
-   which contains all the non-options that have been skipped so far.
-   The other is elements [last_nonopt,optind), which contains all
-   the options processed since those non-options were skipped.
-
-   `first_nonopt' and `last_nonopt' are relocated so that they describe
-   the new indices of the non-options in ARGV after they are moved.  */
-
-#if defined __STDC__ && __STDC__
-static void exchange (char **);
-#endif
-
-static void
-exchange (argv)
-char **argv;
-{
-    int bottom = first_nonopt;
-    int middle = last_nonopt;
-    int top = optind;
-    char *tem;
-
-    /* Exchange the shorter segment with the far end of the longer segment.
-       That puts the shorter segment into the right place.
-       It leaves the longer segment in the right place overall,
-       but it consists of two parts that need to be swapped next.  */
-
-#if defined _LIBC && defined USE_NONOPTION_FLAGS
-    /* First make sure the handling of the `__getopt_nonoption_flags'
-       string can work normally.  Our top argument must be in the range
-       of the string.  */
-    if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len)
-    {
-        /* We must extend the array.  The user plays games with us and
-           presents new arguments.  */
-        char *new_str = malloc (top + 1);
-        if (new_str == NULL)
-            nonoption_flags_len = nonoption_flags_max_len = 0;
-        else
-        {
-            memset (__mempcpy (new_str, __getopt_nonoption_flags,
-                               nonoption_flags_max_len),
-                    '\0', top + 1 - nonoption_flags_max_len);
-            nonoption_flags_max_len = top + 1;
-            __getopt_nonoption_flags = new_str;
-        }
-    }
-#endif
-
-    while (top > middle && middle > bottom)
-    {
-        if (top - middle > middle - bottom)
-        {
-            /* Bottom segment is the short one.  */
-            int len = middle - bottom;
-            register int i;
-
-            /* Swap it with the top part of the top segment.  */
-            for (i = 0; i < len; i++)
-            {
-                tem = argv[bottom + i];
-                argv[bottom + i] = argv[top - (middle - bottom) + i];
-                argv[top - (middle - bottom) + i] = tem;
-                SWAP_FLAGS (bottom + i, top - (middle - bottom) + i);
-            }
-            /* Exclude the moved bottom segment from further swapping.  */
-            top -= len;
-        }
-        else
-        {
-            /* Top segment is the short one.  */
-            int len = top - middle;
-            register int i;
-
-            /* Swap it with the bottom part of the bottom segment.  */
-            for (i = 0; i < len; i++)
-            {
-                tem = argv[bottom + i];
-                argv[bottom + i] = argv[middle + i];
-                argv[middle + i] = tem;
-                SWAP_FLAGS (bottom + i, middle + i);
-            }
-            /* Exclude the moved top segment from further swapping.  */
-            bottom += len;
-        }
-    }
-
-    /* Update records for the slots the non-options now occupy.  */
-
-    first_nonopt += (optind - last_nonopt);
-    last_nonopt = optind;
-}
-
-/* Initialize the internal data when the first call is made.  */
-
-#if defined __STDC__ && __STDC__
-static const char *_getopt_initialize (int, char *const *, const char *);
-#endif
-static const char *
-_getopt_initialize (argc, argv, optstring)
-int argc;
-char *const *argv;
-const char *optstring;
-{
-    /* Start processing options with ARGV-element 1 (since ARGV-element 0
-       is the program name); the sequence of previously skipped
-       non-option ARGV-elements is empty.  */
-
-    first_nonopt = last_nonopt = optind;
-
-    nextchar = NULL;
-
-    posixly_correct = getenv ("POSIXLY_CORRECT");
-
-    /* Determine how to handle the ordering of options and nonoptions.  */
-
-    if (optstring[0] == '-')
-    {
-        ordering = RETURN_IN_ORDER;
-        ++optstring;
-    }
-    else if (optstring[0] == '+')
-    {
-        ordering = REQUIRE_ORDER;
-        ++optstring;
-    }
-    else if (posixly_correct != NULL)
-        ordering = REQUIRE_ORDER;
-    else
-        ordering = PERMUTE;
-
-#if defined _LIBC && defined USE_NONOPTION_FLAGS
-    if (posixly_correct == NULL
-            && argc == __libc_argc && argv == __libc_argv)
-    {
-        if (nonoption_flags_max_len == 0)
-        {
-            if (__getopt_nonoption_flags == NULL
-                    || __getopt_nonoption_flags[0] == '\0')
-                nonoption_flags_max_len = -1;
-            else
-            {
-                const char *orig_str = __getopt_nonoption_flags;
-                int len = nonoption_flags_max_len = strlen (orig_str);
-                if (nonoption_flags_max_len < argc)
-                    nonoption_flags_max_len = argc;
-                __getopt_nonoption_flags =
-                    (char *) malloc (nonoption_flags_max_len);
-                if (__getopt_nonoption_flags == NULL)
-                    nonoption_flags_max_len = -1;
-                else
-                    memset (__mempcpy (__getopt_nonoption_flags, orig_str, len),
-                            '\0', nonoption_flags_max_len - len);
-            }
-        }
-        nonoption_flags_len = nonoption_flags_max_len;
-    }
-    else
-        nonoption_flags_len = 0;
-#endif
-
-    return optstring;
-}
-
-/* Scan elements of ARGV (whose length is ARGC) for option characters
-   given in OPTSTRING.
-
-   If an element of ARGV starts with '-', and is not exactly "-" or "--",
-   then it is an option element.  The characters of this element
-   (aside from the initial '-') are option characters.  If `getopt'
-   is called repeatedly, it returns successively each of the option characters
-   from each of the option elements.
-
-   If `getopt' finds another option character, it returns that character,
-   updating `optind' and `nextchar' so that the next call to `getopt' can
-   resume the scan with the following option character or ARGV-element.
-
-   If there are no more option characters, `getopt' returns -1.
-   Then `optind' is the index in ARGV of the first ARGV-element
-   that is not an option.  (The ARGV-elements have been permuted
-   so that those that are not options now come last.)
-
-   OPTSTRING is a string containing the legitimate option characters.
-   If an option character is seen that is not listed in OPTSTRING,
-   return '?' after printing an error message.  If you set `opterr' to
-   zero, the error message is suppressed but we still return '?'.
-
-   If a char in OPTSTRING is followed by a colon, that means it wants an arg,
-   so the following text in the same ARGV-element, or the text of the following
-   ARGV-element, is returned in `optarg'.  Two colons mean an option that
-   wants an optional arg; if there is text in the current ARGV-element,
-   it is returned in `optarg', otherwise `optarg' is set to zero.
-
-   If OPTSTRING starts with `-' or `+', it requests different methods of
-   handling the non-option ARGV-elements.
-   See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above.
-
-   Long-named options begin with `--' instead of `-'.
-   Their names may be abbreviated as long as the abbreviation is unique
-   or is an exact match for some defined option.  If they have an
-   argument, it follows the option name in the same ARGV-element, separated
-   from the option name by a `=', or else the in next ARGV-element.
-   When `getopt' finds a long-named option, it returns 0 if that option's
-   `flag' field is nonzero, the value of the option's `val' field
-   if the `flag' field is zero.
-
-   The elements of ARGV aren't really const, because we permute them.
-   But we pretend they're const in the prototype to be compatible
-   with other systems.
-
-   LONGOPTS is a vector of `struct option' terminated by an
-   element containing a name which is zero.
-
-   LONGIND returns the index in LONGOPT of the long-named option found.
-   It is only valid when a long-named option has been found by the most
-   recent call.
-
-   If LONG_ONLY is nonzero, '-' as well as '--' can introduce
-   long-named options.  */
-
-int
-_getopt_internal (argc, argv, optstring, longopts, longind, long_only)
-int argc;
-char *const *argv;
-const char *optstring;
-const struct option *longopts;
-int *longind;
-int long_only;
-{
-    int print_errors = opterr;
-    if (optstring[0] == ':')
-        print_errors = 0;
-
-    if (argc < 1)
-        return -1;
-
-    optarg = NULL;
-
-    if (optind == 0 || !__getopt_initialized)
-    {
-        if (optind == 0)
-            optind = 1;     /* Don't scan ARGV[0], the program name.  */
-        optstring = _getopt_initialize (argc, argv, optstring);
-        __getopt_initialized = 1;
-    }
-
-    /* Test whether ARGV[optind] points to a non-option argument.
-       Either it does not have option syntax, or there is an environment flag
-       from the shell indicating it is not an option.  The later information
-       is only used when the used in the GNU libc.  */
-#if defined _LIBC && defined USE_NONOPTION_FLAGS
-# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0'       \
-                      || (optind < nonoption_flags_len                        \
-                          && __getopt_nonoption_flags[optind] == '1'))
-#else
-# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0')
-#endif
-
-    if (nextchar == NULL || *nextchar == '\0')
-    {
-        /* Advance to the next ARGV-element.  */
-
-        /* Give FIRST_NONOPT and LAST_NONOPT rational values if OPTIND has been
-           moved back by the user (who may also have changed the arguments).  */
-        if (last_nonopt > optind)
-            last_nonopt = optind;
-        if (first_nonopt > optind)
-            first_nonopt = optind;
-
-        if (ordering == PERMUTE)
-        {
-            /* If we have just processed some options following some non-options,
-               exchange them so that the options come first.  */
-
-            if (first_nonopt != last_nonopt && last_nonopt != optind)
-                exchange ((char **) argv);
-            else if (last_nonopt != optind)
-                first_nonopt = optind;
-
-            /* Skip any additional non-options
-               and extend the range of non-options previously skipped.  */
-
-            while (optind < argc && NONOPTION_P)
-                optind++;
-            last_nonopt = optind;
-        }
-
-        /* The special ARGV-element `--' means premature end of options.
-           Skip it like a null option,
-           then exchange with previous non-options as if it were an option,
-           then skip everything else like a non-option.  */
-
-        if (optind != argc && !strcmp (argv[optind], "--"))
-        {
-            optind++;
-
-            if (first_nonopt != last_nonopt && last_nonopt != optind)
-                exchange ((char **) argv);
-            else if (first_nonopt == last_nonopt)
-                first_nonopt = optind;
-            last_nonopt = argc;
-
-            optind = argc;
-        }
-
-        /* If we have done all the ARGV-elements, stop the scan
-           and back over any non-options that we skipped and permuted.  */
-
-        if (optind == argc)
-        {
-            /* Set the next-arg-index to point at the non-options
-               that we previously skipped, so the caller will digest them.  */
-            if (first_nonopt != last_nonopt)
-                optind = first_nonopt;
-            return -1;
-        }
-
-        /* If we have come to a non-option and did not permute it,
-           either stop the scan or describe it to the caller and pass it by.  */
-
-        if (NONOPTION_P)
-        {
-            if (ordering == REQUIRE_ORDER)
-                return -1;
-            optarg = argv[optind++];
-            return 1;
-        }
-
-        /* We have found another option-ARGV-element.
-           Skip the initial punctuation.  */
-
-        nextchar = (argv[optind] + 1
-                    + (longopts != NULL && argv[optind][1] == '-'));
-    }
-
-    /* Decode the current option-ARGV-element.  */
-
-    /* Check whether the ARGV-element is a long option.
-
-       If long_only and the ARGV-element has the form "-f", where f is
-       a valid short option, don't consider it an abbreviated form of
-       a long option that starts with f.  Otherwise there would be no
-       way to give the -f short option.
-
-       On the other hand, if there's a long option "fubar" and
-       the ARGV-element is "-fu", do consider that an abbreviation of
-       the long option, just like "--fu", and not "-f" with arg "u".
-
-       This distinction seems to be the most useful approach.  */
-
-    if (longopts != NULL
-            && (argv[optind][1] == '-'
-                || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1])))))
-    {
-        char *nameend;
-        const struct option *p;
-        const struct option *pfound = NULL;
-        int exact = 0;
-        int ambig = 0;
-        int indfound = -1;
-        int option_index;
-
-        for (nameend = nextchar; *nameend && *nameend != '='; nameend++)
-            /* Do nothing.  */ ;
-
-        /* Test all long options for either exact match
-           or abbreviated matches.  */
-        for (p = longopts, option_index = 0; p->name; p++, option_index++)
-            if (!strncmp (p->name, nextchar, nameend - nextchar))
-            {
-                if ((unsigned int) (nameend - nextchar)
-                        == (unsigned int) strlen (p->name))
-                {
-                    /* Exact match found.  */
-                    pfound = p;
-                    indfound = option_index;
-                    exact = 1;
-                    break;
-                }
-                else if (pfound == NULL)
-                {
-                    /* First nonexact match found.  */
-                    pfound = p;
-                    indfound = option_index;
-                }
-                else if (long_only
-                         || pfound->has_arg != p->has_arg
-                         || pfound->flag != p->flag
-                         || pfound->val != p->val)
-                    /* Second or later nonexact match found.  */
-                    ambig = 1;
-            }
-
-        if (ambig && !exact)
-        {
-            if (print_errors)
-            {
-#if defined _LIBC && defined USE_IN_LIBIO
-                char *buf;
-
-                __asprintf (&buf, _("%s: option `%s' is ambiguous\n"),
-                            argv[0], argv[optind]);
-
-                if (_IO_fwide (stderr, 0) > 0)
-                    __fwprintf (stderr, L"%s", buf);
-                else
-                    fputs (buf, stderr);
-
-                free (buf);
-#else
-                fprintf (stderr, _("%s: option `%s' is ambiguous\n"),
-                         argv[0], argv[optind]);
-#endif
-            }
-            nextchar += strlen (nextchar);
-            optind++;
-            optopt = 0;
-            return '?';
-        }
-
-        if (pfound != NULL)
-        {
-            option_index = indfound;
-            optind++;
-            if (*nameend)
-            {
-                /* Don't test has_arg with >, because some C compilers don't
-                   allow it to be used on enums.  */
-                if (pfound->has_arg)
-                    optarg = nameend + 1;
-                else
-                {
-                    if (print_errors)
-                    {
-#if defined _LIBC && defined USE_IN_LIBIO
-                        char *buf;
-#endif
-
-                        if (argv[optind - 1][1] == '-')
-                        {
-                            /* --option */
-#if defined _LIBC && defined USE_IN_LIBIO
-                            __asprintf (&buf, _("\
-%s: option `--%s' doesn't allow an argument\n"),
-                                        argv[0], pfound->name);
-#else
-                            fprintf (stderr, _("\
-%s: option `--%s' doesn't allow an argument\n"),
-                                     argv[0], pfound->name);
-#endif
-                        }
-                        else
-                        {
-                            /* +option or -option */
-#if defined _LIBC && defined USE_IN_LIBIO
-                            __asprintf (&buf, _("\
-%s: option `%c%s' doesn't allow an argument\n"),
-                                        argv[0], argv[optind - 1][0],
-                                        pfound->name);
-#else
-                            fprintf (stderr, _("\
-%s: option `%c%s' doesn't allow an argument\n"),
-                                     argv[0], argv[optind - 1][0], pfound->name);
-#endif
-                        }
-
-#if defined _LIBC && defined USE_IN_LIBIO
-                        if (_IO_fwide (stderr, 0) > 0)
-                            __fwprintf (stderr, L"%s", buf);
-                        else
-                            fputs (buf, stderr);
-
-                        free (buf);
-#endif
-                    }
-
-                    nextchar += strlen (nextchar);
-
-                    optopt = pfound->val;
-                    return '?';
-                }
-            }
-            else if (pfound->has_arg == 1)
-            {
-                if (optind < argc)
-                    optarg = argv[optind++];
-                else
-                {
-                    if (print_errors)
-                    {
-#if defined _LIBC && defined USE_IN_LIBIO
-                        char *buf;
-
-                        __asprintf (&buf,
-                                    _("%s: option `%s' requires an argument\n"),
-                                    argv[0], argv[optind - 1]);
-
-                        if (_IO_fwide (stderr, 0) > 0)
-                            __fwprintf (stderr, L"%s", buf);
-                        else
-                            fputs (buf, stderr);
-
-                        free (buf);
-#else
-                        fprintf (stderr,
-                                 _("%s: option `%s' requires an argument\n"),
-                                 argv[0], argv[optind - 1]);
-#endif
-                    }
-                    nextchar += strlen (nextchar);
-                    optopt = pfound->val;
-                    return optstring[0] == ':' ? ':' : '?';
-                }
-            }
-            nextchar += strlen (nextchar);
-            if (longind != NULL)
-                *longind = option_index;
-            if (pfound->flag)
-            {
-                *(pfound->flag) = pfound->val;
-                return 0;
-            }
-            return pfound->val;
-        }
-
-        /* Can't find it as a long option.  If this is not getopt_long_only,
-           or the option starts with '--' or is not a valid short
-           option, then it's an error.
-           Otherwise interpret it as a short option.  */
-        if (!long_only || argv[optind][1] == '-'
-                || my_index (optstring, *nextchar) == NULL)
-        {
-            if (print_errors)
-            {
-#if defined _LIBC && defined USE_IN_LIBIO
-                char *buf;
-#endif
-
-                if (argv[optind][1] == '-')
-                {
-                    /* --option */
-#if defined _LIBC && defined USE_IN_LIBIO
-                    __asprintf (&buf, _("%s: unrecognized option `--%s'\n"),
-                                argv[0], nextchar);
-#else
-                    fprintf (stderr, _("%s: unrecognized option `--%s'\n"),
-                             argv[0], nextchar);
-#endif
-                }
-                else
-                {
-                    /* +option or -option */
-#if defined _LIBC && defined USE_IN_LIBIO
-                    __asprintf (&buf, _("%s: unrecognized option `%c%s'\n"),
-                                argv[0], argv[optind][0], nextchar);
-#else
-                    fprintf (stderr, _("%s: unrecognized option `%c%s'\n"),
-                             argv[0], argv[optind][0], nextchar);
-#endif
-                }
-
-#if defined _LIBC && defined USE_IN_LIBIO
-                if (_IO_fwide (stderr, 0) > 0)
-                    __fwprintf (stderr, L"%s", buf);
-                else
-                    fputs (buf, stderr);
-
-                free (buf);
-#endif
-            }
-            nextchar = (char *) "";
-            optind++;
-            optopt = 0;
-            return '?';
-        }
-    }
-
-    /* Look at and handle the next short option-character.  */
-
-    {
-        char c = *nextchar++;
-        char *temp = my_index (optstring, c);
-
-        /* Increment `optind' when we start to process its last character.  */
-        if (*nextchar == '\0')
-            ++optind;
-
-        if (temp == NULL || c == ':')
-        {
-            if (print_errors)
-            {
-#if defined _LIBC && defined USE_IN_LIBIO
-                char *buf;
-#endif
-
-                if (posixly_correct)
-                {
-                    /* 1003.2 specifies the format of this message.  */
-#if defined _LIBC && defined USE_IN_LIBIO
-                    __asprintf (&buf, _("%s: illegal option -- %c\n"),
-                                argv[0], c);
-#else
-                    fprintf (stderr, _("%s: illegal option -- %c\n"), argv[0], c);
-#endif
-                }
-                else
-                {
-#if defined _LIBC && defined USE_IN_LIBIO
-                    __asprintf (&buf, _("%s: invalid option -- %c\n"),
-                                argv[0], c);
-#else
-                    fprintf (stderr, _("%s: invalid option -- %c\n"), argv[0], c);
-#endif
-                }
-
-#if defined _LIBC && defined USE_IN_LIBIO
-                if (_IO_fwide (stderr, 0) > 0)
-                    __fwprintf (stderr, L"%s", buf);
-                else
-                    fputs (buf, stderr);
-
-                free (buf);
-#endif
-            }
-            optopt = c;
-            return '?';
-        }
-        /* Convenience. Treat POSIX -W foo same as long option --foo */
-        if (temp[0] == 'W' && temp[1] == ';')
-        {
-            char *nameend;
-            const struct option *p;
-            const struct option *pfound = NULL;
-            int exact = 0;
-            int ambig = 0;
-            int indfound = 0;
-            int option_index;
-
-            /* This is an option that requires an argument.  */
-            if (*nextchar != '\0')
-            {
-                optarg = nextchar;
-                /* If we end this ARGV-element by taking the rest as an arg,
-                   we must advance to the next element now.  */
-                optind++;
-            }
-            else if (optind == argc)
-            {
-                if (print_errors)
-                {
-                    /* 1003.2 specifies the format of this message.  */
-#if defined _LIBC && defined USE_IN_LIBIO
-                    char *buf;
-
-                    __asprintf (&buf, _("%s: option requires an argument -- %c\n"),
-                                argv[0], c);
-
-                    if (_IO_fwide (stderr, 0) > 0)
-                        __fwprintf (stderr, L"%s", buf);
-                    else
-                        fputs (buf, stderr);
-
-                    free (buf);
-#else
-                    fprintf (stderr, _("%s: option requires an argument -- %c\n"),
-                             argv[0], c);
-#endif
-                }
-                optopt = c;
-                if (optstring[0] == ':')
-                    c = ':';
-                else
-                    c = '?';
-                return c;
-            }
-            else
-                /* We already incremented `optind' once;
-                   increment it again when taking next ARGV-elt as argument.  */
-                optarg = argv[optind++];
-
-            /* optarg is now the argument, see if it's in the
-               table of longopts.  */
-
-            for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++)
-                /* Do nothing.  */ ;
-
-            /* Test all long options for either exact match
-               or abbreviated matches.  */
-            for (p = longopts, option_index = 0; p->name; p++, option_index++)
-                if (!strncmp (p->name, nextchar, nameend - nextchar))
-                {
-                    if ((unsigned int) (nameend - nextchar) == strlen (p->name))
-                    {
-                        /* Exact match found.  */
-                        pfound = p;
-                        indfound = option_index;
-                        exact = 1;
-                        break;
-                    }
-                    else if (pfound == NULL)
-                    {
-                        /* First nonexact match found.  */
-                        pfound = p;
-                        indfound = option_index;
-                    }
-                    else
-                        /* Second or later nonexact match found.  */
-                        ambig = 1;
-                }
-            if (ambig && !exact)
-            {
-                if (print_errors)
-                {
-#if defined _LIBC && defined USE_IN_LIBIO
-                    char *buf;
-
-                    __asprintf (&buf, _("%s: option `-W %s' is ambiguous\n"),
-                                argv[0], argv[optind]);
-
-                    if (_IO_fwide (stderr, 0) > 0)
-                        __fwprintf (stderr, L"%s", buf);
-                    else
-                        fputs (buf, stderr);
-
-                    free (buf);
-#else
-                    fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"),
-                             argv[0], argv[optind]);
-#endif
-                }
-                nextchar += strlen (nextchar);
-                optind++;
-                return '?';
-            }
-            if (pfound != NULL)
-            {
-                option_index = indfound;
-                if (*nameend)
-                {
-                    /* Don't test has_arg with >, because some C compilers don't
-                       allow it to be used on enums.  */
-                    if (pfound->has_arg)
-                        optarg = nameend + 1;
-                    else
-                    {
-                        if (print_errors)
-                        {
-#if defined _LIBC && defined USE_IN_LIBIO
-                            char *buf;
-
-                            __asprintf (&buf, _("\
-%s: option `-W %s' doesn't allow an argument\n"),
-                                        argv[0], pfound->name);
-
-                            if (_IO_fwide (stderr, 0) > 0)
-                                __fwprintf (stderr, L"%s", buf);
-                            else
-                                fputs (buf, stderr);
-
-                            free (buf);
-#else
-                            fprintf (stderr, _("\
-%s: option `-W %s' doesn't allow an argument\n"),
-                                     argv[0], pfound->name);
-#endif
-                        }
-
-                        nextchar += strlen (nextchar);
-                        return '?';
-                    }
-                }
-                else if (pfound->has_arg == 1)
-                {
-                    if (optind < argc)
-                        optarg = argv[optind++];
-                    else
-                    {
-                        if (print_errors)
-                        {
-#if defined _LIBC && defined USE_IN_LIBIO
-                            char *buf;
-
-                            __asprintf (&buf, _("\
-%s: option `%s' requires an argument\n"),
-                                        argv[0], argv[optind - 1]);
-
-                            if (_IO_fwide (stderr, 0) > 0)
-                                __fwprintf (stderr, L"%s", buf);
-                            else
-                                fputs (buf, stderr);
-
-                            free (buf);
-#else
-                            fprintf (stderr,
-                                     _("%s: option `%s' requires an argument\n"),
-                                     argv[0], argv[optind - 1]);
-#endif
-                        }
-                        nextchar += strlen (nextchar);
-                        return optstring[0] == ':' ? ':' : '?';
-                    }
-                }
-                nextchar += strlen (nextchar);
-                if (longind != NULL)
-                    *longind = option_index;
-                if (pfound->flag)
-                {
-                    *(pfound->flag) = pfound->val;
-                    return 0;
-                }
-                return pfound->val;
-            }
-            nextchar = NULL;
-            return 'W';   /* Let the application handle it.   */
-        }
-        if (temp[1] == ':')
-        {
-            if (temp[2] == ':')
-            {
-                /* This is an option that accepts an argument optionally.  */
-                if (*nextchar != '\0')
-                {
-                    optarg = nextchar;
-                    optind++;
-                }
-                else
-                    optarg = NULL;
-                nextchar = NULL;
-            }
-            else
-            {
-                /* This is an option that requires an argument.  */
-                if (*nextchar != '\0')
-                {
-                    optarg = nextchar;
-                    /* If we end this ARGV-element by taking the rest as an arg,
-                       we must advance to the next element now.  */
-                    optind++;
-                }
-                else if (optind == argc)
-                {
-                    if (print_errors)
-                    {
-                        /* 1003.2 specifies the format of this message.  */
-#if defined _LIBC && defined USE_IN_LIBIO
-                        char *buf;
-
-                        __asprintf (&buf,
-                                    _("%s: option requires an argument -- %c\n"),
-                                    argv[0], c);
-
-                        if (_IO_fwide (stderr, 0) > 0)
-                            __fwprintf (stderr, L"%s", buf);
-                        else
-                            fputs (buf, stderr);
-
-                        free (buf);
-#else
-                        fprintf (stderr,
-                                 _("%s: option requires an argument -- %c\n"),
-                                 argv[0], c);
-#endif
-                    }
-                    optopt = c;
-                    if (optstring[0] == ':')
-                        c = ':';
-                    else
-                        c = '?';
-                }
-                else
-                    /* We already incremented `optind' once;
-                       increment it again when taking next ARGV-elt as argument.  */
-                    optarg = argv[optind++];
-                nextchar = NULL;
-            }
-        }
-        return c;
-    }
-}
-
-int
-getopt (argc, argv, optstring)
-int argc;
-char *const *argv;
-const char *optstring;
-{
-    return _getopt_internal (argc, argv, optstring,
-                             (const struct option *) 0,
-                             (int *) 0,
-                             0);
-}
-
-#endif  /* Not ELIDE_CODE.  */
-
-
-/* Compile with -DTEST to make an executable for use in testing
-   the above definition of `getopt'.  */
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/getopt.h b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/getopt.h
deleted file mode 100644
index 943432ce30..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/getopt.h
+++ /dev/null
@@ -1,191 +0,0 @@
-
-
-/* getopt.h */
-/* Declarations for getopt.
-   Copyright (C) 1989-1994, 1996-1999, 2001 Free Software
-   Foundation, Inc. This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute
-   it and/or modify it under the terms of the GNU Lesser
-   General Public License as published by the Free Software
-   Foundation; either version 2.1 of the License, or
-   (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will
-   be useful, but WITHOUT ANY WARRANTY; without even the
-   implied warranty of MERCHANTABILITY or FITNESS FOR A
-   PARTICULAR PURPOSE.  See the GNU Lesser General Public
-   License for more details.
-
-   You should have received a copy of the GNU Lesser General
-   Public License along with the GNU C Library; if not, write
-   to the Free Software Foundation, Inc., 59 Temple Place,
-   Suite 330, Boston, MA 02111-1307 USA.  */
-
-
-
-
-
-#ifndef _GETOPT_H
-
-#ifndef __need_getopt
-# define _GETOPT_H 1
-#endif
-
-/* If __GNU_LIBRARY__ is not already defined, either we are being used
-   standalone, or this is the first header included in the source file.
-   If we are being used with glibc, we need to include <features.h>, but
-   that does not exist if we are standalone.  So: if __GNU_LIBRARY__ is
-   not defined, include <ctype.h>, which will pull in <features.h> for us
-   if it's from glibc.  (Why ctype.h?  It's guaranteed to exist and it
-   doesn't flood the namespace with stuff the way some other headers do.)  */
-#if !defined __GNU_LIBRARY__
-# include <ctype.h>
-#endif
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-/* For communication from `getopt' to the caller.
-   When `getopt' finds an option that takes an argument,
-   the argument value is returned here.
-   Also, when `ordering' is RETURN_IN_ORDER,
-   each non-option ARGV-element is returned here.  */
-
-extern char *optarg;
-
-/* Index in ARGV of the next element to be scanned.
-   This is used for communication to and from the caller
-   and for communication between successive calls to `getopt'.
-
-   On entry to `getopt', zero means this is the first call; initialize.
-
-   When `getopt' returns -1, this is the index of the first of the
-   non-option elements that the caller should itself scan.
-
-   Otherwise, `optind' communicates from one call to the next
-   how much of ARGV has been scanned so far.  */
-
-extern int optind;
-
-/* Callers store zero here to inhibit the error message `getopt' prints
-   for unrecognized options.  */
-
-extern int opterr;
-
-/* Set to an option character which was unrecognized.  */
-
-extern int optopt;
-
-#ifndef __need_getopt
-/* Describe the long-named options requested by the application.
-   The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
-   of `struct option' terminated by an element containing a name which is
-   zero.
-
-   The field `has_arg' is:
-   no_argument          (or 0) if the option does not take an argument,
-   required_argument    (or 1) if the option requires an argument,
-   optional_argument    (or 2) if the option takes an optional argument.
-
-   If the field `flag' is not NULL, it points to a variable that is set
-   to the value given in the field `val' when the option is found, but
-   left unchanged if the option is not found.
-
-   To have a long-named option do something other than set an `int' to
-   a compiled-in constant, such as set a value from `optarg', set the
-   option's `flag' field to zero and its `val' field to a nonzero
-   value (the equivalent single-letter option character, if there is
-   one).  For long options that have a zero `flag' field, `getopt'
-   returns the contents of the `val' field.  */
-
-struct option
-{
-# if (defined __STDC__ && __STDC__) || defined __cplusplus
-    const char *name;
-# else
-    char *name;
-# endif
-    /* has_arg can't be an enum because some compilers complain about
-       type mismatches in all the code that assumes it is an int.  */
-    int has_arg;
-    int *flag;
-    int val;
-};
-
-/* Names for the values of the `has_arg' field of `struct option'.  */
-
-# define no_argument            0
-# define required_argument      1
-# define optional_argument      2
-#endif  /* need getopt */
-
-
-/* Get definitions and prototypes for functions to process the
-   arguments in ARGV (ARGC of them, minus the program name) for
-   options given in OPTS.
-
-   Return the option character from OPTS just read.  Return -1 when
-   there are no more options.  For unrecognized options, or options
-   missing arguments, `optopt' is set to the option letter, and '?' is
-   returned.
-
-   The OPTS string is a list of characters which are recognized option
-   letters, optionally followed by colons, specifying that that letter
-   takes an argument, to be placed in `optarg'.
-
-   If a letter in OPTS is followed by two colons, its argument is
-   optional.  This behavior is specific to the GNU `getopt'.
-
-   The argument `--' causes premature termination of argument
-   scanning, explicitly telling `getopt' that there are no more
-   options.
-
-   If OPTS begins with `--', then non-option arguments are treated as
-   arguments to the option '\0'.  This behavior is specific to the GNU
-   `getopt'.  */
-
-#if (defined __STDC__ && __STDC__) || defined __cplusplus
-# ifdef __GNU_LIBRARY__
-/* Many other libraries have conflicting prototypes for getopt, with
-   differences in the consts, in stdlib.h.  To avoid compilation
-   errors, only prototype getopt for the GNU C library.  */
-extern int getopt (int ___argc, char *const *___argv, const char *__shortopts);
-# else /* not __GNU_LIBRARY__ */
-extern int getopt ();
-# endif /* __GNU_LIBRARY__ */
-
-# ifndef __need_getopt
-extern int getopt_long (int ___argc, char *const *___argv,
-                        const char *__shortopts,
-                        const struct option *__longopts, int *__longind);
-extern int getopt_long_only (int ___argc, char *const *___argv,
-                             const char *__shortopts,
-                             const struct option *__longopts, int *__longind);
-
-/* Internal only.  Users should not call this directly.  */
-extern int _getopt_internal (int ___argc, char *const *___argv,
-                             const char *__shortopts,
-                             const struct option *__longopts, int *__longind,
-                             int __long_only);
-# endif
-#else /* not __STDC__ */
-extern int getopt ();
-# ifndef __need_getopt
-extern int getopt_long ();
-extern int getopt_long_only ();
-
-extern int _getopt_internal ();
-# endif
-#endif /* __STDC__ */
-
-#ifdef  __cplusplus
-}
-#endif
-
-/* Make sure we later can get all the definitions and declarations.  */
-#undef __need_getopt
-
-#endif /* getopt.h */
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.cl b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.cl
deleted file mode 100644
index 88dd2c2e21..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.cl
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef FLT_MAX
-#define FLT_MAX 3.40282347e+38
-#endif
-
-__kernel void
-kmeans_kernel_c(__global float  *feature,
-                __global float  *clusters,
-                __global int    *membership,
-                int     npoints,
-                int     nclusters,
-                int     nfeatures,
-                int		offset,
-                int		size
-               )
-{
-    unsigned int point_id = get_global_id(0);
-    int index = 0;
-    //const unsigned int point_id = get_global_id(0);
-    if (point_id < npoints)
-    {
-        float min_dist=FLT_MAX;
-        for (int i=0; i < nclusters; i++) {
-
-            float dist = 0;
-            float ans  = 0;
-            for (int l=0; l<nfeatures; l++) {
-                ans += (feature[l * npoints + point_id]-clusters[i*nfeatures+l])*
-                       (feature[l * npoints + point_id]-clusters[i*nfeatures+l]);
-            }
-
-            dist = ans;
-            if (dist < min_dist) {
-                min_dist = dist;
-                index    = i;
-
-            }
-        }
-        //printf("%d\n", index);
-        membership[point_id] = index;
-    }
-
-    return;
-}
-
-__kernel void
-kmeans_swap(__global float  *feature,
-            __global float  *feature_swap,
-            int     npoints,
-            int     nfeatures
-           ) {
-
-    unsigned int tid = get_global_id(0);
-    for(int i = 0; i <  nfeatures; i++)
-        feature_swap[i * npoints + tid] = feature[tid * nfeatures + i];
-
-}
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.cpp b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.cpp
deleted file mode 100644
index bf23045894..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.cpp
+++ /dev/null
@@ -1,357 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include <iostream>
-#include <string>
-#include <parboil.h>
-#include "kmeans.h"
-
-#ifdef WIN
-#include <windows.h>
-#else
-#include <pthread.h>
-#include <sys/time.h>
-double gettime() {
-    struct timeval t;
-    gettimeofday(&t,NULL);
-    return t.tv_sec+t.tv_usec*1e-6;
-}
-#endif
-
-
-#ifdef NV
-#include <oclUtils.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#ifndef FLT_MAX
-#define FLT_MAX 3.40282347e+38
-#endif
-
-#ifdef RD_WG_SIZE_0_0
-#define BLOCK_SIZE RD_WG_SIZE_0_0
-#elif defined(RD_WG_SIZE_0)
-#define BLOCK_SIZE RD_WG_SIZE_0
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE RD_WG_SIZE
-#else
-#define BLOCK_SIZE 256
-#endif
-
-#ifdef RD_WG_SIZE_1_0
-#define BLOCK_SIZE2 RD_WG_SIZE_1_0
-#elif defined(RD_WG_SIZE_1)
-#define BLOCK_SIZE2 RD_WG_SIZE_1
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE2 RD_WG_SIZE
-#else
-#define BLOCK_SIZE2 256
-#endif
-
-
-
-// local variables
-static cl_context	    context;
-static cl_command_queue cmd_queue;
-static cl_device_type   device_type;
-static cl_device_id   * device_list;
-static cl_int           num_devices;
-
-static struct pb_TimerSet timers;
-
-static int initialize(int use_gpu)
-{
-    cl_int result;
-    size_t size;
-
-    // create OpenCL context
-    cl_platform_id platform_id;
-    if (clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS) {
-        printf("ERROR: clGetPlatformIDs(1,*,0) failed\n");
-        return -1;
-    }
-    cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0};
-    device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
-    context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
-    if( !context ) {
-        printf("ERROR: clCreateContextFromType(%s) failed\n", use_gpu ? "GPU" : "CPU");
-        return -1;
-    }
-
-    // get the list of GPUs
-    result = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &size );
-    num_devices = (int) (size / sizeof(cl_device_id));
-
-    if( result != CL_SUCCESS || num_devices < 1 ) {
-        printf("ERROR: clGetContextInfo() failed\n");
-        return -1;
-    }
-    device_list = new cl_device_id[num_devices];
-    if( !device_list ) {
-        printf("ERROR: new cl_device_id[] failed\n");
-        return -1;
-    }
-    result = clGetContextInfo( context, CL_CONTEXT_DEVICES, size, device_list, NULL );
-    if( result != CL_SUCCESS ) {
-        printf("ERROR: clGetContextInfo() failed\n");
-        return -1;
-    }
-
-    // create command queue for the first device
-    cmd_queue = clCreateCommandQueue( context, device_list[0], CL_QUEUE_PROFILING_ENABLE, NULL );
-    if( !cmd_queue ) {
-        printf("ERROR: clCreateCommandQueue() failed\n");
-        return -1;
-    }
-
-    return 0;
-}
-
-static int shutdown()
-{
-    // release resources
-    if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
-    if( context ) clReleaseContext( context );
-    if( device_list ) delete device_list;
-
-    // reset all variables
-    cmd_queue = 0;
-    context = 0;
-    device_list = 0;
-    num_devices = 0;
-    device_type = 0;
-
-    return 0;
-}
-
-cl_mem d_feature;
-cl_mem d_feature_swap;
-cl_mem d_cluster;
-cl_mem d_membership;
-
-cl_kernel kernel;
-cl_kernel kernel_s;
-cl_kernel kernel2;
-
-int   *membership_OCL;
-int   *membership_d;
-float *feature_d;
-float *clusters_d;
-float *center_d;
-
-int allocate(int n_points, int n_features, int n_clusters, float **feature)
-{
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    int sourcesize = 1024*1024;
-    char * source = (char *)calloc(sourcesize, sizeof(char));
-    if(!source) {
-        printf("ERROR: calloc(%d) failed\n", sourcesize);
-        return -1;
-    }
-
-    // read the kernel core source
-    char * tempchar = "src/opencl/kmeans.cl";
-    FILE * fp = fopen(tempchar, "rb");
-    if(!fp) {
-        printf("ERROR: unable to open '%s'\n", tempchar);
-        return -1;
-    }
-    fread(source + strlen(source), sourcesize, 1, fp);
-    fclose(fp);
-
-    // OpenCL initialization
-    int use_gpu = 1;
-    if(initialize(use_gpu)) return -1;
-    pb_SetOpenCL(&context, &cmd_queue);
-
-    // compile kernel
-    cl_int err = 0;
-    const char * slist[2] = { source, 0 };
-    cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clCreateProgramWithSource() => %d\n", err);
-        return -1;
-    }
-    err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
-    {   // show warnings/errors
-        //	static char log[65536]; memset(log, 0, sizeof(log));
-        //	cl_device_id device_id = 0;
-        //	err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
-        //	clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
-        //	if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
-    }
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clBuildProgram() => %d\n", err);
-        return -1;
-    }
-
-    char * kernel_kmeans_c  = "kmeans_kernel_c";
-    char * kernel_swap  = "kmeans_swap";
-
-    kernel_s = clCreateKernel(prog, kernel_kmeans_c, &err);
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clCreateKernel() 0 => %d\n", err);
-        return -1;
-    }
-    kernel2 = clCreateKernel(prog, kernel_swap, &err);
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clCreateKernel() 0 => %d\n", err);
-        return -1;
-    }
-
-    clReleaseProgram(prog);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    d_feature = clCreateBuffer(context, CL_MEM_READ_WRITE, n_points * n_features * sizeof(float), NULL, &err );
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clCreateBuffer d_feature (size:%d) => %d\n", n_points * n_features, err);
-        return -1;
-    }
-    d_feature_swap = clCreateBuffer(context, CL_MEM_READ_WRITE, n_points * n_features * sizeof(float), NULL, &err );
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clCreateBuffer d_feature_swap (size:%d) => %d\n", n_points * n_features, err);
-        return -1;
-    }
-    d_cluster = clCreateBuffer(context, CL_MEM_READ_WRITE, n_clusters * n_features  * sizeof(float), NULL, &err );
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clCreateBuffer d_cluster (size:%d) => %d\n", n_clusters * n_features, err);
-        return -1;
-    }
-    d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE, n_points * sizeof(int), NULL, &err );
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clCreateBuffer d_membership (size:%d) => %d\n", n_points, err);
-        return -1;
-    }
-
-    //write buffers
-    err = clEnqueueWriteBuffer(cmd_queue, d_feature, 1, 0, n_points * n_features * sizeof(float), feature[0], 0, 0, 0);
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clEnqueueWriteBuffer d_feature (size:%d) => %d\n", n_points * n_features, err);
-        return -1;
-    }
-
-    pb_SwitchToTimer( &timers, visc_TimerID_SETUP );
-    clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &d_feature);
-    clSetKernelArg(kernel2, 1, sizeof(void *), (void*) &d_feature_swap);
-    clSetKernelArg(kernel2, 2, sizeof(cl_int), (void*) &n_points);
-    clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &n_features);
-
-    size_t global_work[3] = { n_points, 1, 1 };
-    /// Ke Wang adjustable local group size 2013/08/07 10:37:33
-    size_t local_work_size= BLOCK_SIZE; // work group size is defined by RD_WG_SIZE_0 or RD_WG_SIZE_0_0 2014/06/10 17:00:51
-    if(global_work[0]%local_work_size !=0)
-        global_work[0]=(global_work[0]/local_work_size+1)*local_work_size;
-
-    pb_SwitchToTimer( &timers, pb_TimerID_KERNEL);
-    //pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION);
-    err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 1, NULL, global_work, &local_work_size, 0, 0, 0);
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
-        return -1;
-    }
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    membership_OCL = (int*) malloc(n_points * sizeof(int));
-    return 0;
-}
-
-void deallocateMemory()
-{
-    pb_SwitchToTimer( &timers, visc_TimerID_SETUP );
-    clReleaseMemObject(d_feature);
-    clReleaseMemObject(d_feature_swap);
-    clReleaseMemObject(d_cluster);
-    clReleaseMemObject(d_membership);
-    free(membership_OCL);
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
-
-}
-
-
-int main( int argc, char** argv)
-{
-    printf("WG size of kernel_swap = %d, WG size of kernel_kmeans = %d \n", BLOCK_SIZE, BLOCK_SIZE2);
-
-    setup(argc, argv);
-    shutdown();
-}
-
-int kmeansOCL(float **feature,    /* in: [npoints][nfeatures] */
-              int     n_features,
-              int     n_points,
-              int     n_clusters,
-              int    *membership,
-              float **clusters,
-              int     *new_centers_len,
-              float  **new_centers)
-{
-    int delta = 0;
-    int i, j, k;
-    cl_int err = 0;
-
-    size_t global_work[3] = { n_points, 1, 1 };
-
-    /// Ke Wang adjustable local group size 2013/08/07 10:37:33
-    size_t local_work_size=BLOCK_SIZE2; // work group size is defined by RD_WG_SIZE_1 or RD_WG_SIZE_1_0 2014/06/10 17:00:41
-    if(global_work[0]%local_work_size !=0)
-        global_work[0]=(global_work[0]/local_work_size+1)*local_work_size;
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY);
-    err = clEnqueueWriteBuffer(cmd_queue, d_cluster, 1, 0, n_clusters * n_features * sizeof(float), clusters[0], 0, 0, 0);
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clEnqueueWriteBuffer d_cluster (size:%d) => %d\n", n_points, err);
-        return -1;
-    }
-
-    int size = 0;
-    int offset = 0;
-
-    pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
-    clSetKernelArg(kernel_s, 0, sizeof(void *), (void*) &d_feature_swap);
-    clSetKernelArg(kernel_s, 1, sizeof(void *), (void*) &d_cluster);
-    clSetKernelArg(kernel_s, 2, sizeof(void *), (void*) &d_membership);
-    clSetKernelArg(kernel_s, 3, sizeof(cl_int), (void*) &n_points);
-    clSetKernelArg(kernel_s, 4, sizeof(cl_int), (void*) &n_clusters);
-    clSetKernelArg(kernel_s, 5, sizeof(cl_int), (void*) &n_features);
-    clSetKernelArg(kernel_s, 6, sizeof(cl_int), (void*) &offset);
-    clSetKernelArg(kernel_s, 7, sizeof(cl_int), (void*) &size);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_KERNEL);
-    //pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION);
-    err = clEnqueueNDRangeKernel(cmd_queue, kernel_s, 1, NULL, global_work, &local_work_size, 0, 0, 0);
-    if(err != CL_SUCCESS) {
-        printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
-        return -1;
-    }
-    clFinish(cmd_queue);
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY);
-    err = clEnqueueReadBuffer(cmd_queue, d_membership, 1, 0, n_points * sizeof(int), membership_OCL, 0, 0, 0);
-    if(err != CL_SUCCESS) {
-        printf("ERROR: Memcopy Out\n");
-        return -1;
-    }
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE);
-    delta = 0;
-    for (i = 0; i < n_points; i++)
-    {
-        int cluster_id = membership_OCL[i];
-        new_centers_len[cluster_id]++;
-        if (membership_OCL[i] != membership[i])
-        {
-            delta++;
-            membership[i] = membership_OCL[i];
-        }
-        for (j = 0; j < n_features; j++)
-        {
-            new_centers[cluster_id][j] += feature[i][j];
-        }
-    }
-
-    return delta;
-}
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.h b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.h
deleted file mode 100644
index 0397992475..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*****************************************************************************/
-/*IMPORTANT:  READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.         */
-/*By downloading, copying, installing or using the software you agree        */
-/*to this license.  If you do not agree to this license, do not download,    */
-/*install, copy or use the software.                                         */
-/*                                                                           */
-/*                                                                           */
-/*Copyright (c) 2005 Northwestern University                                 */
-/*All rights reserved.                                                       */
-
-/*Redistribution of the software in source and binary forms,                 */
-/*with or without modification, is permitted provided that the               */
-/*following conditions are met:                                              */
-/*                                                                           */
-/*1       Redistributions of source code must retain the above copyright     */
-/*        notice, this list of conditions and the following disclaimer.      */
-/*                                                                           */
-/*2       Redistributions in binary form must reproduce the above copyright   */
-/*        notice, this list of conditions and the following disclaimer in the */
-/*        documentation and/or other materials provided with the distribution.*/
-/*                                                                            */
-/*3       Neither the name of Northwestern University nor the names of its    */
-/*        contributors may be used to endorse or promote products derived     */
-/*        from this software without specific prior written permission.       */
-/*                                                                            */
-/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS    */
-/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED      */
-/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND         */
-/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL          */
-/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,       */
-/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
-/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR          */
-/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)          */
-/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,         */
-/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN    */
-/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
-/*POSSIBILITY OF SUCH DAMAGE.                                                 */
-/******************************************************************************/
-
-#ifndef _H_FUZZY_KMEANS
-#define _H_FUZZY_KMEANS
-
-#ifndef FLT_MAX
-#define FLT_MAX 3.40282347e+38
-#endif
-
-/* rmse.c */
-float   euclid_dist_2        (float*, float*, int);
-int     find_nearest_point   (float* , int, float**, int);
-float	rms_err(float**, int, int, float**, int);
-int     cluster(int, int, float**, int, int, float, int*, float***, float*, int, int);
-#ifdef __cplusplus
-extern "C" {
-#endif
-  int setup(int argc, char** argv);
-  int allocate(int npoints, int nfeatures, int nclusters, float **feature);
-  void deallocateMemory();
-  int kmeansOCL(float **feature, int nfeatures, int npoints, int nclusters, int *membership, float **clusters, int *new_centers_len, float  **new_centers);
-#ifdef __cplusplus
-}
-#endif
-float** kmeans_clustering(float **feature, int nfeatures, int npoints, int nclusters, float threshold, int *membership);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans_clustering.c b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans_clustering.c
deleted file mode 100644
index c9e28e6d86..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/kmeans_clustering.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*****************************************************************************/
-/*IMPORTANT:  READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.         */
-/*By downloading, copying, installing or using the software you agree        */
-/*to this license.  If you do not agree to this license, do not download,    */
-/*install, copy or use the software.                                         */
-/*                                                                           */
-/*                                                                           */
-/*Copyright (c) 2005 Northwestern University                                 */
-/*All rights reserved.                                                       */
-
-/*Redistribution of the software in source and binary forms,                 */
-/*with or without modification, is permitted provided that the               */
-/*following conditions are met:                                              */
-/*                                                                           */
-/*1       Redistributions of source code must retain the above copyright     */
-/*        notice, this list of conditions and the following disclaimer.      */
-/*                                                                           */
-/*2       Redistributions in binary form must reproduce the above copyright   */
-/*        notice, this list of conditions and the following disclaimer in the */
-/*        documentation and/or other materials provided with the distribution.*/
-/*                                                                            */
-/*3       Neither the name of Northwestern University nor the names of its    */
-/*        contributors may be used to endorse or promote products derived     */
-/*        from this software without specific prior written permission.       */
-/*                                                                            */
-/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS    */
-/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED      */
-/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND         */
-/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL          */
-/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,       */
-/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
-/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR          */
-/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)          */
-/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,         */
-/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN    */
-/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
-/*POSSIBILITY OF SUCH DAMAGE.                                                 */
-/******************************************************************************/
-
-/*************************************************************************/
-/**   File:         kmeans_clustering.c                                 **/
-/**   Description:  Implementation of regular k-means clustering        **/
-/**                 algorithm                                           **/
-/**   Author:  Wei-keng Liao                                            **/
-/**            ECE Department, Northwestern University                  **/
-/**            email: wkliao@ece.northwestern.edu                       **/
-/**                                                                     **/
-/**   Edited by: Jay Pisharath                                          **/
-/**              Northwestern University.                               **/
-/**                                                                     **/
-/**   ================================================================  **/
-/**																		**/
-/**   Edited by: Shuai Che, David Tarjan, Sang-Ha Lee					**/
-/**				 University of Virginia									**/
-/**																		**/
-/**   Description:	No longer supports fuzzy c-means clustering;	 	**/
-/**					only regular k-means clustering.					**/
-/**					No longer performs "validity" function to analyze	**/
-/**					compactness and separation crietria; instead		**/
-/**					calculate root mean squared error.					**/
-/**                                                                     **/
-/*************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <float.h>
-#include <math.h>
-
-#include "kmeans.h"
-
-#define RANDOM_MAX 2147483647
-
-extern double wtime(void);
-
-/*----< kmeans_clustering() >---------------------------------------------*/
-float** kmeans_clustering(float **feature,    /* in: [npoints][nfeatures] */
-                          int     nfeatures,
-                          int     npoints,
-                          int     nclusters,
-                          float   threshold,
-                          int    *membership) /* out: [npoints] */
-{
-    int      i, j, n = 0;				/* counters */
-    int		 loop=0, temp;
-    int     *new_centers_len;	/* [nclusters]: no. of points in each cluster */
-    float    delta;				/* if the point moved */
-    float  **clusters;			/* out: [nclusters][nfeatures] */
-    float  **new_centers;		/* [nclusters][nfeatures] */
-
-    int     *initial;			/* used to hold the index of points not yet selected
-								   prevents the "birthday problem" of dual selection (?)
-								   considered holding initial cluster indices, but changed due to
-								   possible, though unlikely, infinite loops */
-    int      initial_points;
-    int		 c = 0;
-
-    /* nclusters should never be > npoints
-       that would guarantee a cluster without points */
-    if (nclusters > npoints)
-        nclusters = npoints;
-
-    /* allocate space for and initialize returning variable clusters[] */
-    clusters    = (float**) malloc(nclusters *             sizeof(float*));
-    clusters[0] = (float*)  malloc(nclusters * nfeatures * sizeof(float));
-    for (i=1; i<nclusters; i++)
-        clusters[i] = clusters[i-1] + nfeatures;
-
-    /* initialize the random clusters */
-    initial = (int *) malloc (npoints * sizeof(int));
-    for (i = 0; i < npoints; i++)
-    {
-        initial[i] = i;
-    }
-    initial_points = npoints;
-
-    /* randomly pick cluster centers */
-    for (i=0; i<nclusters && initial_points >= 0; i++) {
-        //n = (int)rand() % initial_points;
-
-        for (j=0; j<nfeatures; j++)
-            clusters[i][j] = feature[initial[n]][j];	// remapped
-
-        /* swap the selected index to the end (not really necessary,
-           could just move the end up) */
-        temp = initial[n];
-        initial[n] = initial[initial_points-1];
-        initial[initial_points-1] = temp;
-        initial_points--;
-        n++;
-    }
-
-    /* initialize the membership to -1 for all */
-    for (i=0; i < npoints; i++)
-        membership[i] = -1;
-
-    /* allocate space for and initialize new_centers_len and new_centers */
-    new_centers_len = (int*) calloc(nclusters, sizeof(int));
-
-    new_centers    = (float**) malloc(nclusters *            sizeof(float*));
-    new_centers[0] = (float*)  calloc(nclusters * nfeatures, sizeof(float));
-    for (i=1; i<nclusters; i++)
-        new_centers[i] = new_centers[i-1] + nfeatures;
-
-    /* iterate until convergence */
-    do {
-        delta = 0.0;
-        // CUDA
-        delta = (float) kmeansOCL(feature,			/* in: [npoints][nfeatures] */
-                                  nfeatures,		/* number of attributes for each point */
-                                  npoints,			/* number of data points */
-                                  nclusters,		/* number of clusters */
-                                  membership,		/* which cluster the point belongs to */
-                                  clusters,		/* out: [nclusters][nfeatures] */
-                                  new_centers_len,	/* out: number of points in each cluster */
-                                  new_centers		/* sum of points in each cluster */
-                                 );
-
-        /* replace old cluster centers with new_centers */
-        /* CPU side of reduction */
-        for (i=0; i<nclusters; i++) {
-            for (j=0; j<nfeatures; j++) {
-                if (new_centers_len[i] > 0)
-                    clusters[i][j] = new_centers[i][j] / new_centers_len[i];	/* take average i.e. sum/n */
-                new_centers[i][j] = 0.0;	/* set back to 0 */
-            }
-            new_centers_len[i] = 0;			/* set back to 0 */
-        }
-        c++;
-    } while ((delta > threshold) && (loop++ < 500));	/* makes sure loop terminates */
-    printf("iterated %d times\n", c);
-    free(new_centers[0]);
-    free(new_centers);
-    free(new_centers_len);
-
-    return clusters;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/read_input.c b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/read_input.c
deleted file mode 100644
index 018eb77ccf..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/read_input.c
+++ /dev/null
@@ -1,329 +0,0 @@
-/*****************************************************************************/
-/*IMPORTANT:  READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.         */
-/*By downloading, copying, installing or using the software you agree        */
-/*to this license.  If you do not agree to this license, do not download,    */
-/*install, copy or use the software.                                         */
-/*                                                                           */
-/*                                                                           */
-/*Copyright (c) 2005 Northwestern University                                 */
-/*All rights reserved.                                                       */
-
-/*Redistribution of the software in source and binary forms,                 */
-/*with or without modification, is permitted provided that the               */
-/*following conditions are met:                                              */
-/*                                                                           */
-/*1       Redistributions of source code must retain the above copyright     */
-/*        notice, this list of conditions and the following disclaimer.      */
-/*                                                                           */
-/*2       Redistributions in binary form must reproduce the above copyright   */
-/*        notice, this list of conditions and the following disclaimer in the */
-/*        documentation and/or other materials provided with the distribution.*/
-/*                                                                            */
-/*3       Neither the name of Northwestern University nor the names of its    */
-/*        contributors may be used to endorse or promote products derived     */
-/*        from this software without specific prior written permission.       */
-/*                                                                            */
-/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS    */
-/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED      */
-/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND         */
-/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL          */
-/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,       */
-/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
-/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR          */
-/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)          */
-/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,         */
-/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN    */
-/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
-/*POSSIBILITY OF SUCH DAMAGE.                                                 */
-/******************************************************************************/
-
-/*************************************************************************/
-/**   File:         example.c                                           **/
-/**   Description:  Takes as input a file:                              **/
-/**                 ascii  file: containing 1 data point per line       **/
-/**                 binary file: first int is the number of objects     **/
-/**                              2nd int is the no. of features of each **/
-/**                              object                                 **/
-/**                 This example performs a fuzzy c-means clustering    **/
-/**                 on the data. Fuzzy clustering is performed using    **/
-/**                 min to max clusters and the clustering that gets    **/
-/**                 the best score according to a compactness and       **/
-/**                 separation criterion are returned.                  **/
-/**   Author:  Wei-keng Liao                                            **/
-/**            ECE Department Northwestern University                   **/
-/**            email: wkliao@ece.northwestern.edu                       **/
-/**                                                                     **/
-/**   Edited by: Jay Pisharath                                          **/
-/**              Northwestern University.                               **/
-/**                                                                     **/
-/**   ================================================================  **/
-/**																		**/
-/**   Edited by: Shuai Che, David Tarjan, Sang-Ha Lee					**/
-/**				 University of Virginia									**/
-/**																		**/
-/**   Description:	No longer supports fuzzy c-means clustering;	 	**/
-/**					only regular k-means clustering.					**/
-/**					No longer performs "validity" function to analyze	**/
-/**					compactness and separation crietria; instead		**/
-/**					calculate root mean squared error.					**/
-/**                                                                     **/
-/*************************************************************************/
-#define _CRT_SECURE_NO_DEPRECATE 1
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <math.h>
-#include <fcntl.h>
-
-#include "kmeans.h"
-#include <unistd.h>
-
-extern double wtime(void);
-
-
-
-/*---< usage() >------------------------------------------------------------*/
-void usage(char *argv0) {
-    char *help =
-        "\nUsage: %s [switches] -i filename\n\n"
-        "    -i filename      :file containing data to be clustered\n"
-        "    -m max_nclusters :maximum number of clusters allowed    [default=5]\n"
-        "    -n min_nclusters :minimum number of clusters allowed    [default=5]\n"
-        "    -t threshold     :threshold value                       [default=0.001]\n"
-        "    -l nloops        :iteration for each number of clusters [default=1]\n"
-        "    -b               :input file is in binary format\n"
-        "    -r               :calculate RMSE                        [default=off]\n"
-        "    -o               :output cluster center coordinates     [default=off]\n";
-    fprintf(stderr, help, argv0);
-    exit(-1);
-}
-
-/*---< main() >-------------------------------------------------------------*/
-int setup(int argc, char **argv) {
-    int		opt;
-    extern char   *optarg;
-    char   *filename = 0;
-    char* outfilename = 0;
-    float  *buf;
-    char	line[1024];
-    int		isBinaryFile = 0;
-
-    float	threshold = 0.001;		/* default value */
-    int		max_nclusters=5;		/* default value */
-    int		min_nclusters=5;		/* default value */
-    int		best_nclusters = 0;
-    int		nfeatures = 0;
-    int		npoints = 0;
-    float	len;
-
-    float **features;
-    float **cluster_centres=NULL;
-    int		i, j, index;
-    int		nloops = 1;				/* default value */
-
-    int		isRMSE = 0;
-    float	rmse;
-
-    int		isOutput = 0;
-    //float	cluster_timing, io_timing;
-
-    /* obtain command line arguments and change appropriate options */
-    while ( (opt=getopt(argc,argv,"i:t:m:n:l:o:br"))!= EOF) {
-        switch (opt) {
-        case 'i':
-            filename=optarg;
-            break;
-        case 'b':
-            isBinaryFile = 1;
-            break;
-        case 't':
-            threshold=atof(optarg);
-            break;
-        case 'm':
-            max_nclusters = atoi(optarg);
-            break;
-        case 'n':
-            min_nclusters = atoi(optarg);
-            break;
-        case 'r':
-            isRMSE = 1;
-            break;
-        case 'o':
-            isOutput = 1;
-            outfilename = optarg;
-            break;
-        case 'l':
-            nloops = atoi(optarg);
-            break;
-        case '?':
-            usage(argv[0]);
-            break;
-        default:
-            usage(argv[0]);
-            break;
-        }
-    }
-
-    if (filename == 0) usage(argv[0]);
-
-    /* ============== I/O begin ==============*/
-    /* get nfeatures and npoints */
-    //io_timing = omp_get_wtime();
-    if (isBinaryFile) {		//Binary file input
-        int infile;
-        if ((infile = open(filename, O_RDONLY, "0600")) == -1) {
-            fprintf(stderr, "Error: no such file (%s)\n", filename);
-            exit(1);
-        }
-        read(infile, &npoints,   sizeof(int));
-        read(infile, &nfeatures, sizeof(int));
-
-        /* allocate space for features[][] and read attributes of all objects */
-        buf         = (float*) malloc(npoints*nfeatures*sizeof(float));
-        features    = (float**)malloc(npoints*          sizeof(float*));
-        features[0] = (float*) malloc(npoints*nfeatures*sizeof(float));
-        for (i=1; i<npoints; i++)
-            features[i] = features[i-1] + nfeatures;
-
-        read(infile, buf, npoints*nfeatures*sizeof(float));
-
-        close(infile);
-    }
-    else {
-        FILE *infile;
-        if ((infile = fopen(filename, "r")) == NULL) {
-            fprintf(stderr, "Error: no such file (%s)\n", filename);
-            exit(1);
-        }
-        while (fgets(line, 1024, infile) != NULL)
-            if (strtok(line, " \t\n") != 0)
-                npoints++;
-        rewind(infile);
-        while (fgets(line, 1024, infile) != NULL) {
-            if (strtok(line, " \t\n") != 0) {
-                /* ignore the id (first attribute): nfeatures = 1; */
-                while (strtok(NULL, " ,\t\n") != NULL) nfeatures++;
-                break;
-            }
-        }
-
-        /* allocate space for features[] and read attributes of all objects */
-        buf         = (float*) malloc(npoints*nfeatures*sizeof(float));
-        features    = (float**)malloc(npoints*          sizeof(float*));
-        features[0] = (float*) malloc(npoints*nfeatures*sizeof(float));
-        for (i=1; i<npoints; i++)
-            features[i] = features[i-1] + nfeatures;
-        rewind(infile);
-        i = 0;
-        while (fgets(line, 1024, infile) != NULL) {
-            if (strtok(line, " \t\n") == NULL) continue;
-            for (j=0; j<nfeatures; j++) {
-                buf[i] = atof(strtok(NULL, " ,\t\n"));
-                i++;
-            }
-        }
-        fclose(infile);
-    }
-    //io_timing = omp_get_wtime() - io_timing;
-
-    printf("\nI/O completed\n");
-    printf("\nNumber of objects: %d\n", npoints);
-    printf("Number of features: %d\n", nfeatures);
-    /* ============== I/O end ==============*/
-
-    // error check for clusters
-    if (npoints < min_nclusters)
-    {
-        printf("Error: min_nclusters(%d) > npoints(%d) -- cannot proceed\n", min_nclusters, npoints);
-        exit(0);
-    }
-
-    srand(7);												/* seed for future random number generator */
-    memcpy(features[0], buf, npoints*nfeatures*sizeof(float)); /* now features holds 2-dimensional array of features */
-    free(buf);
-
-    /* ======================= core of the clustering ===================*/
-
-    //cluster_timing = omp_get_wtime();		/* Total clustering time */
-    cluster_centres = NULL;
-    index = cluster(npoints,				/* number of data points */
-                    nfeatures,				/* number of features for each point */
-                    features,				/* array: [npoints][nfeatures] */
-                    min_nclusters,			/* range of min to max number of clusters */
-                    max_nclusters,
-                    threshold,				/* loop termination factor */
-                    &best_nclusters,			/* return: number between min and max */
-                    &cluster_centres,		/* return: [best_nclusters][nfeatures] */
-                    &rmse,					/* Root Mean Squared Error */
-                    isRMSE,					/* calculate RMSE */
-                    nloops);				/* number of iteration for each number of clusters */
-
-    //cluster_timing = omp_get_wtime() - cluster_timing;
-
-
-    /* =============== Command Line Output =============== */
-
-    /* cluster center coordinates
-       :displayed only for when k=1*/
-
-    //printf("Input file = %s\n", filename);
-    //printf("Output file = %s\n", outfilename);
-    if((min_nclusters == max_nclusters) && (isOutput == 1)) {
-        FILE *outfile;
-        if ((outfile = fopen(outfilename, "w")) == NULL) {
-            fprintf(stderr, "Error: no such file (%s)\n", outfilename);
-            exit(1);
-        }
-        fwrite(&max_nclusters, sizeof(int), 1, outfile);
-        fwrite(&nfeatures, sizeof(int), 1, outfile);
-        fwrite(&cluster_centres[0][0], sizeof(float), max_nclusters*nfeatures, outfile);
-        fclose(outfile);
-        //printf("\n================= Centroid Coordinates =================\n");
-        //for(i = 0; i < max_nclusters; i++) {
-            //printf("%d:", i);
-            //for(j = 0; j < nfeatures; j++) {
-                //printf(" %.2f", cluster_centres[i][j]);
-            //}
-            //printf("\n\n");
-        //}
-    }
-
-    len = (float) ((max_nclusters - min_nclusters + 1)*nloops);
-
-    //printf("Time for I/O: %.5fsec\n", io_timing);
-    //printf("Time for Entire Clustering: %.5fsec\n", cluster_timing);
-
-    if(min_nclusters != max_nclusters) {
-        if(nloops != 1) {									//range of k, multiple iteration
-            //printf("Average Clustering Time: %fsec\n",
-            //		cluster_timing / len);
-            printf("Best number of clusters is %d\n", best_nclusters);
-        }
-        else {												//range of k, single iteration
-            //printf("Average Clustering Time: %fsec\n",
-            //		cluster_timing / len);
-            printf("Best number of clusters is %d\n", best_nclusters);
-        }
-    }
-    else {
-        if(nloops != 1) {									// single k, multiple iteration
-            //printf("Average Clustering Time: %.5fsec\n",
-            //		cluster_timing / nloops);
-            if(isRMSE)										// if calculated RMSE
-                printf("Number of trials to approach the best RMSE of %.3f is %d\n", rmse, index + 1);
-        }
-        else {												// single k, single iteration
-            if(isRMSE)										// if calculated RMSE
-                printf("Root Mean Squared Error: %.3f\n", rmse);
-        }
-    }
-
-
-    /* free up memory */
-    free(features[0]);
-    free(features);
-    return(0);
-}
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/rmse.c b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/rmse.c
deleted file mode 100644
index 0029c64dc7..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/rmse.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*************************************************************************/
-/**   File:         rmse.c												**/
-/**   Description:  calculate root mean squared error of particular     **/
-/**                 clustering.											**/
-/**   Author:  Sang-Ha Lee												**/
-/**            University of Virginia.									**/
-/**																		**/
-/**   Note: euclid_dist_2() and find_nearest_point() adopted from       **/
-/**			Minebench code.												**/
-/**                                                                     **/
-/*************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <float.h>
-#include <math.h>
-
-
-#include "kmeans.h"
-
-extern double wtime(void);
-
-/*----< euclid_dist_2() >----------------------------------------------------*/
-/* multi-dimensional spatial Euclid distance square */
-__inline
-float euclid_dist_2(float *pt1,
-                    float *pt2,
-                    int    numdims)
-{
-    int i;
-    float ans=0.0;
-
-    for (i=0; i<numdims; i++)
-        ans += (pt1[i]-pt2[i]) * (pt1[i]-pt2[i]);
-
-    return(ans);
-}
-
-/*----< find_nearest_point() >-----------------------------------------------*/
-__inline
-int find_nearest_point(float  *pt,          /* [nfeatures] */
-                       int     nfeatures,
-                       float  **pts,         /* [npts][nfeatures] */
-                       int     npts)
-{
-    int index, i;
-    float max_dist=FLT_MAX;
-
-    /* find the cluster center id with min distance to pt */
-    for (i=0; i<npts; i++) {
-        float dist;
-        dist = euclid_dist_2(pt, pts[i], nfeatures);  /* no need square root */
-        if (dist < max_dist) {
-            max_dist = dist;
-            index    = i;
-        }
-    }
-    return(index);
-}
-
-/*----< rms_err(): calculates RMSE of clustering >-------------------------------------*/
-float rms_err	(float **feature,         /* [npoints][nfeatures] */
-                 int     nfeatures,
-                 int     npoints,
-                 float **cluster_centres, /* [nclusters][nfeatures] */
-                 int     nclusters)
-{
-    int    i;
-    int	   nearest_cluster_index;	/* cluster center id with min distance to pt */
-    float  sum_euclid = 0.0;		/* sum of Euclidean distance squares */
-    float  ret;						/* return value */
-
-    /* calculate and sum the sqaure of euclidean distance*/
-    #pragma omp parallel for \
-    shared(feature,cluster_centres) \
-    firstprivate(npoints,nfeatures,nclusters) \
-    private(i, nearest_cluster_index) \
-    schedule (static)
-    for (i=0; i<npoints; i++) {
-        nearest_cluster_index = find_nearest_point(feature[i],
-                                nfeatures,
-                                cluster_centres,
-                                nclusters);
-
-        sum_euclid += euclid_dist_2(feature[i],
-                                    cluster_centres[nearest_cluster_index],
-                                    nfeatures);
-
-    }
-    /* divide by n, then take sqrt */
-    ret = sqrt(sum_euclid / npoints);
-
-    return(ret);
-}
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/run b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/run
deleted file mode 100755
index 9fcc65554d..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/run
+++ /dev/null
@@ -1 +0,0 @@
-./kmeans -o -i ../../../../datasets/kmeans/kdd_cup 
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/unistd.h b/hpvm/test/parboil/benchmarks/kmeans/src/opencl/unistd.h
deleted file mode 100644
index ff334e56eb..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/opencl/unistd.h
+++ /dev/null
@@ -1,945 +0,0 @@
-/* IBM_PROLOG_BEGIN_TAG                                                   */
-/* This is an automatically generated prolog.                             */
-/*                                                                        */
-/* bos53H src/bos/usr/include/unistd.h 1.38.4.46                          */
-/*                                                                        */
-/* Licensed Materials - Property of IBM                                   */
-/*                                                                        */
-/* (C) COPYRIGHT International Business Machines Corp. 1985,1995          */
-/* All Rights Reserved                                                    */
-/*                                                                        */
-/* US Government Users Restricted Rights - Use, duplication or            */
-/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.      */
-/*                                                                        */
-/* IBM_PROLOG_END_TAG                                                     */
-/* @(#)82     1.38.4.46  src/bos/usr/include/unistd.h, incstd, bos53H, h2006_17B8 4/25/06 11:53:09 */
-/*
- * COMPONENT_NAME: (INCSTD) Standard Include Files
- *
- * FUNCTIONS:
- *
- * ORIGINS: 3 27
- *
- * (C) COPYRIGHT International Business Machines Corp. 1985, 2006
- * All Rights Reserved
- * Licensed Materials - Property of IBM
- *
- * US Government Users Restricted Rights - Use, duplication or
- * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
- *
- * Copyright (c) 1984 AT&T
- * All Rights Reserved
- *
- * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T
- * The copyright notice above does not evidence any
- * actual or intended publication of such source code.
- */
-
-#ifndef _H_UNISTD
-#define _H_UNISTD
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef _H_STANDARDS
-#include <standards.h>
-#endif
-
-#include <strict_stdtypes.h>
-
-#ifndef _H_TYPES
-#include <sys/types.h>
-#endif
-
-#include <end_strict_stdtypes.h>
-
-#ifndef _H_ACCESS
-#include <sys/access.h>	/* for the "access" function */
-#endif
-
-/*
- * POSIX requires that certain values be included in unistd.h.  It also
- * requires that when _POSIX_SOURCE is defined only those standard
- * specific values are present.  This header includes all the POSIX
- * required entries.
- */
-
-#ifdef _POSIX_SOURCE
-#ifdef _LARGE_FILES
-#define lseek lseek64
-#endif
-
-
-/* Symbolic constants for the "lseek" function: */
-#ifndef SEEK_SET
-#define SEEK_SET 0	/* Set file pointer to "offset" */
-#define SEEK_CUR 1	/* Set file pointer to current plus "offset" */
-#define SEEK_END 2	/* Set file pointer to EOF plus "offset" */
-#endif /* SEEK_SET */
-
-#ifdef _NO_PROTO
-
-#ifndef _KERNEL
-extern int access();
-extern unsigned int alarm();
-extern int chdir();
-extern int chown();
-extern int close();
-extern char *ctermid();
-extern int dup();
-extern int dup2();
-extern int execl();
-extern int execv();
-extern int execle();
-extern int execve();
-extern int execlp();
-extern int execvp();
-extern void _exit();
-extern pid_t fork();
-extern long fpathconf();
-extern char *getcwd();
-extern gid_t getegid();
-extern uid_t geteuid();
-extern gid_t getgid();
-extern int getgroups();
-extern char *getlogin();
-extern pid_t getpgrp();
-extern pid_t getpid();
-extern pid_t getppid();
-extern uid_t getuid();
-extern int isatty();
-extern int link();
-extern off_t lseek();
-extern long pathconf();
-extern int pause();
-extern int pipe();
-#if defined(_XOPEN_SOURCE) && ( _XOPEN_SOURCE >= 500 )
-extern int pthread_atfork();
-#endif
-extern int read();
-extern int rmdir();
-extern int setgid();
-extern int setpgid();
-extern int setsid();
-extern int setuid();
-extern unsigned int sleep();
-extern long sysconf();
-extern pid_t tcgetpgrp();
-extern int tcsetpgrp();
-extern char *ttyname();
-extern int unlink();
-extern int write();
-#endif		/* !_KERNEL	*/
-
-#else		/* POSIX required prototypes */
-
-#ifndef _KERNEL
-extern int access(const char *, int);
-extern unsigned int alarm(unsigned int);
-extern int chdir(const char *);
-extern int chown(const char *, uid_t, gid_t);
-extern int close(int);
-extern char *ctermid(char *);
-extern int dup(int);
-extern int dup2(int, int);
-extern int execl(const char *, const char *, ...);
-extern int execv(const char *, char *const []);
-extern int execle(const char *, const char *, ...);
-extern int execve(const char *, char *const [], char *const []);
-extern int execlp(const char *, const char *, ...);
-extern int execvp(const char *, char *const []);
-extern void _exit(int);
-extern pid_t fork(void);
-extern long fpathconf(int, int);
-extern char *getcwd(char *, size_t);
-extern gid_t getegid(void);
-extern uid_t geteuid(void);
-extern gid_t getgid(void);
-extern int getgroups(int, gid_t []);
-extern char *getlogin(void);
-#ifndef _BSD
-extern pid_t getpgrp(void);
-#endif /* _BSD */
-extern pid_t getpid(void);
-extern pid_t getppid(void);
-extern uid_t getuid(void);
-extern int isatty(int);
-extern int link(const char *, const char *);
-extern off_t lseek(int, off_t, int);
-#ifdef _LARGE_FILE_API
-extern off64_t	lseek64(int, off64_t, int);
-#endif
-extern long pathconf(const char *, int);
-extern int pause(void);
-extern int pipe(int []);
-#if defined(_XOPEN_SOURCE) && ( _XOPEN_SOURCE >= 500 )
-extern int pthread_atfork(void (*)(void), void (*)(void), void (*)(void));
-#endif
-extern ssize_t read(int, void *, size_t);
-extern int rmdir(const char *);
-extern int setgid(gid_t);
-extern int setpgid(pid_t, pid_t);
-extern pid_t setsid(void);
-extern int setuid(uid_t);
-extern unsigned int sleep(unsigned int);
-extern long sysconf(int);
-extern pid_t tcgetpgrp(int);
-extern int tcsetpgrp(int, pid_t);
-extern char *ttyname(int);
-extern int unlink(const char *);
-extern ssize_t write(int, const void *, size_t);
-#endif		/* !_KERNEL	*/
-#endif		/* !_NO_PROTO	*/
-
-#define STDIN_FILENO	0
-#define STDOUT_FILENO	1
-#define STDERR_FILENO	2
-
-#define _POSIX_JOB_CONTROL	1
-#define _POSIX_SAVED_IDS	1
-
-#define _POSIX_VERSION		200112L
-#define _POSIX2_VERSION		200112L
-#define _POSIX2_C_VERSION	200112L
-
-
-#ifdef _XOPEN_SOURCE
-
-#define _XOPEN_VERSION		600
-#define _XOPEN_XCU_VERSION	4
-#define _XOPEN_XPG3		1
-#define _XOPEN_XPG4		1
-#define _XOPEN_UNIX		1
-
-#define _XOPEN_REALTIME		(-1)
-#define _XOPEN_REALTIME_THREADS	(-1)
-
-#if (_XOPEN_SOURCE >= 600)
-#define _XOPEN_STREAMS		1
-#endif
-
-#define _XBS5_ILP32_OFF32	1
-#define _XBS5_ILP32_OFFBIG	1
-#define _XBS5_LP64_OFF64	1
-#define _XBS5_LPBIG_OFFBIG	1
-
-#define _POSIX2_C_BIND		200112L
-#define _POSIX2_C_DEV		200112L
-#define _POSIX2_CHAR_TERM	1
-#define _POSIX2_LOCALEDEF	200112L
-#define _POSIX2_UPE		200112L
-#define _POSIX2_FORT_DEV	(-1)
-#define _POSIX2_FORT_RUN	(-1)
-#define _POSIX2_SW_DEV		(-1)
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _POSIX_REGEXP         1
-#define _POSIX_SHELL          1
-#define _POSIX2_PBS           (-1)
-#define _POSIX2_PBS_ACCOUNTING        (-1)
-#define _POSIX2_PBS_CHECKPOINT        (-1)
-#define _POSIX2_PBS_LOCATE    (-1)
-#define _POSIX2_PBS_MESSAGE   (-1)
-#define _POSIX2_PBS_TRACK     (-1)
-#define _V6_ILP32_OFF32               1
-#define _V6_ILP32_OFFBIG      1
-#define _V6_LP64_OFF64                1
-#define _V6_LPBIG_OFFBIG      1
-
-#define _POSIX_ADVISORY_INFO   200112L
-#define _POSIX_BARRIERS        200112L
-#define _POSIX_CLOCK_SELECTION 200112L
-#define _POSIX_CPUTIME         200112L
-#define _POSIX_MONOTONIC_CLOCK 200112L
-
-#ifdef _POSIX_RAW_SOCKETS
-#undef _POSIX_RAW_SOCKETS
-#endif
-
-#define _POSIX_SPAWN           200112L
-#define _POSIX_SPIN_LOCKS      200112L
-#define _POSIX_SPORADIC_SERVER (-1)
-#define _POSIX_THREAD_CPUTIME  200112L
-#define _POSIX_THREAD_SPORADIC_SERVER (-1)
-#define _POSIX_TIMEOUTS	200112L
-#define _POSIX_TRACE           (-1)
-#define _POSIX_TRACE_EVENT_FILTER     (-1)
-#define _POSIX_TRACE_INHERIT   (-1)
-#define _POSIX_TRACE_LOG       (-1)
-#define _POSIX_TYPED_MEMORY_OBJECTS   (-1)
-
-#endif /* _POSIX_C_SOURCE >= 200112L */
-
-#define _XOPEN_CRYPT		1
-#define _XOPEN_SHM		1
-#define _XOPEN_ENH_I18N		1
-#define _XOPEN_LEGACY		(-1)
-#ifndef __64BIT__
-#define _UNIX_ABI		(-1)
-#define _UNIX_ABI_IA64		(-1)
-#define _UNIX_ABI_BIG_ENDIAN	(-1)
-#define _UNIX_ABI_LITTLE_ENDIAN	(-1)
-#endif /* __64BIT__ */
-
-extern  char    *optarg;
-extern  int     optind, opterr, optopt;
-
-#ifdef _NO_PROTO
-extern	size_t	confstr();
-extern  char    *crypt();
-extern  void    encrypt();
-extern  int     fsync();
-extern	int	getopt();
-extern	int	nice();
-extern  void    swab();
-#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE<200112L) || defined(_ALL_SOURCE)
-extern  char    *getpass();
-extern  int     chroot();
-#endif
-#else
-extern	size_t	confstr(int, char*, size_t);
-extern  char    *crypt(const char *, const char *);
-extern  void    encrypt(char *, int);
-extern  int     fsync(int);
-extern	int	getopt(int, char* const*, const char*);
-extern	int	nice(int);
-extern  void    swab(const void *, void *, ssize_t);
-extern int	fdatasync(int);
-#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE<200112L) || defined(_ALL_SOURCE)
-extern  char    *getpass(const char *);
-extern  int     chroot(const char *);
-#endif
-#endif
-
-#endif /* _XOPEN _SOURCE */
-
-/* Threads options for 1003.1c and XPG UNIX98 */
-#define _POSIX_THREADS				200112L
-#define _POSIX_THREAD_ATTR_STACKADDR            200112L
-#define _POSIX_THREAD_ATTR_STACKSIZE		200112L
-#define _POSIX_THREAD_PROCESS_SHARED		200112L
-#define _POSIX_THREAD_SAFE_FUNCTIONS		200112L
-#ifdef _ALL_SOURCE
-#define _POSIX_REENTRANT_FUNCTIONS		_POSIX_THREAD_SAFE_FUNCTIONS
-#endif
-
-/* Realtime threads options for 1003.1c and XPG UNIX98 */
-#define	 _POSIX_THREAD_PRIORITY_SCHEDULING	(-1)
-#define	 _POSIX_THREAD_PRIO_INHERIT		(-1)
-#define	 _POSIX_THREAD_PRIO_PROTECT		(-1)
-
-#undef  _POSIX_THREAD_FORKALL
-
-/* Realtime options for 1003.1c and XPG UNIX98 */
-#define _POSIX_ASYNCHRONOUS_IO			200112L
-#define _POSIX_FSYNC				200112L
-#define _POSIX_MAPPED_FILES			200112L
-#define _POSIX_MEMLOCK			        200112L
-#define _POSIX_MEMLOCK_RANGE		        200112L
-#define _POSIX_MEMORY_PROTECTION		200112L
-#define _POSIX_MESSAGE_PASSING			200112L
-#define _POSIX_PRIORITIZED_IO			200112L
-#define _POSIX_PRIORITY_SCHEDULING		200112L
-#define _POSIX_REALTIME_SIGNALS			200112L
-#define _POSIX_SEMAPHORES			200112L
-#define _POSIX_SHARED_MEMORY_OBJECTS            200112L
-#define _POSIX_SYNCHRONIZED_IO			200112L
-#define _POSIX_TIMERS				200112L
-
-#define _POSIX_ASYNC_IO				(-1)
-#undef	_POSIX_SYNC_IO
-#define _POSIX_PRIO_IO				(-1)
-
-#define _POSIX_CHOWN_RESTRICTED	 0
-#define _POSIX_VDISABLE		 0xFF
-#define _POSIX_NO_TRUNC		 0
-
-/* UNIX03 and POSIX01 */
-/* Always enabled */
-#define _POSIX_IPV6				200112L
-#define _POSIX_RAW_SOCKETS			200112L
-
-
-#ifndef NULL
-#define NULL	0
-#endif
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _POSIX_READER_WRITER_LOCKS            200112L
-#endif
-
-/* arguments for the confstr() function */
-
-#define _CS_PATH	1
-
-/* compile,link,lib,lint flags for 32bit, no_LARGE_FILES system */
-#define _CS_XBS5_ILP32_OFF32_CFLAGS	2
-#define _CS_XBS5_ILP32_OFF32_LDFLAGS	3
-#define _CS_XBS5_ILP32_OFF32_LIBS	4
-#define _CS_XBS5_ILP32_OFF32_LINTFLAGS	5
-
-/* compile,link,lib,lint flags for 32bit, _LARGE_FILES system */
-#define _CS_XBS5_ILP32_OFFBIG_CFLAGS	6
-#define _CS_XBS5_ILP32_OFFBIG_LDFLAGS	7
-#define _CS_XBS5_ILP32_OFFBIG_LIBS	8
-#define _CS_XBS5_ILP32_OFFBIG_LINTFLAGS	9
-
-/* compile,link,lib,lint flags for LP64 64bit system */
-#define _CS_XBS5_LP64_OFF64_CFLAGS	10
-#define _CS_XBS5_LP64_OFF64_LDFLAGS	11
-#define _CS_XBS5_LP64_OFF64_LIBS	12
-#define _CS_XBS5_LP64_OFF64_LINTFLAGS	13
-
-/* compile,link,lib,lint flags for ILP64 64bit system */
-/* AIX does not currently support this */
-#define _CS_XBS5_LPBIG_OFFBIG_CFLAGS	14
-#define _CS_XBS5_LPBIG_OFFBIG_LDFLAGS	15
-#define _CS_XBS5_LPBIG_OFFBIG_LIBS	16
-#define _CS_XBS5_LPBIG_OFFBIG_LINTFLAGS	17
-
-#define _CS_AIX_BOOTDEV				24
-#define _CS_AIX_MODEL_CODE			25
-#define _CS_AIX_ARCHITECTURE			26
-#define _CS_AIX_MODEL_CLASS			40
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _CS_POSIX_V6_ILP32_OFF32_CFLAGS		27
-#define _CS_POSIX_V6_ILP32_OFF32_LDFLAGS	28
-#define _CS_POSIX_V6_ILP32_OFF32_LIBS		29
-#define _CS_POSIX_V6_ILP32_OFFBIG_CFLAGS	30
-#define _CS_POSIX_V6_ILP32_OFFBIG_LDFLAGS	31
-#define _CS_POSIX_V6_ILP32_OFFBIG_LIBS		32
-#define _CS_POSIX_V6_LP64_OFF64_CFLAGS		33
-#define _CS_POSIX_V6_LP64_OFF64_LDFLAGS		34
-#define _CS_POSIX_V6_LP64_OFF64_LIBS		35
-#define _CS_POSIX_V6_LPBIG_OFFBIG_CFLAGS	36
-#define _CS_POSIX_V6_LPBIG_OFFBIG_LDFLAGS	37
-#define _CS_POSIX_V6_LPBIG_OFFBIG_LIBS		38
-#define _CS_POSIX_V6_WIDTH_RESTRICTED_ENVS      39
-#endif
-
-/* Values for the above */
-#define _CSPATH		"/usr/bin:/usr/vac/bin"
-
-/* ILP32_OFF32 */
-#define _CSPOSIX_V6_ILP32_OFF32_CFLAGS	"-q32"
-#define _CSXBS5_ILP32_OFF32_CFLAGS	_CSPOSIX_V6_ILP32_OFF32_CFLAGS
-
-#ifdef __ia64
-#define _CSXBS5_ILP32_OFF32_LDFLAGS	""
-#else /* POWER */
-#define _CSPOSIX_V6_ILP32_OFF32_LDFLAGS "-b32"
-#define _CSXBS5_ILP32_OFF32_LDFLAGS	_CSPOSIX_V6_ILP32_OFF32_LDFLAGS
-#endif
-
-#define _CSPOSIX_V6_ILP32_OFF32_LIBS	"-lc -lpthread -lm"
-#define _CSXBS5_ILP32_OFF32_LIBS	_CSPOSIX_V6_ILP32_OFF32_LIBS
-
-#define _CSXBS5_ILP32_OFF32_LINTFLAGS	""
-
-/* ILP32_OFFOFFBIG */
-#define _CSPOSIX_V6_ILP32_OFFBIG_CFLAGS "-q32 -D_LARGE_FILES -qlonglong"
-#define _CSXBS5_ILP32_OFFBIG_CFLAGS	_CSPOSIX_V6_ILP32_OFFBIG_CFLAGS
-
-#ifdef __ia64
-#define _CSXBS5_ILP32_OFFBIG_LDFLAGS	""
-#else /* POWER */
-#define _CSPOSIX_V6_ILP32_OFFBIG_LDFLAGS "-b32"
-#define _CSXBS5_ILP32_OFFBIG_LDFLAGS	_CSPOSIX_V6_ILP32_OFFBIG_LDFLAGS
-#endif
-
-#define _CSPOSIX_V6_ILP32_OFFBIG_LIBS	"-lc -lpthread -lm"
-#define _CSXBS5_ILP32_OFFBIG_LIBS	_CSPOSIX_V6_ILP32_OFFBIG_LIBS
-
-#define _CSXBS5_ILP32_OFFBIG_LINTFLAGS	"-D_LARGE_FILES -qlonglong"
-
-/* LP64_OFF64 */
-#define _CSPOSIX_V6_LP64_OFF64_CFLAGS	"-q64"
-#define _CSXBS5_LP64_OFF64_CFLAGS	_CSPOSIX_V6_LP64_OFF64_CFLAGS
-
-#ifdef __ia64
-#define _CSXBS5_LP64_OFF64_LDFLAGS	""
-#else /* POWER */
-#define _CSPOSIX_V6_LP64_OFF64_LDFLAGS	"-b64"
-#define _CSXBS5_LP64_OFF64_LDFLAGS	_CSPOSIX_V6_LP64_OFF64_LDFLAGS
-#endif
-
-#define _CSPOSIX_V6_LP64_OFF64_LIBS	"-lc -lpthread -lm"
-#define _CSXBS5_LP64_OFF64_LIBS		_CSPOSIX_V6_LP64_OFF64_LIBS
-
-#define _CSXBS5_LP64_OFF64_LINTFLAGS	"-D__64BIT__"
-
-/* LPBIG_OFFBIG */
-#define _CSPOSIX_V6_LPBIG_OFFBIG_CFLAGS "-q64"
-#define _CSXBS5_LPBIG_OFFBIG_CFLAGS	_CSPOSIX_V6_LPBIG_OFFBIG_CFLAGS
-
-#ifdef __ia64
-#define _CSXBS5_LPBIG_OFFBIG_LDFLAGS	""
-#else /* POWER */
-#define _CSPOSIX_V6_LPBIG_OFFBIG_LDFLAGS "-b64"
-#define _CSXBS5_LPBIG_OFFBIG_LDFLAGS	_CSPOSIX_V6_LPBIG_OFFBIG_LDFLAGS
-#endif
-
-#define _CSPOSIX_V6_LPBIG_OFFBIG_LIBS	"-lc -lpthread -lm"
-#define _CSXBS5_LPBIG_OFFBIG_LIBS	_CSPOSIX_V6_LPBIG_OFFBIG_LIBS
-
-#define _CSXBS5_LPBIG_OFFBIG_LINTFLAGS	"-D__64BIT__"
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _CSPOSIX_V6_WIDTH_RESTRICTED_ENVS \
-		"POSIX_V6_ILP32_OFF32\n"  \
-		"POSIX_V6_ILP32_OFFBIG\n" \
-		"POSIX_V6_LP64_OFF64\n"  \
-		"POSIX_V6_LPBIG_OFFBIG"
-#endif
-
-/* arguments for the pathconf() function */
-
-#define _PC_CHOWN_RESTRICTED	10
-#define _PC_LINK_MAX		11
-#define _PC_MAX_CANON		12
-#define _PC_MAX_INPUT		13
-#define _PC_NAME_MAX		14
-#define _PC_NO_TRUNC		15
-#define _PC_PATH_MAX		16
-#define _PC_PIPE_BUF		17
-#define _PC_VDISABLE		18
-#define _PC_ASYNC_IO		19
-#define _PC_SYNC_IO		20
-#define _PC_PRIO_IO		21
-#define _PC_FILESIZEBITS	22  /* # bits needed to hold offset */
-#define _PC_AIX_DISK_PARTITION	23
-#define _PC_AIX_DISK_SIZE	24
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _PC_SYMLINK_MAX         25
-#define _PC_ALLOC_SIZE_MIN      26
-#define _PC_REC_INCR_XFER_SIZE  27
-#define _PC_REC_MAX_XFER_SIZE   28
-#define _PC_REC_MIN_XFER_SIZE   29
-#define _PC_REC_XFER_ALIGN      30
-#define _PC_2_SYMLINKS          31
-#endif
-
-/* arguments for the sysconf() function, the defined numbers are used as
- * array index in sysconf().
- *
- * POSIX.1(1990), Table 4-2
- */
-#define _SC_ARG_MAX			0
-#define _SC_CHILD_MAX			1
-#define _SC_CLK_TCK			2
-#define _SC_NGROUPS_MAX			3
-#define _SC_OPEN_MAX			4
-#define _SC_STREAM_MAX			5
-#define _SC_TZNAME_MAX			6
-#define _SC_JOB_CONTROL			7
-#define _SC_SAVED_IDS			8
-#define _SC_VERSION			9
-
-/* POSIX.1(1990), Table 2-3, required by command getconf */
-
-#define _SC_POSIX_ARG_MAX		10
-#define _SC_POSIX_CHILD_MAX		11
-#define _SC_POSIX_LINK_MAX		12
-#define _SC_POSIX_MAX_CANON		13
-#define _SC_POSIX_MAX_INPUT		14
-#define _SC_POSIX_NAME_MAX		15
-#define _SC_POSIX_NGROUPS_MAX		16
-#define _SC_POSIX_OPEN_MAX		17
-#define _SC_POSIX_PATH_MAX		18
-#define _SC_POSIX_PIPE_BUF		19
-#define _SC_POSIX_SSIZE_MAX		20
-#define _SC_POSIX_STREAM_MAX		21
-#define _SC_POSIX_TZNAME_MAX		22
-
-/* POSIX.2 (Draft 10), Table 41)	*/
-
-#define _SC_BC_BASE_MAX			23
-#define _SC_BC_DIM_MAX			24
-#define _SC_BC_SCALE_MAX		25
-#define _SC_BC_STRING_MAX		26
-#define _SC_EQUIV_CLASS_MAX		27
-#define _SC_EXPR_NEST_MAX		28
-#define _SC_LINE_MAX			29
-#define _SC_RE_DUP_MAX			30
-#define _SC_2_VERSION			31
-#define _SC_2_C_DEV			32
-#define _SC_2_FORT_DEV			33
-#define _SC_2_FORT_RUN			34
-#define _SC_2_LOCALEDEF			35
-#define _SC_2_SW_DEV			36
-
-/* POSIX.2 (Draft 10), Table 13)	*/
-
-#define _SC_POSIX2_BC_BASE_MAX		37
-#define _SC_POSIX2_BC_DIM_MAX		38
-#define _SC_POSIX2_BC_SCALE_MAX		39
-#define _SC_POSIX2_BC_STRING_MAX	40
-#define _SC_POSIX2_EQUIV_CLASS_MAX	41
-#define _SC_POSIX2_EXPR_NEST_MAX	42
-#define _SC_POSIX2_LINE_MAX		43
-#define _SC_POSIX2_RE_DUP_MAX		44
-#define _SC_PASS_MAX			45
-#define _SC_XOPEN_VERSION		46
-#define _SC_ATEXIT_MAX			47
-#if _XOPEN_SOURCE_EXTENDED==1
-#define _SC_PAGE_SIZE			48
-#endif /* _XOPEN_SOURCE_EXTENDED */
-#define _SC_AES_OS_VERSION		49
-#define _SC_COLL_WEIGHTS_MAX		50
-#define _SC_2_C_BIND			51
-#define _SC_2_C_VERSION			52
-#define _SC_2_UPE			53
-#define _SC_2_CHAR_TERM			54
-#define _SC_XOPEN_SHM			55
-#define _SC_XOPEN_CRYPT			56
-#define _SC_XOPEN_ENH_I18N		57
-#if _XOPEN_SOURCE_EXTENDED==1
-#define _SC_PAGESIZE			_SC_PAGE_SIZE
-#define _SC_IOV_MAX			58
-#endif /* _XOPEN_SOURCE_EXTENDED */
-#define _SC_THREAD_SAFE_FUNCTIONS	59
-#define _SC_THREADS			60
-#define _SC_THREAD_ATTR_STACKADDR	61
-#define _SC_THREAD_ATTR_STACKSIZE	62
-#define _SC_THREAD_FORKALL		63
-#define _SC_THREAD_PRIORITY_SCHEDULING	64
-#define _SC_THREAD_PRIO_INHERIT		65
-#define _SC_THREAD_PRIO_PROTECT		66
-#define _SC_THREAD_PROCESS_SHARED	67
-#define _SC_THREAD_KEYS_MAX		68
-#define _SC_THREAD_DATAKEYS_MAX		_SC_THREAD_KEYS_MAX
-#define _SC_THREAD_STACK_MIN		69
-#define _SC_THREAD_THREADS_MAX		70
-#ifdef _ALL_SOURCE
-#define _SC_NPROCESSORS_CONF		71
-#define _SC_NPROCESSORS_ONLN		72
-#endif /* _ALL_SOURCE */
-#define _SC_XOPEN_UNIX			73
-
-#if (_XOPEN_SOURCE >= 500)
-
-/* POSIX 1003.1c and XPG UNIX98 */
-/* look to defines above for meanings */
-#define _SC_AIO_LISTIO_MAX			75
-#define _SC_AIO_MAX				76
-#define _SC_AIO_PRIO_DELTA_MAX			77
-#define _SC_ASYNCHRONOUS_IO			78
-#define _SC_DELAYTIMER_MAX			79
-#define _SC_FSYNC				80
-#define _SC_GETGR_R_SIZE_MAX			81
-#define _SC_GETPW_R_SIZE_MAX			82
-#define _SC_LOGIN_NAME_MAX			83
-#define _SC_MAPPED_FILES			84
-#define _SC_MEMLOCK				85
-#define _SC_MEMLOCK_RANGE			86
-#define _SC_MEMORY_PROTECTION			87
-#define _SC_MESSAGE_PASSING			88
-#define _SC_MQ_OPEN_MAX				89
-#define _SC_MQ_PRIO_MAX				90
-#define _SC_PRIORITIZED_IO			91
-#define _SC_PRIORITY_SCHEDULING			92
-#define _SC_REALTIME_SIGNALS			93
-#define _SC_RTSIG_MAX				94
-#define _SC_SEMAPHORES				95
-#define _SC_SEM_NSEMS_MAX			96
-#define _SC_SEM_VALUE_MAX			97
-#define _SC_SHARED_MEMORY_OBJECTS		98
-#define _SC_SIGQUEUE_MAX			99
-#define _SC_SYNCHRONIZED_IO			100
-#define _SC_THREAD_DESTRUCTOR_ITERATIONS	101
-#define _SC_TIMERS				102
-#define _SC_TIMER_MAX				103
-#define _SC_TTY_NAME_MAX			104
-#define _SC_XBS5_ILP32_OFF32			105
-#define _SC_XBS5_ILP32_OFFBIG			106
-#define _SC_XBS5_LP64_OFF64			107
-#define _SC_XBS5_LPBIG_OFFBIG			108
-#define _SC_XOPEN_XCU_VERSION			109
-#define _SC_XOPEN_REALTIME			110
-#define _SC_XOPEN_REALTIME_THREADS		111
-#define _SC_XOPEN_LEGACY			112
-#endif /* _XOPEN_SOURCE >= 500 */
-
-#ifdef _ALL_SOURCE
-#define _SC_REENTRANT_FUNCTIONS		_SC_THREAD_SAFE_FUNCTIONS
-#define _SC_PHYS_PAGES				113
-#define _SC_AVPHYS_PAGES			114
-#define _SC_LPAR_ENABLED			115
-#define _SC_LARGE_PAGESIZE			116
-#endif /* _ALL_SOURCE */
-
-#define _SC_AIX_KERNEL_BITMODE			117
-#define _SC_AIX_REALMEM				118
-#define _SC_AIX_HARDWARE_BITMODE		119
-#define _SC_AIX_MP_CAPABLE			120
-
-#define _SC_V6_ILP32_OFF32			121
-#define _SC_V6_ILP32_OFFBIG			122
-#define _SC_V6_LP64_OFF64			123
-#define _SC_V6_LPBIG_OFFBIG			124
-
-#define _SC_XOPEN_STREAMS			125
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _SC_HOST_NAME_MAX			126
-#define _SC_REGEXP				127
-#define _SC_SHELL				128
-#define _SC_SYMLOOP_MAX				129
-#define _SC_ADVISORY_INFO			130
-#define _SC_FILE_LOCKING			131
-#define _SC_2_PBS				132
-#define _SC_2_PBS_ACCOUNTING			133
-#define _SC_2_PBS_CHECKPOINT			134
-#define _SC_2_PBS_LOCATE			135
-#define _SC_2_PBS_MESSAGE			136
-#define _SC_2_PBS_TRACK				137
-#define _SC_BARRIERS				138
-#define _SC_CLOCK_SELECTION			139
-#define _SC_CPUTIME				140
-#define _SC_MONOTONIC_CLOCK			141
-#define _SC_READER_WRITER_LOCKS			142
-#define _SC_SPAWN				143
-#define _SC_SPIN_LOCKS				144
-#define _SC_SPORADIC_SERVER			145
-#define _SC_THREAD_CPUTIME			146
-#define _SC_THREAD_SPORADIC_SERVER              147
-#define _SC_TIMEOUTS				148
-#define _SC_TRACE				149
-#define _SC_TRACE_EVENT_FILTER			150
-#define _SC_TRACE_INHERIT			151
-#define _SC_TRACE_LOG				152
-#define _SC_TYPED_MEMORY_OBJECTS		153
-#define _SC_IPV6				154
-#define _SC_RAW_SOCKETS				155
-#define _SC_SS_REPL_MAX				156
-#define _SC_TRACE_EVENT_NAME_MAX		157
-#define _SC_TRACE_NAME_MAX			158
-#define _SC_TRACE_SYS_MAX			159
-#define _SC_TRACE_USER_EVENT_MAX		160
-#endif /* _POSIX_C_SOURCE >= 200112L */
-
-#ifdef _ALL_SOURCE
-#define _SC_AIX_UKEYS				161
-#endif /* _ALL_SOURCE */
-
-#endif /* _POSIX_SOURCE */
-
-
-#if _XOPEN_SOURCE_EXTENDED==1
-#ifdef _LARGE_FILES
-#define	ftruncate	ftruncate64
-#define	truncate	truncate64
-#endif
-
-#ifndef _H_LOCKF
-#include <sys/lockf.h>		/* lockf definitions for portability	*/
-#endif
-
-#ifdef _NO_PROTO
-#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE<200112L) || defined(_ALL_SOURCE)
-extern int		brk();
-extern int              getpagesize();
-#ifndef _MSGQSUPPORT
-extern int		__fd_getdtablesize();
-static int		getdtablesize()
-{
-    return __fd_getdtablesize();
-}
-#else
-extern int              getdtablesize();
-#endif /* _MSGQSUPPORT */
-
-extern void             *sbrk();
-#endif /* _POSIX_C_SOURCE<200112L */
-extern int		fchdir();
-extern int		fchown();
-extern int		ftruncate();
-extern long		gethostid();
-extern int		gethostname();
-extern pid_t		getpgid();
-extern pid_t		getsid();
-extern char		*getwd();
-extern int		lchown();
-extern int		readlink();
-extern pid_t		setpgrp();
-extern int		setregid();
-extern int		setreuid();
-extern int		symlink();
-extern void		sync();
-extern int		truncate();
-extern useconds_t	ualarm();
-extern int		usleep();
-extern pid_t		vfork();
-#else /* _NO_PROTO */
-#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE<200112L) || defined(_ALL_SOURCE)
-extern int		brk(void *);
-extern int              getpagesize(void);
-#ifndef _MSGQSUPPORT
-extern int		__fd_getdtablesize(void);
-static int		getdtablesize()
-{
-    return __fd_getdtablesize();
-}
-#else
-extern int              getdtablesize(void);
-#endif /* _MSGQSUPPORT */
-#ifdef _LINUX_SOURCE_COMPAT
-extern void             *sbrk(ptrdiff_t);
-#elif (_XOPEN_SOURCE >= 500) || defined(__64BIT__)
-extern void             *sbrk(intptr_t);
-#else
-extern void             *sbrk(int);
-#endif
-#endif /* _POSIX_C_SOURCE<200112L */
-extern int		fchdir(int);
-extern int		fchown(int, uid_t, gid_t);
-extern int		ftruncate(int, off_t);
-#ifdef _LARGE_FILE_API
-extern int		ftruncate64(int, off64_t);
-#endif
-extern int		gethostname(char *, size_t);
-extern long		gethostid(void);
-extern pid_t		getpgid(pid_t);
-extern pid_t		getsid(pid_t);
-extern char		*getwd(char *);
-extern int		lchown(const char *, uid_t, gid_t);
-
-#if (defined(_SUSV3_READLINK) || \
-     (!defined(_ALL_SOURCE) && (_POSIX_C_SOURCE >= 200112L)))
-/* If SUSV3 readlink specifically requested or if strict SUSv3
- * environment requested */
-#ifdef __64BIT__
-static ssize_t readlink(const char *__restrict__ __path,
-                        char *__restrict__ __buf, size_t __bufsize)
-{
-    extern ssize_t __readlink64(const char *__restrict__, char *__restrict__, size_t);
-    return __readlink64(__path, __buf, __bufsize);
-}
-#else
-extern ssize_t readlink(const char *__restrict__, char *__restrict__, size_t);
-#endif /* __64BIT__ */
-#else
-extern int readlink(const char *, char *, size_t);
-#endif /* _SUSV3_READLINK || !_ALL_SOURCE && _POSIX_C_SOURCE >= 200112L */
-
-#ifndef _BSD
-extern pid_t		setpgrp(void);
-#endif /* _BSD */
-extern int		setregid(gid_t, gid_t);
-extern int		setreuid(uid_t, uid_t);
-extern int		symlink(const char *, const char *);
-extern void		sync(void);
-extern int		truncate(const char *, off_t);
-#ifdef _LARGE_FILE_API
-extern int		truncate64(const char *, off64_t);
-#endif
-extern useconds_t	ualarm(useconds_t, useconds_t);
-extern int		usleep(useconds_t);
-extern pid_t		vfork(void);
-#if _XOPEN_SOURCE>=500
-extern int		getlogin_r(char *, size_t);
-extern int		ttyname_r(int, char *, size_t);
-
-#ifdef _LARGE_FILES
-#define pread		pread64
-#define pwrite		pwrite64
-#endif /* _LARGE_FILES */
-
-extern ssize_t		pread(int, void *, size_t, off_t);
-extern ssize_t		pwrite(int, const void *, size_t, off_t);
-#ifdef _LARGE_FILE_API
-extern ssize_t		pread64(int, void *, size_t, off64_t);
-extern ssize_t		pwrite64(int, const void *, size_t, off64_t);
-#endif /* _LARGE_FILE_API */
-#endif /* _XOPEN_SOURCE>=500 */
-
-#endif /* _NO_PROTO */
-
-#endif /* _XOPEN_SOURCE_EXTENDED */
-
-#ifdef _ALL_SOURCE
-
-extern char **environ;
-
-#ifndef _KERNEL
-#ifdef _NO_PROTO
-extern pid_t		f_fork();
-#else /* _NO_PROTO */
-extern pid_t		f_fork(void);
-#endif /* _NO_PROTO */
-#endif	/* _KERNEL */
-
-#ifdef _NO_PROTO
-extern char *		cuserid();
-extern int		ioctl();
-#ifdef __64BIT__
-extern int		ioctlx();
-extern int		ioctl32();
-extern int		ioctl32x();
-#endif /* __64BIT__ */
-extern int		readx();
-extern int		setgroups();
-extern int		writex();
-extern int		setegid();
-extern int		seteuid();
-extern int		setrgid();
-extern int		setruid();
-extern offset_t		llseek();
-extern char *		getusershell();
-extern void		setusershell();
-extern void		endusershell();
-extern char *		get_current_dir_name();
-extern int		sysfs();
-#else
-extern char *		cuserid(char *);
-extern int		setegid(gid_t);
-extern int		seteuid(uid_t);
-extern int		setrgid(gid_t);
-extern int		setruid(uid_t);
-#ifndef _BSD
-extern int		ioctl(int, int, ...);
-#endif /* _BSD */
-#ifdef __64BIT__
-extern int		ioctlx(int, int, void *, long);
-extern int		ioctl32(int, int, ...);
-extern int		ioctl32x(int, int, unsigned int, unsigned int);
-#endif /* __64BIT__ */
-extern int		setgroups(int, gid_t []);
-#ifndef _KERNEL
-extern int	readx(int, char*, unsigned, long);
-extern int	writex(int, char*, unsigned, long);
-
-#ifdef _LARGE_FILES
-#define fclear fclear64
-#define	fsync_range	fsync_range64
-#endif
-extern off_t	fclear(int, off_t);
-extern int	fsync_range(int, int, off_t, off_t);
-#ifdef _LARGE_FILE_API
-extern off64_t	fclear64(int, off64_t);
-extern int	fsync_range64(int, int, off64_t, off64_t);
-#endif
-extern offset_t llseek(int, offset_t, int);
-extern char *	getusershell(void);
-extern void	setusershell(void);
-extern void	endusershell(void);
-extern char *	get_current_dir_name(void);
-extern int	sysfs(int, ...);
-extern int	finfo(const char *, int, void *, int32long64_t);
-extern int	ffinfo(int, int, void *, int32long64_t);
-
-#endif /* ndef _KERNEL */
-
-#endif /* _NO_PROTO */
-
-#define _AES_OS_VERSION 1               /* OSF, AES version */
-
-#endif /* _ALL_SOURCE */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _H_UNISTD */
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/Makefile b/hpvm/test/parboil/benchmarks/kmeans/src/visc/Makefile
deleted file mode 100755
index 5ea2472c7d..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=cluster.ll getopt.ll read_input.ll kmeans_clustering.ll rmse.ll #compute_gold.o
-VISC_OBJS=kmeans.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS= -O2
-APP_CXXFLAGS= -O2
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/README b/hpvm/test/parboil/benchmarks/kmeans/src/visc/README
deleted file mode 100755
index 05c5443196..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/README
+++ /dev/null
@@ -1,9 +0,0 @@
-NOTE: The current Kmeans implementation doesn't use texture/constant memories, and is different from the CUDA implementation.
-
-******Adjustable work group size*****
-RD_WG_SIZE_0 or RD_WG_SIZE_0_0 for kernel_swap
-RD_WG_SIZE_1 or RD_WG_SIZE_1_0 for kernel_kmeans 
-
-USAGE:
-make clean
-make KERNEL_DIM="-DRD_WG_SIZE_0=128 -DRD_WG_SIZE_1=512"
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/cluster.c b/hpvm/test/parboil/benchmarks/kmeans/src/visc/cluster.c
deleted file mode 100644
index 987e874ab3..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/cluster.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*****************************************************************************/
-/*IMPORTANT:  READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.         */
-/*By downloading, copying, installing or using the software you agree        */
-/*to this license.  If you do not agree to this license, do not download,    */
-/*install, copy or use the software.                                         */
-/*                                                                           */
-/*                                                                           */
-/*Copyright (c) 2005 Northwestern University                                 */
-/*All rights reserved.                                                       */
-
-/*Redistribution of the software in source and binary forms,                 */
-/*with or without modification, is permitted provided that the               */
-/*following conditions are met:                                              */
-/*                                                                           */
-/*1       Redistributions of source code must retain the above copyright     */
-/*        notice, this list of conditions and the following disclaimer.      */
-/*                                                                           */
-/*2       Redistributions in binary form must reproduce the above copyright   */
-/*        notice, this list of conditions and the following disclaimer in the */
-/*        documentation and/or other materials provided with the distribution.*/
-/*                                                                            */
-/*3       Neither the name of Northwestern University nor the names of its    */
-/*        contributors may be used to endorse or promote products derived     */
-/*        from this software without specific prior written permission.       */
-/*                                                                            */
-/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS    */
-/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED      */
-/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND         */
-/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL          */
-/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,       */
-/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
-/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR          */
-/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)          */
-/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,         */
-/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN    */
-/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
-/*POSSIBILITY OF SUCH DAMAGE.                                                 */
-/******************************************************************************/
-
-/*************************************************************************/
-/**   File:         cluster.c                                           **/
-/**   Description:  Takes as input a file, containing 1 data point per  **/
-/**                 per line, and performs a fuzzy c-means clustering   **/
-/**                 on the data. Fuzzy clustering is performed using    **/
-/**                 min to max clusters and the clustering that gets    **/
-/**                 the best score according to a compactness and       **/
-/**                 separation criterion are returned.                  **/
-/**   Author:  Brendan McCane                                           **/
-/**            James Cook University of North Queensland.               **/
-/**            Australia. email: mccane@cs.jcu.edu.au                   **/
-/**                                                                     **/
-/**   Edited by: Jay Pisharath, Wei-keng Liao                           **/
-/**              Northwestern University.                               **/
-/**																		**/
-/**   ================================================================  **/
-/**																		**/
-/**   Edited by: Shuai Che, David Tarjan, Sang-Ha Lee					**/
-/**				 University of Virginia									**/
-/**																		**/
-/**   Description:	No longer supports fuzzy c-means clustering;	 	**/
-/**					only regular k-means clustering.					**/
-/**					No longer performs "validity" function to analyze	**/
-/**					compactness and separation crietria; instead		**/
-/**					calculate root mean squared error.					**/
-/**                                                                     **/
-/*************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <math.h>
-#include <float.h>
-
-#include "kmeans.h"
-
-float	min_rmse_ref = FLT_MAX;
-extern double wtime(void);
-/* reference min_rmse value */
-
-/*---< cluster() >-----------------------------------------------------------*/
-int cluster(int      npoints,				/* number of data points */
-            int      nfeatures,				/* number of attributes for each point */
-            float  **features,			/* array: [npoints][nfeatures] */
-            int      min_nclusters,			/* range of min to max number of clusters */
-            int		 max_nclusters,
-            float    threshold,				/* loop terminating factor */
-            int     *best_nclusters,		/* out: number between min and max with lowest RMSE */
-            float ***cluster_centres,		/* out: [best_nclusters][nfeatures] */
-            float	*min_rmse,				/* out: minimum RMSE */
-            int		 isRMSE,				/* calculate RMSE */
-            int		 nloops					/* number of iteration for each number of clusters */
-           )
-{
-    int		nclusters;						/* number of clusters k */
-    int		index =0;						/* number of iteration to reach the best RMSE */
-    int		rmse;							/* RMSE for each clustering */
-    int    *membership;						/* which cluster a data point belongs to */
-    float **tmp_cluster_centres;			/* hold coordinates of cluster centers */
-    int		i;
-
-    /* allocate memory for membership */
-    membership = (int*) malloc(npoints * sizeof(int));
-
-    /* sweep k from min to max_nclusters to find the best number of clusters */
-    for(nclusters = min_nclusters; nclusters <= max_nclusters; nclusters++)
-    {
-        if (nclusters > npoints) break;	/* cannot have more clusters than points */
-
-        /* allocate device memory, invert data array (@ kmeans_cuda.cu) */
-        allocate(npoints, nfeatures, nclusters, features);
-
-        /* iterate nloops times for each number of clusters */
-        for(i = 0; i < nloops; i++)
-        {
-            /* initialize initial cluster centers, CUDA calls (@ kmeans_cuda.cu) */
-            tmp_cluster_centres = kmeans_clustering(features,
-                                                    nfeatures,
-                                                    npoints,
-                                                    nclusters,
-                                                    threshold,
-                                                    membership);
-
-            if (*cluster_centres) {
-                llvm_visc_untrack_mem((*cluster_centres)[0]);
-                free((*cluster_centres)[0]);
-                free(*cluster_centres);
-            }
-            *cluster_centres = tmp_cluster_centres;
-
-
-            /* find the number of clusters with the best RMSE */
-            if(isRMSE)
-            {
-                rmse = rms_err(features,
-                               nfeatures,
-                               npoints,
-                               tmp_cluster_centres,
-                               nclusters);
-
-                if(rmse < min_rmse_ref) {
-                    min_rmse_ref = rmse;			//update reference min RMSE
-                    *min_rmse = min_rmse_ref;		//update return min RMSE
-                    *best_nclusters = nclusters;	//update optimum number of clusters
-                    index = i;						//update number of iteration to reach best RMSE
-                }
-            }
-        }
-
-        deallocateMemory();							/* free device memory (@ kmeans_cuda.cu) */
-    }
-
-    free(membership);
-
-    return index;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/getopt.c b/hpvm/test/parboil/benchmarks/kmeans/src/visc/getopt.c
deleted file mode 100644
index 23262ae282..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/getopt.c
+++ /dev/null
@@ -1,1184 +0,0 @@
-/* Getopt for GNU.
-   NOTE: getopt is now part of the C library, so if you don't know what
-   "Keep this file name-space clean" means, talk to drepper@gnu.org
-   before changing it!
-   Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001
-        Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>.
-   Ditto for AIX 3.2 and <stdlib.h>.  */
-#ifndef _NO_PROTO
-# define _NO_PROTO
-#endif
-
-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-#if !defined __STDC__ || !__STDC__
-/* This is a separate conditional since some stdc systems
-   reject `defined (const)'.  */
-# ifndef const
-#  define const
-# endif
-#endif
-
-#include <stdio.h>
-
-/* Comment out all this code if we are using the GNU C Library, and are not
-   actually compiling the library itself.  This code is part of the GNU C
-   Library, but also included in many other GNU distributions.  Compiling
-   and linking in this code is a waste when using the GNU C library
-   (especially if it is a shared library).  Rather than having every GNU
-   program understand `configure --with-gnu-libc' and omit the object files,
-   it is simpler to just do this in the source for each such file.  */
-
-#define GETOPT_INTERFACE_VERSION 2
-#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2
-# include <gnu-versions.h>
-# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION
-#  define ELIDE_CODE
-# endif
-#endif
-
-#ifndef ELIDE_CODE
-
-
-/* This needs to come after some library #include
-   to get __GNU_LIBRARY__ defined.  */
-#ifdef  __GNU_LIBRARY__
-/* Don't include stdlib.h for non-GNU C libraries because some of them
-   contain conflicting prototypes for getopt.  */
-# include <stdlib.h>
-# include <unistd.h>
-#endif  /* GNU C library.  */
-
-#ifdef VMS
-# include <unixlib.h>
-# if HAVE_STRING_H - 0
-#  include <string.h>
-# endif
-#endif
-
-#ifndef _
-/* This is for other GNU distributions with internationalized messages.  */
-# if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
-#  include <libintl.h>
-#  ifndef _
-#   define _(msgid)     gettext (msgid)
-#  endif
-# else
-#  define _(msgid)      (msgid)
-# endif
-# if defined _LIBC && defined USE_IN_LIBIO
-#  include <wchar.h>
-# endif
-#endif
-
-/* This version of `getopt' appears to the caller like standard Unix `getopt'
-   but it behaves differently for the user, since it allows the user
-   to intersperse the options with the other arguments.
-
-   As `getopt' works, it permutes the elements of ARGV so that,
-   when it is done, all the options precede everything else.  Thus
-   all application programs are extended to handle flexible argument order.
-
-   Setting the environment variable POSIXLY_CORRECT disables permutation.
-   Then the behavior is completely standard.
-
-   GNU application programs can use a third alternative mode in which
-   they can distinguish the relative order of options and other arguments.  */
-
-#include "getopt.h"
-
-/* For communication from `getopt' to the caller.
-   When `getopt' finds an option that takes an argument,
-   the argument value is returned here.
-   Also, when `ordering' is RETURN_IN_ORDER,
-   each non-option ARGV-element is returned here.  */
-
-char *optarg;
-
-/* Index in ARGV of the next element to be scanned.
-   This is used for communication to and from the caller
-   and for communication between successive calls to `getopt'.
-
-   On entry to `getopt', zero means this is the first call; initialize.
-
-   When `getopt' returns -1, this is the index of the first of the
-   non-option elements that the caller should itself scan.
-
-   Otherwise, `optind' communicates from one call to the next
-   how much of ARGV has been scanned so far.  */
-
-/* 1003.2 says this must be 1 before any call.  */
-int optind = 1;
-
-/* Formerly, initialization of getopt depended on optind==0, which
-   causes problems with re-calling getopt as programs generally don't
-   know that. */
-
-int __getopt_initialized;
-
-/* The next char to be scanned in the option-element
-   in which the last option character we returned was found.
-   This allows us to pick up the scan where we left off.
-
-   If this is zero, or a null string, it means resume the scan
-   by advancing to the next ARGV-element.  */
-
-static char *nextchar;
-
-/* Callers store zero here to inhibit the error message
-   for unrecognized options.  */
-
-int opterr = 1;
-
-/* Set to an option character which was unrecognized.
-   This must be initialized on some systems to avoid linking in the
-   system's own getopt implementation.  */
-
-int optopt = '?';
-
-/* Describe how to deal with options that follow non-option ARGV-elements.
-
-   If the caller did not specify anything,
-   the default is REQUIRE_ORDER if the environment variable
-   POSIXLY_CORRECT is defined, PERMUTE otherwise.
-
-   REQUIRE_ORDER means don't recognize them as options;
-   stop option processing when the first non-option is seen.
-   This is what Unix does.
-   This mode of operation is selected by either setting the environment
-   variable POSIXLY_CORRECT, or using `+' as the first character
-   of the list of option characters.
-
-   PERMUTE is the default.  We permute the contents of ARGV as we scan,
-   so that eventually all the non-options are at the end.  This allows options
-   to be given in any order, even with programs that were not written to
-   expect this.
-
-   RETURN_IN_ORDER is an option available to programs that were written
-   to expect options and other ARGV-elements in any order and that care about
-   the ordering of the two.  We describe each non-option ARGV-element
-   as if it were the argument of an option with character code 1.
-   Using `-' as the first character of the list of option characters
-   selects this mode of operation.
-
-   The special argument `--' forces an end of option-scanning regardless
-   of the value of `ordering'.  In the case of RETURN_IN_ORDER, only
-   `--' can cause `getopt' to return -1 with `optind' != ARGC.  */
-
-static enum
-{
-    REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER
-} ordering;
-
-/* Value of POSIXLY_CORRECT environment variable.  */
-static char *posixly_correct;
-
-#ifdef  __GNU_LIBRARY__
-/* We want to avoid inclusion of string.h with non-GNU libraries
-   because there are many ways it can cause trouble.
-   On some systems, it contains special magic macros that don't work
-   in GCC.  */
-# include <string.h>
-# define my_index       strchr
-#else
-
-//# if HAVE_STRING_H || WIN32 /* Pete Wilson mod 7/28/02 */
-#  include <string.h>
-//# else
-//#  include <strings.h>
-//# endif
-
-/* Avoid depending on library functions or files
-   whose names are inconsistent.  */
-
-#ifndef getenv
-extern char *getenv ();
-#endif
-
-static char *
-my_index (str, chr)
-const char *str;
-int chr;
-{
-    while (*str)
-    {
-        if (*str == chr)
-            return (char *) str;
-        str++;
-    }
-    return 0;
-}
-
-/* If using GCC, we can safely declare strlen this way.
-   If not using GCC, it is ok not to declare it.  */
-#ifdef __GNUC__
-/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h.
-   That was relevant to code that was here before.  */
-# if (!defined __STDC__ || !__STDC__) && !defined strlen
-/* gcc with -traditional declares the built-in strlen to return int,
-   and has done so at least since version 2.4.5. -- rms.  */
-extern int strlen (const char *);
-# endif /* not __STDC__ */
-#endif /* __GNUC__ */
-
-#endif /* not __GNU_LIBRARY__ */
-
-/* Handle permutation of arguments.  */
-
-/* Describe the part of ARGV that contains non-options that have
-   been skipped.  `first_nonopt' is the index in ARGV of the first of them;
-   `last_nonopt' is the index after the last of them.  */
-
-static int first_nonopt;
-static int last_nonopt;
-
-#ifdef _LIBC
-/* Stored original parameters.
-   XXX This is no good solution.  We should rather copy the args so
-   that we can compare them later.  But we must not use malloc(3).  */
-extern int __libc_argc;
-extern char **__libc_argv;
-
-/* Bash 2.0 gives us an environment variable containing flags
-   indicating ARGV elements that should not be considered arguments.  */
-
-# ifdef USE_NONOPTION_FLAGS
-/* Defined in getopt_init.c  */
-extern char *__getopt_nonoption_flags;
-
-static int nonoption_flags_max_len;
-static int nonoption_flags_len;
-# endif
-
-# ifdef USE_NONOPTION_FLAGS
-#  define SWAP_FLAGS(ch1, ch2) \
-  if (nonoption_flags_len > 0)                                                \
-    {                                                                         \
-      char __tmp = __getopt_nonoption_flags[ch1];                             \
-      __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2];          \
-      __getopt_nonoption_flags[ch2] = __tmp;                                  \
-    }
-# else
-#  define SWAP_FLAGS(ch1, ch2)
-# endif
-#else   /* !_LIBC */
-# define SWAP_FLAGS(ch1, ch2)
-#endif  /* _LIBC */
-
-/* Exchange two adjacent subsequences of ARGV.
-   One subsequence is elements [first_nonopt,last_nonopt)
-   which contains all the non-options that have been skipped so far.
-   The other is elements [last_nonopt,optind), which contains all
-   the options processed since those non-options were skipped.
-
-   `first_nonopt' and `last_nonopt' are relocated so that they describe
-   the new indices of the non-options in ARGV after they are moved.  */
-
-#if defined __STDC__ && __STDC__
-static void exchange (char **);
-#endif
-
-static void
-exchange (argv)
-char **argv;
-{
-    int bottom = first_nonopt;
-    int middle = last_nonopt;
-    int top = optind;
-    char *tem;
-
-    /* Exchange the shorter segment with the far end of the longer segment.
-       That puts the shorter segment into the right place.
-       It leaves the longer segment in the right place overall,
-       but it consists of two parts that need to be swapped next.  */
-
-#if defined _LIBC && defined USE_NONOPTION_FLAGS
-    /* First make sure the handling of the `__getopt_nonoption_flags'
-       string can work normally.  Our top argument must be in the range
-       of the string.  */
-    if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len)
-    {
-        /* We must extend the array.  The user plays games with us and
-           presents new arguments.  */
-        char *new_str = malloc (top + 1);
-        if (new_str == NULL)
-            nonoption_flags_len = nonoption_flags_max_len = 0;
-        else
-        {
-            memset (__mempcpy (new_str, __getopt_nonoption_flags,
-                               nonoption_flags_max_len),
-                    '\0', top + 1 - nonoption_flags_max_len);
-            nonoption_flags_max_len = top + 1;
-            __getopt_nonoption_flags = new_str;
-        }
-    }
-#endif
-
-    while (top > middle && middle > bottom)
-    {
-        if (top - middle > middle - bottom)
-        {
-            /* Bottom segment is the short one.  */
-            int len = middle - bottom;
-            register int i;
-
-            /* Swap it with the top part of the top segment.  */
-            for (i = 0; i < len; i++)
-            {
-                tem = argv[bottom + i];
-                argv[bottom + i] = argv[top - (middle - bottom) + i];
-                argv[top - (middle - bottom) + i] = tem;
-                SWAP_FLAGS (bottom + i, top - (middle - bottom) + i);
-            }
-            /* Exclude the moved bottom segment from further swapping.  */
-            top -= len;
-        }
-        else
-        {
-            /* Top segment is the short one.  */
-            int len = top - middle;
-            register int i;
-
-            /* Swap it with the bottom part of the bottom segment.  */
-            for (i = 0; i < len; i++)
-            {
-                tem = argv[bottom + i];
-                argv[bottom + i] = argv[middle + i];
-                argv[middle + i] = tem;
-                SWAP_FLAGS (bottom + i, middle + i);
-            }
-            /* Exclude the moved top segment from further swapping.  */
-            bottom += len;
-        }
-    }
-
-    /* Update records for the slots the non-options now occupy.  */
-
-    first_nonopt += (optind - last_nonopt);
-    last_nonopt = optind;
-}
-
-/* Initialize the internal data when the first call is made.  */
-
-#if defined __STDC__ && __STDC__
-static const char *_getopt_initialize (int, char *const *, const char *);
-#endif
-static const char *
-_getopt_initialize (argc, argv, optstring)
-int argc;
-char *const *argv;
-const char *optstring;
-{
-    /* Start processing options with ARGV-element 1 (since ARGV-element 0
-       is the program name); the sequence of previously skipped
-       non-option ARGV-elements is empty.  */
-
-    first_nonopt = last_nonopt = optind;
-
-    nextchar = NULL;
-
-    posixly_correct = getenv ("POSIXLY_CORRECT");
-
-    /* Determine how to handle the ordering of options and nonoptions.  */
-
-    if (optstring[0] == '-')
-    {
-        ordering = RETURN_IN_ORDER;
-        ++optstring;
-    }
-    else if (optstring[0] == '+')
-    {
-        ordering = REQUIRE_ORDER;
-        ++optstring;
-    }
-    else if (posixly_correct != NULL)
-        ordering = REQUIRE_ORDER;
-    else
-        ordering = PERMUTE;
-
-#if defined _LIBC && defined USE_NONOPTION_FLAGS
-    if (posixly_correct == NULL
-            && argc == __libc_argc && argv == __libc_argv)
-    {
-        if (nonoption_flags_max_len == 0)
-        {
-            if (__getopt_nonoption_flags == NULL
-                    || __getopt_nonoption_flags[0] == '\0')
-                nonoption_flags_max_len = -1;
-            else
-            {
-                const char *orig_str = __getopt_nonoption_flags;
-                int len = nonoption_flags_max_len = strlen (orig_str);
-                if (nonoption_flags_max_len < argc)
-                    nonoption_flags_max_len = argc;
-                __getopt_nonoption_flags =
-                    (char *) malloc (nonoption_flags_max_len);
-                if (__getopt_nonoption_flags == NULL)
-                    nonoption_flags_max_len = -1;
-                else
-                    memset (__mempcpy (__getopt_nonoption_flags, orig_str, len),
-                            '\0', nonoption_flags_max_len - len);
-            }
-        }
-        nonoption_flags_len = nonoption_flags_max_len;
-    }
-    else
-        nonoption_flags_len = 0;
-#endif
-
-    return optstring;
-}
-
-/* Scan elements of ARGV (whose length is ARGC) for option characters
-   given in OPTSTRING.
-
-   If an element of ARGV starts with '-', and is not exactly "-" or "--",
-   then it is an option element.  The characters of this element
-   (aside from the initial '-') are option characters.  If `getopt'
-   is called repeatedly, it returns successively each of the option characters
-   from each of the option elements.
-
-   If `getopt' finds another option character, it returns that character,
-   updating `optind' and `nextchar' so that the next call to `getopt' can
-   resume the scan with the following option character or ARGV-element.
-
-   If there are no more option characters, `getopt' returns -1.
-   Then `optind' is the index in ARGV of the first ARGV-element
-   that is not an option.  (The ARGV-elements have been permuted
-   so that those that are not options now come last.)
-
-   OPTSTRING is a string containing the legitimate option characters.
-   If an option character is seen that is not listed in OPTSTRING,
-   return '?' after printing an error message.  If you set `opterr' to
-   zero, the error message is suppressed but we still return '?'.
-
-   If a char in OPTSTRING is followed by a colon, that means it wants an arg,
-   so the following text in the same ARGV-element, or the text of the following
-   ARGV-element, is returned in `optarg'.  Two colons mean an option that
-   wants an optional arg; if there is text in the current ARGV-element,
-   it is returned in `optarg', otherwise `optarg' is set to zero.
-
-   If OPTSTRING starts with `-' or `+', it requests different methods of
-   handling the non-option ARGV-elements.
-   See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above.
-
-   Long-named options begin with `--' instead of `-'.
-   Their names may be abbreviated as long as the abbreviation is unique
-   or is an exact match for some defined option.  If they have an
-   argument, it follows the option name in the same ARGV-element, separated
-   from the option name by a `=', or else the in next ARGV-element.
-   When `getopt' finds a long-named option, it returns 0 if that option's
-   `flag' field is nonzero, the value of the option's `val' field
-   if the `flag' field is zero.
-
-   The elements of ARGV aren't really const, because we permute them.
-   But we pretend they're const in the prototype to be compatible
-   with other systems.
-
-   LONGOPTS is a vector of `struct option' terminated by an
-   element containing a name which is zero.
-
-   LONGIND returns the index in LONGOPT of the long-named option found.
-   It is only valid when a long-named option has been found by the most
-   recent call.
-
-   If LONG_ONLY is nonzero, '-' as well as '--' can introduce
-   long-named options.  */
-
-int
-_getopt_internal (argc, argv, optstring, longopts, longind, long_only)
-int argc;
-char *const *argv;
-const char *optstring;
-const struct option *longopts;
-int *longind;
-int long_only;
-{
-    int print_errors = opterr;
-    if (optstring[0] == ':')
-        print_errors = 0;
-
-    if (argc < 1)
-        return -1;
-
-    optarg = NULL;
-
-    if (optind == 0 || !__getopt_initialized)
-    {
-        if (optind == 0)
-            optind = 1;     /* Don't scan ARGV[0], the program name.  */
-        optstring = _getopt_initialize (argc, argv, optstring);
-        __getopt_initialized = 1;
-    }
-
-    /* Test whether ARGV[optind] points to a non-option argument.
-       Either it does not have option syntax, or there is an environment flag
-       from the shell indicating it is not an option.  The later information
-       is only used when the used in the GNU libc.  */
-#if defined _LIBC && defined USE_NONOPTION_FLAGS
-# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0'       \
-                      || (optind < nonoption_flags_len                        \
-                          && __getopt_nonoption_flags[optind] == '1'))
-#else
-# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0')
-#endif
-
-    if (nextchar == NULL || *nextchar == '\0')
-    {
-        /* Advance to the next ARGV-element.  */
-
-        /* Give FIRST_NONOPT and LAST_NONOPT rational values if OPTIND has been
-           moved back by the user (who may also have changed the arguments).  */
-        if (last_nonopt > optind)
-            last_nonopt = optind;
-        if (first_nonopt > optind)
-            first_nonopt = optind;
-
-        if (ordering == PERMUTE)
-        {
-            /* If we have just processed some options following some non-options,
-               exchange them so that the options come first.  */
-
-            if (first_nonopt != last_nonopt && last_nonopt != optind)
-                exchange ((char **) argv);
-            else if (last_nonopt != optind)
-                first_nonopt = optind;
-
-            /* Skip any additional non-options
-               and extend the range of non-options previously skipped.  */
-
-            while (optind < argc && NONOPTION_P)
-                optind++;
-            last_nonopt = optind;
-        }
-
-        /* The special ARGV-element `--' means premature end of options.
-           Skip it like a null option,
-           then exchange with previous non-options as if it were an option,
-           then skip everything else like a non-option.  */
-
-        if (optind != argc && !strcmp (argv[optind], "--"))
-        {
-            optind++;
-
-            if (first_nonopt != last_nonopt && last_nonopt != optind)
-                exchange ((char **) argv);
-            else if (first_nonopt == last_nonopt)
-                first_nonopt = optind;
-            last_nonopt = argc;
-
-            optind = argc;
-        }
-
-        /* If we have done all the ARGV-elements, stop the scan
-           and back over any non-options that we skipped and permuted.  */
-
-        if (optind == argc)
-        {
-            /* Set the next-arg-index to point at the non-options
-               that we previously skipped, so the caller will digest them.  */
-            if (first_nonopt != last_nonopt)
-                optind = first_nonopt;
-            return -1;
-        }
-
-        /* If we have come to a non-option and did not permute it,
-           either stop the scan or describe it to the caller and pass it by.  */
-
-        if (NONOPTION_P)
-        {
-            if (ordering == REQUIRE_ORDER)
-                return -1;
-            optarg = argv[optind++];
-            return 1;
-        }
-
-        /* We have found another option-ARGV-element.
-           Skip the initial punctuation.  */
-
-        nextchar = (argv[optind] + 1
-                    + (longopts != NULL && argv[optind][1] == '-'));
-    }
-
-    /* Decode the current option-ARGV-element.  */
-
-    /* Check whether the ARGV-element is a long option.
-
-       If long_only and the ARGV-element has the form "-f", where f is
-       a valid short option, don't consider it an abbreviated form of
-       a long option that starts with f.  Otherwise there would be no
-       way to give the -f short option.
-
-       On the other hand, if there's a long option "fubar" and
-       the ARGV-element is "-fu", do consider that an abbreviation of
-       the long option, just like "--fu", and not "-f" with arg "u".
-
-       This distinction seems to be the most useful approach.  */
-
-    if (longopts != NULL
-            && (argv[optind][1] == '-'
-                || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1])))))
-    {
-        char *nameend;
-        const struct option *p;
-        const struct option *pfound = NULL;
-        int exact = 0;
-        int ambig = 0;
-        int indfound = -1;
-        int option_index;
-
-        for (nameend = nextchar; *nameend && *nameend != '='; nameend++)
-            /* Do nothing.  */ ;
-
-        /* Test all long options for either exact match
-           or abbreviated matches.  */
-        for (p = longopts, option_index = 0; p->name; p++, option_index++)
-            if (!strncmp (p->name, nextchar, nameend - nextchar))
-            {
-                if ((unsigned int) (nameend - nextchar)
-                        == (unsigned int) strlen (p->name))
-                {
-                    /* Exact match found.  */
-                    pfound = p;
-                    indfound = option_index;
-                    exact = 1;
-                    break;
-                }
-                else if (pfound == NULL)
-                {
-                    /* First nonexact match found.  */
-                    pfound = p;
-                    indfound = option_index;
-                }
-                else if (long_only
-                         || pfound->has_arg != p->has_arg
-                         || pfound->flag != p->flag
-                         || pfound->val != p->val)
-                    /* Second or later nonexact match found.  */
-                    ambig = 1;
-            }
-
-        if (ambig && !exact)
-        {
-            if (print_errors)
-            {
-#if defined _LIBC && defined USE_IN_LIBIO
-                char *buf;
-
-                __asprintf (&buf, _("%s: option `%s' is ambiguous\n"),
-                            argv[0], argv[optind]);
-
-                if (_IO_fwide (stderr, 0) > 0)
-                    __fwprintf (stderr, L"%s", buf);
-                else
-                    fputs (buf, stderr);
-
-                free (buf);
-#else
-                fprintf (stderr, _("%s: option `%s' is ambiguous\n"),
-                         argv[0], argv[optind]);
-#endif
-            }
-            nextchar += strlen (nextchar);
-            optind++;
-            optopt = 0;
-            return '?';
-        }
-
-        if (pfound != NULL)
-        {
-            option_index = indfound;
-            optind++;
-            if (*nameend)
-            {
-                /* Don't test has_arg with >, because some C compilers don't
-                   allow it to be used on enums.  */
-                if (pfound->has_arg)
-                    optarg = nameend + 1;
-                else
-                {
-                    if (print_errors)
-                    {
-#if defined _LIBC && defined USE_IN_LIBIO
-                        char *buf;
-#endif
-
-                        if (argv[optind - 1][1] == '-')
-                        {
-                            /* --option */
-#if defined _LIBC && defined USE_IN_LIBIO
-                            __asprintf (&buf, _("\
-%s: option `--%s' doesn't allow an argument\n"),
-                                        argv[0], pfound->name);
-#else
-                            fprintf (stderr, _("\
-%s: option `--%s' doesn't allow an argument\n"),
-                                     argv[0], pfound->name);
-#endif
-                        }
-                        else
-                        {
-                            /* +option or -option */
-#if defined _LIBC && defined USE_IN_LIBIO
-                            __asprintf (&buf, _("\
-%s: option `%c%s' doesn't allow an argument\n"),
-                                        argv[0], argv[optind - 1][0],
-                                        pfound->name);
-#else
-                            fprintf (stderr, _("\
-%s: option `%c%s' doesn't allow an argument\n"),
-                                     argv[0], argv[optind - 1][0], pfound->name);
-#endif
-                        }
-
-#if defined _LIBC && defined USE_IN_LIBIO
-                        if (_IO_fwide (stderr, 0) > 0)
-                            __fwprintf (stderr, L"%s", buf);
-                        else
-                            fputs (buf, stderr);
-
-                        free (buf);
-#endif
-                    }
-
-                    nextchar += strlen (nextchar);
-
-                    optopt = pfound->val;
-                    return '?';
-                }
-            }
-            else if (pfound->has_arg == 1)
-            {
-                if (optind < argc)
-                    optarg = argv[optind++];
-                else
-                {
-                    if (print_errors)
-                    {
-#if defined _LIBC && defined USE_IN_LIBIO
-                        char *buf;
-
-                        __asprintf (&buf,
-                                    _("%s: option `%s' requires an argument\n"),
-                                    argv[0], argv[optind - 1]);
-
-                        if (_IO_fwide (stderr, 0) > 0)
-                            __fwprintf (stderr, L"%s", buf);
-                        else
-                            fputs (buf, stderr);
-
-                        free (buf);
-#else
-                        fprintf (stderr,
-                                 _("%s: option `%s' requires an argument\n"),
-                                 argv[0], argv[optind - 1]);
-#endif
-                    }
-                    nextchar += strlen (nextchar);
-                    optopt = pfound->val;
-                    return optstring[0] == ':' ? ':' : '?';
-                }
-            }
-            nextchar += strlen (nextchar);
-            if (longind != NULL)
-                *longind = option_index;
-            if (pfound->flag)
-            {
-                *(pfound->flag) = pfound->val;
-                return 0;
-            }
-            return pfound->val;
-        }
-
-        /* Can't find it as a long option.  If this is not getopt_long_only,
-           or the option starts with '--' or is not a valid short
-           option, then it's an error.
-           Otherwise interpret it as a short option.  */
-        if (!long_only || argv[optind][1] == '-'
-                || my_index (optstring, *nextchar) == NULL)
-        {
-            if (print_errors)
-            {
-#if defined _LIBC && defined USE_IN_LIBIO
-                char *buf;
-#endif
-
-                if (argv[optind][1] == '-')
-                {
-                    /* --option */
-#if defined _LIBC && defined USE_IN_LIBIO
-                    __asprintf (&buf, _("%s: unrecognized option `--%s'\n"),
-                                argv[0], nextchar);
-#else
-                    fprintf (stderr, _("%s: unrecognized option `--%s'\n"),
-                             argv[0], nextchar);
-#endif
-                }
-                else
-                {
-                    /* +option or -option */
-#if defined _LIBC && defined USE_IN_LIBIO
-                    __asprintf (&buf, _("%s: unrecognized option `%c%s'\n"),
-                                argv[0], argv[optind][0], nextchar);
-#else
-                    fprintf (stderr, _("%s: unrecognized option `%c%s'\n"),
-                             argv[0], argv[optind][0], nextchar);
-#endif
-                }
-
-#if defined _LIBC && defined USE_IN_LIBIO
-                if (_IO_fwide (stderr, 0) > 0)
-                    __fwprintf (stderr, L"%s", buf);
-                else
-                    fputs (buf, stderr);
-
-                free (buf);
-#endif
-            }
-            nextchar = (char *) "";
-            optind++;
-            optopt = 0;
-            return '?';
-        }
-    }
-
-    /* Look at and handle the next short option-character.  */
-
-    {
-        char c = *nextchar++;
-        char *temp = my_index (optstring, c);
-
-        /* Increment `optind' when we start to process its last character.  */
-        if (*nextchar == '\0')
-            ++optind;
-
-        if (temp == NULL || c == ':')
-        {
-            if (print_errors)
-            {
-#if defined _LIBC && defined USE_IN_LIBIO
-                char *buf;
-#endif
-
-                if (posixly_correct)
-                {
-                    /* 1003.2 specifies the format of this message.  */
-#if defined _LIBC && defined USE_IN_LIBIO
-                    __asprintf (&buf, _("%s: illegal option -- %c\n"),
-                                argv[0], c);
-#else
-                    fprintf (stderr, _("%s: illegal option -- %c\n"), argv[0], c);
-#endif
-                }
-                else
-                {
-#if defined _LIBC && defined USE_IN_LIBIO
-                    __asprintf (&buf, _("%s: invalid option -- %c\n"),
-                                argv[0], c);
-#else
-                    fprintf (stderr, _("%s: invalid option -- %c\n"), argv[0], c);
-#endif
-                }
-
-#if defined _LIBC && defined USE_IN_LIBIO
-                if (_IO_fwide (stderr, 0) > 0)
-                    __fwprintf (stderr, L"%s", buf);
-                else
-                    fputs (buf, stderr);
-
-                free (buf);
-#endif
-            }
-            optopt = c;
-            return '?';
-        }
-        /* Convenience. Treat POSIX -W foo same as long option --foo */
-        if (temp[0] == 'W' && temp[1] == ';')
-        {
-            char *nameend;
-            const struct option *p;
-            const struct option *pfound = NULL;
-            int exact = 0;
-            int ambig = 0;
-            int indfound = 0;
-            int option_index;
-
-            /* This is an option that requires an argument.  */
-            if (*nextchar != '\0')
-            {
-                optarg = nextchar;
-                /* If we end this ARGV-element by taking the rest as an arg,
-                   we must advance to the next element now.  */
-                optind++;
-            }
-            else if (optind == argc)
-            {
-                if (print_errors)
-                {
-                    /* 1003.2 specifies the format of this message.  */
-#if defined _LIBC && defined USE_IN_LIBIO
-                    char *buf;
-
-                    __asprintf (&buf, _("%s: option requires an argument -- %c\n"),
-                                argv[0], c);
-
-                    if (_IO_fwide (stderr, 0) > 0)
-                        __fwprintf (stderr, L"%s", buf);
-                    else
-                        fputs (buf, stderr);
-
-                    free (buf);
-#else
-                    fprintf (stderr, _("%s: option requires an argument -- %c\n"),
-                             argv[0], c);
-#endif
-                }
-                optopt = c;
-                if (optstring[0] == ':')
-                    c = ':';
-                else
-                    c = '?';
-                return c;
-            }
-            else
-                /* We already incremented `optind' once;
-                   increment it again when taking next ARGV-elt as argument.  */
-                optarg = argv[optind++];
-
-            /* optarg is now the argument, see if it's in the
-               table of longopts.  */
-
-            for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++)
-                /* Do nothing.  */ ;
-
-            /* Test all long options for either exact match
-               or abbreviated matches.  */
-            for (p = longopts, option_index = 0; p->name; p++, option_index++)
-                if (!strncmp (p->name, nextchar, nameend - nextchar))
-                {
-                    if ((unsigned int) (nameend - nextchar) == strlen (p->name))
-                    {
-                        /* Exact match found.  */
-                        pfound = p;
-                        indfound = option_index;
-                        exact = 1;
-                        break;
-                    }
-                    else if (pfound == NULL)
-                    {
-                        /* First nonexact match found.  */
-                        pfound = p;
-                        indfound = option_index;
-                    }
-                    else
-                        /* Second or later nonexact match found.  */
-                        ambig = 1;
-                }
-            if (ambig && !exact)
-            {
-                if (print_errors)
-                {
-#if defined _LIBC && defined USE_IN_LIBIO
-                    char *buf;
-
-                    __asprintf (&buf, _("%s: option `-W %s' is ambiguous\n"),
-                                argv[0], argv[optind]);
-
-                    if (_IO_fwide (stderr, 0) > 0)
-                        __fwprintf (stderr, L"%s", buf);
-                    else
-                        fputs (buf, stderr);
-
-                    free (buf);
-#else
-                    fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"),
-                             argv[0], argv[optind]);
-#endif
-                }
-                nextchar += strlen (nextchar);
-                optind++;
-                return '?';
-            }
-            if (pfound != NULL)
-            {
-                option_index = indfound;
-                if (*nameend)
-                {
-                    /* Don't test has_arg with >, because some C compilers don't
-                       allow it to be used on enums.  */
-                    if (pfound->has_arg)
-                        optarg = nameend + 1;
-                    else
-                    {
-                        if (print_errors)
-                        {
-#if defined _LIBC && defined USE_IN_LIBIO
-                            char *buf;
-
-                            __asprintf (&buf, _("\
-%s: option `-W %s' doesn't allow an argument\n"),
-                                        argv[0], pfound->name);
-
-                            if (_IO_fwide (stderr, 0) > 0)
-                                __fwprintf (stderr, L"%s", buf);
-                            else
-                                fputs (buf, stderr);
-
-                            free (buf);
-#else
-                            fprintf (stderr, _("\
-%s: option `-W %s' doesn't allow an argument\n"),
-                                     argv[0], pfound->name);
-#endif
-                        }
-
-                        nextchar += strlen (nextchar);
-                        return '?';
-                    }
-                }
-                else if (pfound->has_arg == 1)
-                {
-                    if (optind < argc)
-                        optarg = argv[optind++];
-                    else
-                    {
-                        if (print_errors)
-                        {
-#if defined _LIBC && defined USE_IN_LIBIO
-                            char *buf;
-
-                            __asprintf (&buf, _("\
-%s: option `%s' requires an argument\n"),
-                                        argv[0], argv[optind - 1]);
-
-                            if (_IO_fwide (stderr, 0) > 0)
-                                __fwprintf (stderr, L"%s", buf);
-                            else
-                                fputs (buf, stderr);
-
-                            free (buf);
-#else
-                            fprintf (stderr,
-                                     _("%s: option `%s' requires an argument\n"),
-                                     argv[0], argv[optind - 1]);
-#endif
-                        }
-                        nextchar += strlen (nextchar);
-                        return optstring[0] == ':' ? ':' : '?';
-                    }
-                }
-                nextchar += strlen (nextchar);
-                if (longind != NULL)
-                    *longind = option_index;
-                if (pfound->flag)
-                {
-                    *(pfound->flag) = pfound->val;
-                    return 0;
-                }
-                return pfound->val;
-            }
-            nextchar = NULL;
-            return 'W';   /* Let the application handle it.   */
-        }
-        if (temp[1] == ':')
-        {
-            if (temp[2] == ':')
-            {
-                /* This is an option that accepts an argument optionally.  */
-                if (*nextchar != '\0')
-                {
-                    optarg = nextchar;
-                    optind++;
-                }
-                else
-                    optarg = NULL;
-                nextchar = NULL;
-            }
-            else
-            {
-                /* This is an option that requires an argument.  */
-                if (*nextchar != '\0')
-                {
-                    optarg = nextchar;
-                    /* If we end this ARGV-element by taking the rest as an arg,
-                       we must advance to the next element now.  */
-                    optind++;
-                }
-                else if (optind == argc)
-                {
-                    if (print_errors)
-                    {
-                        /* 1003.2 specifies the format of this message.  */
-#if defined _LIBC && defined USE_IN_LIBIO
-                        char *buf;
-
-                        __asprintf (&buf,
-                                    _("%s: option requires an argument -- %c\n"),
-                                    argv[0], c);
-
-                        if (_IO_fwide (stderr, 0) > 0)
-                            __fwprintf (stderr, L"%s", buf);
-                        else
-                            fputs (buf, stderr);
-
-                        free (buf);
-#else
-                        fprintf (stderr,
-                                 _("%s: option requires an argument -- %c\n"),
-                                 argv[0], c);
-#endif
-                    }
-                    optopt = c;
-                    if (optstring[0] == ':')
-                        c = ':';
-                    else
-                        c = '?';
-                }
-                else
-                    /* We already incremented `optind' once;
-                       increment it again when taking next ARGV-elt as argument.  */
-                    optarg = argv[optind++];
-                nextchar = NULL;
-            }
-        }
-        return c;
-    }
-}
-
-int
-getopt (argc, argv, optstring)
-int argc;
-char *const *argv;
-const char *optstring;
-{
-    return _getopt_internal (argc, argv, optstring,
-                             (const struct option *) 0,
-                             (int *) 0,
-                             0);
-}
-
-#endif  /* Not ELIDE_CODE.  */
-
-
-/* Compile with -DTEST to make an executable for use in testing
-   the above definition of `getopt'.  */
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/getopt.h b/hpvm/test/parboil/benchmarks/kmeans/src/visc/getopt.h
deleted file mode 100644
index 943432ce30..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/getopt.h
+++ /dev/null
@@ -1,191 +0,0 @@
-
-
-/* getopt.h */
-/* Declarations for getopt.
-   Copyright (C) 1989-1994, 1996-1999, 2001 Free Software
-   Foundation, Inc. This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute
-   it and/or modify it under the terms of the GNU Lesser
-   General Public License as published by the Free Software
-   Foundation; either version 2.1 of the License, or
-   (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will
-   be useful, but WITHOUT ANY WARRANTY; without even the
-   implied warranty of MERCHANTABILITY or FITNESS FOR A
-   PARTICULAR PURPOSE.  See the GNU Lesser General Public
-   License for more details.
-
-   You should have received a copy of the GNU Lesser General
-   Public License along with the GNU C Library; if not, write
-   to the Free Software Foundation, Inc., 59 Temple Place,
-   Suite 330, Boston, MA 02111-1307 USA.  */
-
-
-
-
-
-#ifndef _GETOPT_H
-
-#ifndef __need_getopt
-# define _GETOPT_H 1
-#endif
-
-/* If __GNU_LIBRARY__ is not already defined, either we are being used
-   standalone, or this is the first header included in the source file.
-   If we are being used with glibc, we need to include <features.h>, but
-   that does not exist if we are standalone.  So: if __GNU_LIBRARY__ is
-   not defined, include <ctype.h>, which will pull in <features.h> for us
-   if it's from glibc.  (Why ctype.h?  It's guaranteed to exist and it
-   doesn't flood the namespace with stuff the way some other headers do.)  */
-#if !defined __GNU_LIBRARY__
-# include <ctype.h>
-#endif
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-/* For communication from `getopt' to the caller.
-   When `getopt' finds an option that takes an argument,
-   the argument value is returned here.
-   Also, when `ordering' is RETURN_IN_ORDER,
-   each non-option ARGV-element is returned here.  */
-
-extern char *optarg;
-
-/* Index in ARGV of the next element to be scanned.
-   This is used for communication to and from the caller
-   and for communication between successive calls to `getopt'.
-
-   On entry to `getopt', zero means this is the first call; initialize.
-
-   When `getopt' returns -1, this is the index of the first of the
-   non-option elements that the caller should itself scan.
-
-   Otherwise, `optind' communicates from one call to the next
-   how much of ARGV has been scanned so far.  */
-
-extern int optind;
-
-/* Callers store zero here to inhibit the error message `getopt' prints
-   for unrecognized options.  */
-
-extern int opterr;
-
-/* Set to an option character which was unrecognized.  */
-
-extern int optopt;
-
-#ifndef __need_getopt
-/* Describe the long-named options requested by the application.
-   The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
-   of `struct option' terminated by an element containing a name which is
-   zero.
-
-   The field `has_arg' is:
-   no_argument          (or 0) if the option does not take an argument,
-   required_argument    (or 1) if the option requires an argument,
-   optional_argument    (or 2) if the option takes an optional argument.
-
-   If the field `flag' is not NULL, it points to a variable that is set
-   to the value given in the field `val' when the option is found, but
-   left unchanged if the option is not found.
-
-   To have a long-named option do something other than set an `int' to
-   a compiled-in constant, such as set a value from `optarg', set the
-   option's `flag' field to zero and its `val' field to a nonzero
-   value (the equivalent single-letter option character, if there is
-   one).  For long options that have a zero `flag' field, `getopt'
-   returns the contents of the `val' field.  */
-
-struct option
-{
-# if (defined __STDC__ && __STDC__) || defined __cplusplus
-    const char *name;
-# else
-    char *name;
-# endif
-    /* has_arg can't be an enum because some compilers complain about
-       type mismatches in all the code that assumes it is an int.  */
-    int has_arg;
-    int *flag;
-    int val;
-};
-
-/* Names for the values of the `has_arg' field of `struct option'.  */
-
-# define no_argument            0
-# define required_argument      1
-# define optional_argument      2
-#endif  /* need getopt */
-
-
-/* Get definitions and prototypes for functions to process the
-   arguments in ARGV (ARGC of them, minus the program name) for
-   options given in OPTS.
-
-   Return the option character from OPTS just read.  Return -1 when
-   there are no more options.  For unrecognized options, or options
-   missing arguments, `optopt' is set to the option letter, and '?' is
-   returned.
-
-   The OPTS string is a list of characters which are recognized option
-   letters, optionally followed by colons, specifying that that letter
-   takes an argument, to be placed in `optarg'.
-
-   If a letter in OPTS is followed by two colons, its argument is
-   optional.  This behavior is specific to the GNU `getopt'.
-
-   The argument `--' causes premature termination of argument
-   scanning, explicitly telling `getopt' that there are no more
-   options.
-
-   If OPTS begins with `--', then non-option arguments are treated as
-   arguments to the option '\0'.  This behavior is specific to the GNU
-   `getopt'.  */
-
-#if (defined __STDC__ && __STDC__) || defined __cplusplus
-# ifdef __GNU_LIBRARY__
-/* Many other libraries have conflicting prototypes for getopt, with
-   differences in the consts, in stdlib.h.  To avoid compilation
-   errors, only prototype getopt for the GNU C library.  */
-extern int getopt (int ___argc, char *const *___argv, const char *__shortopts);
-# else /* not __GNU_LIBRARY__ */
-extern int getopt ();
-# endif /* __GNU_LIBRARY__ */
-
-# ifndef __need_getopt
-extern int getopt_long (int ___argc, char *const *___argv,
-                        const char *__shortopts,
-                        const struct option *__longopts, int *__longind);
-extern int getopt_long_only (int ___argc, char *const *___argv,
-                             const char *__shortopts,
-                             const struct option *__longopts, int *__longind);
-
-/* Internal only.  Users should not call this directly.  */
-extern int _getopt_internal (int ___argc, char *const *___argv,
-                             const char *__shortopts,
-                             const struct option *__longopts, int *__longind,
-                             int __long_only);
-# endif
-#else /* not __STDC__ */
-extern int getopt ();
-# ifndef __need_getopt
-extern int getopt_long ();
-extern int getopt_long_only ();
-
-extern int _getopt_internal ();
-# endif
-#endif /* __STDC__ */
-
-#ifdef  __cplusplus
-}
-#endif
-
-/* Make sure we later can get all the definitions and declarations.  */
-#undef __need_getopt
-
-#endif /* getopt.h */
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.cl b/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.cl
deleted file mode 100644
index 88dd2c2e21..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.cl
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef FLT_MAX
-#define FLT_MAX 3.40282347e+38
-#endif
-
-__kernel void
-kmeans_kernel_c(__global float  *feature,
-                __global float  *clusters,
-                __global int    *membership,
-                int     npoints,
-                int     nclusters,
-                int     nfeatures,
-                int		offset,
-                int		size
-               )
-{
-    unsigned int point_id = get_global_id(0);
-    int index = 0;
-    //const unsigned int point_id = get_global_id(0);
-    if (point_id < npoints)
-    {
-        float min_dist=FLT_MAX;
-        for (int i=0; i < nclusters; i++) {
-
-            float dist = 0;
-            float ans  = 0;
-            for (int l=0; l<nfeatures; l++) {
-                ans += (feature[l * npoints + point_id]-clusters[i*nfeatures+l])*
-                       (feature[l * npoints + point_id]-clusters[i*nfeatures+l]);
-            }
-
-            dist = ans;
-            if (dist < min_dist) {
-                min_dist = dist;
-                index    = i;
-
-            }
-        }
-        //printf("%d\n", index);
-        membership[point_id] = index;
-    }
-
-    return;
-}
-
-__kernel void
-kmeans_swap(__global float  *feature,
-            __global float  *feature_swap,
-            int     npoints,
-            int     nfeatures
-           ) {
-
-    unsigned int tid = get_global_id(0);
-    for(int i = 0; i <  nfeatures; i++)
-        feature_swap[i * npoints + tid] = feature[tid * nfeatures + i];
-
-}
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.cpp b/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.cpp
deleted file mode 100644
index ac6c8d11a1..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.cpp
+++ /dev/null
@@ -1,239 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include <iostream>
-#include <string>
-#include "kmeans.h"
-
-#include <parboil.h>
-#include <visc.h>
-
-#ifdef WIN
-#include <windows.h>
-#else
-#include <pthread.h>
-#include <sys/time.h>
-double gettime() {
-    struct timeval t;
-    gettimeofday(&t,NULL);
-    return t.tv_sec+t.tv_usec*1e-6;
-}
-#endif
-
-
-#ifdef NV
-#else
-#endif
-
-#ifndef FLT_MAX
-#define FLT_MAX 3.40282347e+38
-#endif
-
-#ifdef RD_WG_SIZE_0_0
-#define BLOCK_SIZE RD_WG_SIZE_0_0
-#elif defined(RD_WG_SIZE_0)
-#define BLOCK_SIZE RD_WG_SIZE_0
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE RD_WG_SIZE
-#else
-#define BLOCK_SIZE 256
-#endif
-
-#ifdef RD_WG_SIZE_1_0
-#define BLOCK_SIZE2 RD_WG_SIZE_1_0
-#elif defined(RD_WG_SIZE_1)
-#define BLOCK_SIZE2 RD_WG_SIZE_1
-#elif defined(RD_WG_SIZE)
-#define BLOCK_SIZE2 RD_WG_SIZE
-#else
-#define BLOCK_SIZE2 256
-#endif
-
-
-#ifndef FLT_MAX
-#define FLT_MAX 3.40282347e+38
-#endif
-
-void
-kmeans_kernel_c(float  *feature,
-                float  *clusters,
-                int    *membership,
-                int     npoints,
-                int     nclusters,
-                int     nfeatures,
-                int		offset,
-                int		size
-               )
-{
-    __visc__attributes(3, feature, clusters, membership, 1, membership);
-    unsigned int point_id = get_global_id(0);
-    int index = 0;
-    //const unsigned int point_id = get_global_id(0);
-    if (point_id < npoints)
-    {
-        float min_dist=FLT_MAX;
-        for (int i=0; i < nclusters; i++) {
-
-            float dist = 0;
-            float ans  = 0;
-            for (int l=0; l<nfeatures; l++) {
-                ans += (feature[l * npoints + point_id]-clusters[i*nfeatures+l])*
-                       (feature[l * npoints + point_id]-clusters[i*nfeatures+l]);
-            }
-
-            dist = ans;
-            if (dist < min_dist) {
-                min_dist = dist;
-                index    = i;
-
-            }
-        }
-        //printf("%d\n", index);
-        membership[point_id] = index;
-    }
-
-}
-
-void
-kmeans_swap(float  *feature,
-            float  *feature_swap,
-            int     npoints,
-            int     nfeatures
-           ) {
-    __visc__attributes(2, feature, feature_swap, 1, feature_swap);
-
-    unsigned int tid = get_global_id(0);
-    for(int i = 0; i <  nfeatures; i++)
-        feature_swap[i * npoints + tid] = feature[tid * nfeatures + i];
-
-}
-
-// local variables
-
-
-int   *membership_OCL;
-int   *membership_d;
-float *feature_d;
-float *clusters_d;
-float *center_d;
-
-float* feature_swap_d;
-
-static struct pb_TimerSet timers;
-
-__attribute__((noinline)) int allocate(int n_points, int n_features, int n_clusters, float **feature)
-{
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    size_t bytes = n_points * n_features * sizeof(float);
-    feature_swap_d = (float*)malloc (bytes);
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-    llvm_visc_track_mem(feature_swap_d, bytes);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    unsigned global_work[3] = { n_points, 1, 1 };
-    /// Ke Wang adjustable local group size 2013/08/07 10:37:33
-    unsigned local_work_size= BLOCK_SIZE; // work group size is defined by RD_WG_SIZE_0 or RD_WG_SIZE_0_0 2014/06/10 17:00:51
-    if(global_work[0]%local_work_size !=0)
-        global_work[0]=(global_work[0]/local_work_size+1)*local_work_size;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    unsigned swapDFG = __visc__node(kmeans_swap, 2, 1, local_work_size,
-        global_work[0]/local_work_size, 6, feature[0], bytes, (float*)
-        feature_swap_d, bytes, n_points, n_features, 0);
-    __visc__wait(swapDFG);
-
-    //llvm_visc_request_mem(feature_swap_d, n_points * n_features * sizeof(float));
-    //for(int i = 0; i < n_points * n_features; i++) {
-      //std::cout << "feature_swap[" <<i << "] = " << feature_swap_d[i] << "\n";
-    //}
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    membership_OCL = (int*) malloc(n_points * sizeof(int));
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-    llvm_visc_track_mem(membership_OCL, n_points * sizeof(int));
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    return 0;
-}
-
-void deallocateMemory()
-{
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    free(feature_swap_d);
-    free(membership_OCL);
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-    llvm_visc_untrack_mem(feature_swap_d);
-    llvm_visc_untrack_mem(membership_OCL);
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-}
-
-
-int main( int argc, char** argv)
-{
-    
-    printf("WG size of kernel_swap = %d, WG size of kernel_kmeans = %d \n", BLOCK_SIZE, BLOCK_SIZE2);
-
-    setup(argc, argv);
-}
-
-__attribute__((noinline)) int kmeansOCL(float **feature,    /* in: [npoints][nfeatures] */
-              int     n_features,
-              int     n_points,
-              int     n_clusters,
-              int    *membership,
-              float **clusters,
-              int     *new_centers_len,
-              float  **new_centers)
-{
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    int delta = 0;
-    int i, j, k;
-
-    unsigned global_work[3] = { n_points, 1, 1 };
-
-    /// Ke Wang adjustable local group size 2013/08/07 10:37:33
-    unsigned local_work_size=BLOCK_SIZE2; // work group size is defined by RD_WG_SIZE_1 or RD_WG_SIZE_1_0 2014/06/10 17:00:41
-    if(global_work[0]%local_work_size !=0)
-        global_work[0]=(global_work[0]/local_work_size+1)*local_work_size;
-
-    int size = 0;
-    int offset = 0;
-    size_t bytes = n_clusters * n_features * sizeof(float);
-    size_t bytes_feature = n_points * n_features * sizeof(float);
-    size_t bytes_membership = n_points * sizeof(int);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    unsigned kmeansDFG = __visc__node(kmeans_kernel_c, 2, 1, local_work_size,
-        global_work[0]/local_work_size, 11, feature_swap_d, bytes_feature, clusters[0],
-        bytes, membership_OCL, bytes_membership, n_points, n_clusters, n_features,
-        offset, size, 0);
-    __visc__wait(kmeansDFG);
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    llvm_visc_request_mem(membership_OCL, bytes_membership); // read membership into membership_OCL
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    delta = 0;
-    for (i = 0; i < n_points; i++)
-    {
-        int cluster_id = membership_OCL[i];
-        new_centers_len[cluster_id]++;
-        if (membership_OCL[i] != membership[i])
-        {
-            delta++;
-            membership[i] = membership_OCL[i];
-        }
-        for (j = 0; j < n_features; j++)
-        {
-            new_centers[cluster_id][j] += feature[i][j];
-        }
-    }
-
-    return delta;
-}
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.h b/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.h
deleted file mode 100644
index 0397992475..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*****************************************************************************/
-/*IMPORTANT:  READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.         */
-/*By downloading, copying, installing or using the software you agree        */
-/*to this license.  If you do not agree to this license, do not download,    */
-/*install, copy or use the software.                                         */
-/*                                                                           */
-/*                                                                           */
-/*Copyright (c) 2005 Northwestern University                                 */
-/*All rights reserved.                                                       */
-
-/*Redistribution of the software in source and binary forms,                 */
-/*with or without modification, is permitted provided that the               */
-/*following conditions are met:                                              */
-/*                                                                           */
-/*1       Redistributions of source code must retain the above copyright     */
-/*        notice, this list of conditions and the following disclaimer.      */
-/*                                                                           */
-/*2       Redistributions in binary form must reproduce the above copyright   */
-/*        notice, this list of conditions and the following disclaimer in the */
-/*        documentation and/or other materials provided with the distribution.*/
-/*                                                                            */
-/*3       Neither the name of Northwestern University nor the names of its    */
-/*        contributors may be used to endorse or promote products derived     */
-/*        from this software without specific prior written permission.       */
-/*                                                                            */
-/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS    */
-/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED      */
-/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND         */
-/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL          */
-/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,       */
-/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
-/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR          */
-/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)          */
-/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,         */
-/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN    */
-/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
-/*POSSIBILITY OF SUCH DAMAGE.                                                 */
-/******************************************************************************/
-
-#ifndef _H_FUZZY_KMEANS
-#define _H_FUZZY_KMEANS
-
-#ifndef FLT_MAX
-#define FLT_MAX 3.40282347e+38
-#endif
-
-/* rmse.c */
-float   euclid_dist_2        (float*, float*, int);
-int     find_nearest_point   (float* , int, float**, int);
-float	rms_err(float**, int, int, float**, int);
-int     cluster(int, int, float**, int, int, float, int*, float***, float*, int, int);
-#ifdef __cplusplus
-extern "C" {
-#endif
-  int setup(int argc, char** argv);
-  int allocate(int npoints, int nfeatures, int nclusters, float **feature);
-  void deallocateMemory();
-  int kmeansOCL(float **feature, int nfeatures, int npoints, int nclusters, int *membership, float **clusters, int *new_centers_len, float  **new_centers);
-#ifdef __cplusplus
-}
-#endif
-float** kmeans_clustering(float **feature, int nfeatures, int npoints, int nclusters, float threshold, int *membership);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans_clustering.c b/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans_clustering.c
deleted file mode 100644
index 173ffa5e5a..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/kmeans_clustering.c
+++ /dev/null
@@ -1,181 +0,0 @@
-/*****************************************************************************/
-/*IMPORTANT:  READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.         */
-/*By downloading, copying, installing or using the software you agree        */
-/*to this license.  If you do not agree to this license, do not download,    */
-/*install, copy or use the software.                                         */
-/*                                                                           */
-/*                                                                           */
-/*Copyright (c) 2005 Northwestern University                                 */
-/*All rights reserved.                                                       */
-
-/*Redistribution of the software in source and binary forms,                 */
-/*with or without modification, is permitted provided that the               */
-/*following conditions are met:                                              */
-/*                                                                           */
-/*1       Redistributions of source code must retain the above copyright     */
-/*        notice, this list of conditions and the following disclaimer.      */
-/*                                                                           */
-/*2       Redistributions in binary form must reproduce the above copyright   */
-/*        notice, this list of conditions and the following disclaimer in the */
-/*        documentation and/or other materials provided with the distribution.*/
-/*                                                                            */
-/*3       Neither the name of Northwestern University nor the names of its    */
-/*        contributors may be used to endorse or promote products derived     */
-/*        from this software without specific prior written permission.       */
-/*                                                                            */
-/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS    */
-/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED      */
-/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND         */
-/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL          */
-/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,       */
-/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
-/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR          */
-/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)          */
-/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,         */
-/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN    */
-/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
-/*POSSIBILITY OF SUCH DAMAGE.                                                 */
-/******************************************************************************/
-
-/*************************************************************************/
-/**   File:         kmeans_clustering.c                                 **/
-/**   Description:  Implementation of regular k-means clustering        **/
-/**                 algorithm                                           **/
-/**   Author:  Wei-keng Liao                                            **/
-/**            ECE Department, Northwestern University                  **/
-/**            email: wkliao@ece.northwestern.edu                       **/
-/**                                                                     **/
-/**   Edited by: Jay Pisharath                                          **/
-/**              Northwestern University.                               **/
-/**                                                                     **/
-/**   ================================================================  **/
-/**																		**/
-/**   Edited by: Shuai Che, David Tarjan, Sang-Ha Lee					**/
-/**				 University of Virginia									**/
-/**																		**/
-/**   Description:	No longer supports fuzzy c-means clustering;	 	**/
-/**					only regular k-means clustering.					**/
-/**					No longer performs "validity" function to analyze	**/
-/**					compactness and separation crietria; instead		**/
-/**					calculate root mean squared error.					**/
-/**                                                                     **/
-/*************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <float.h>
-#include <math.h>
-
-#include "kmeans.h"
-#include <visc.h>
-
-#define RANDOM_MAX 2147483647
-
-extern double wtime(void);
-
-/*----< kmeans_clustering() >---------------------------------------------*/
-float** kmeans_clustering(float **feature,    /* in: [npoints][nfeatures] */
-                          int     nfeatures,
-                          int     npoints,
-                          int     nclusters,
-                          float   threshold,
-                          int    *membership) /* out: [npoints] */
-{
-    int      i, j, n = 0;				/* counters */
-    int		 loop=0, temp;
-    int     *new_centers_len;	/* [nclusters]: no. of points in each cluster */
-    float    delta;				/* if the point moved */
-    float  **clusters;			/* out: [nclusters][nfeatures] */
-    float  **new_centers;		/* [nclusters][nfeatures] */
-
-    int     *initial;			/* used to hold the index of points not yet selected
-								   prevents the "birthday problem" of dual selection (?)
-								   considered holding initial cluster indices, but changed due to
-								   possible, though unlikely, infinite loops */
-    int      initial_points;
-    int		 c = 0;
-
-    /* nclusters should never be > npoints
-       that would guarantee a cluster without points */
-    if (nclusters > npoints)
-        nclusters = npoints;
-
-    /* allocate space for and initialize returning variable clusters[] */
-    clusters    = (float**) malloc(nclusters *             sizeof(float*));
-    clusters[0] = (float*)  malloc(nclusters * nfeatures * sizeof(float));
-    llvm_visc_track_mem(clusters[0], nclusters * nfeatures * sizeof(float));
-
-    for (i=1; i<nclusters; i++)
-        clusters[i] = clusters[i-1] + nfeatures;
-
-    /* initialize the random clusters */
-    initial = (int *) malloc (npoints * sizeof(int));
-    for (i = 0; i < npoints; i++)
-    {
-        initial[i] = i;
-    }
-    initial_points = npoints;
-
-    /* randomly pick cluster centers */
-    for (i=0; i<nclusters && initial_points >= 0; i++) {
-        //n = (int)rand() % initial_points;
-
-        for (j=0; j<nfeatures; j++)
-            clusters[i][j] = feature[initial[n]][j];	// remapped
-
-        /* swap the selected index to the end (not really necessary,
-           could just move the end up) */
-        temp = initial[n];
-        initial[n] = initial[initial_points-1];
-        initial[initial_points-1] = temp;
-        initial_points--;
-        n++;
-    }
-
-    /* initialize the membership to -1 for all */
-    for (i=0; i < npoints; i++)
-        membership[i] = -1;
-
-    /* allocate space for and initialize new_centers_len and new_centers */
-    new_centers_len = (int*) calloc(nclusters, sizeof(int));
-
-    new_centers    = (float**) malloc(nclusters *            sizeof(float*));
-    new_centers[0] = (float*)  calloc(nclusters * nfeatures, sizeof(float));
-    for (i=1; i<nclusters; i++)
-        new_centers[i] = new_centers[i-1] + nfeatures;
-
-    /* iterate until convergence */
-    do {
-        delta = 0.0;
-        // CUDA
-        delta = (float) kmeansOCL(feature,			/* in: [npoints][nfeatures] */
-                                  nfeatures,		/* number of attributes for each point */
-                                  npoints,			/* number of data points */
-                                  nclusters,		/* number of clusters */
-                                  membership,		/* which cluster the point belongs to */
-                                  clusters,		/* out: [nclusters][nfeatures] */
-                                  new_centers_len,	/* out: number of points in each cluster */
-                                  new_centers		/* sum of points in each cluster */
-                                 );
-
-        /* replace old cluster centers with new_centers */
-        /* CPU side of reduction */
-        llvm_visc_request_mem(clusters[0], nclusters * nfeatures * sizeof(float));
-        for (i=0; i<nclusters; i++) {
-            for (j=0; j<nfeatures; j++) {
-                if (new_centers_len[i] > 0)
-                    clusters[i][j] = new_centers[i][j] / new_centers_len[i];	/* take average i.e. sum/n */
-                new_centers[i][j] = 0.0;	/* set back to 0 */
-            }
-            new_centers_len[i] = 0;			/* set back to 0 */
-        }
-        c++;
-    } while ((delta > threshold) && (loop++ < 500));	/* makes sure loop terminates */
-    printf("iterated %d times\n", c);
-    free(new_centers[0]);
-    free(new_centers);
-    free(new_centers_len);
-
-    return clusters;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/read_input.c b/hpvm/test/parboil/benchmarks/kmeans/src/visc/read_input.c
deleted file mode 100644
index 103bdd5e1e..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/read_input.c
+++ /dev/null
@@ -1,332 +0,0 @@
-/*****************************************************************************/
-/*IMPORTANT:  READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.         */
-/*By downloading, copying, installing or using the software you agree        */
-/*to this license.  If you do not agree to this license, do not download,    */
-/*install, copy or use the software.                                         */
-/*                                                                           */
-/*                                                                           */
-/*Copyright (c) 2005 Northwestern University                                 */
-/*All rights reserved.                                                       */
-
-/*Redistribution of the software in source and binary forms,                 */
-/*with or without modification, is permitted provided that the               */
-/*following conditions are met:                                              */
-/*                                                                           */
-/*1       Redistributions of source code must retain the above copyright     */
-/*        notice, this list of conditions and the following disclaimer.      */
-/*                                                                           */
-/*2       Redistributions in binary form must reproduce the above copyright   */
-/*        notice, this list of conditions and the following disclaimer in the */
-/*        documentation and/or other materials provided with the distribution.*/
-/*                                                                            */
-/*3       Neither the name of Northwestern University nor the names of its    */
-/*        contributors may be used to endorse or promote products derived     */
-/*        from this software without specific prior written permission.       */
-/*                                                                            */
-/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS    */
-/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED      */
-/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND         */
-/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL          */
-/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,       */
-/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
-/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR          */
-/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)          */
-/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,         */
-/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN    */
-/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             */
-/*POSSIBILITY OF SUCH DAMAGE.                                                 */
-/******************************************************************************/
-
-/*************************************************************************/
-/**   File:         example.c                                           **/
-/**   Description:  Takes as input a file:                              **/
-/**                 ascii  file: containing 1 data point per line       **/
-/**                 binary file: first int is the number of objects     **/
-/**                              2nd int is the no. of features of each **/
-/**                              object                                 **/
-/**                 This example performs a fuzzy c-means clustering    **/
-/**                 on the data. Fuzzy clustering is performed using    **/
-/**                 min to max clusters and the clustering that gets    **/
-/**                 the best score according to a compactness and       **/
-/**                 separation criterion are returned.                  **/
-/**   Author:  Wei-keng Liao                                            **/
-/**            ECE Department Northwestern University                   **/
-/**            email: wkliao@ece.northwestern.edu                       **/
-/**                                                                     **/
-/**   Edited by: Jay Pisharath                                          **/
-/**              Northwestern University.                               **/
-/**                                                                     **/
-/**   ================================================================  **/
-/**																		**/
-/**   Edited by: Shuai Che, David Tarjan, Sang-Ha Lee					**/
-/**				 University of Virginia									**/
-/**																		**/
-/**   Description:	No longer supports fuzzy c-means clustering;	 	**/
-/**					only regular k-means clustering.					**/
-/**					No longer performs "validity" function to analyze	**/
-/**					compactness and separation crietria; instead		**/
-/**					calculate root mean squared error.					**/
-/**                                                                     **/
-/*************************************************************************/
-#define _CRT_SECURE_NO_DEPRECATE 1
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <math.h>
-#include <fcntl.h>
-
-#include "kmeans.h"
-#include <unistd.h>
-#include <visc.h>
-
-extern double wtime(void);
-
-
-
-/*---< usage() >------------------------------------------------------------*/
-void usage(char *argv0) {
-    char *help =
-        "\nUsage: %s [switches] -i filename\n\n"
-        "    -i filename      :file containing data to be clustered\n"
-        "    -m max_nclusters :maximum number of clusters allowed    [default=5]\n"
-        "    -n min_nclusters :minimum number of clusters allowed    [default=5]\n"
-        "    -t threshold     :threshold value                       [default=0.001]\n"
-        "    -l nloops        :iteration for each number of clusters [default=1]\n"
-        "    -b               :input file is in binary format\n"
-        "    -r               :calculate RMSE                        [default=off]\n"
-        "    -o               :output cluster center coordinates     [default=off]\n";
-    fprintf(stderr, help, argv0);
-    exit(-1);
-}
-
-/*---< main() >-------------------------------------------------------------*/
-int setup(int argc, char **argv) {
-    int		opt;
-    extern char   *optarg;
-    char   *filename = 0;
-    char* outfilename = 0;
-    float  *buf;
-    char	line[1024];
-    int		isBinaryFile = 0;
-
-    float	threshold = 0.001;		/* default value */
-    int		max_nclusters=5;		/* default value */
-    int		min_nclusters=5;		/* default value */
-    int		best_nclusters = 0;
-    int		nfeatures = 0;
-    int		npoints = 0;
-    float	len;
-
-    float **features;
-    float **cluster_centres=NULL;
-    int		i, j, index;
-    int		nloops = 1;				/* default value */
-
-    int		isRMSE = 0;
-    float	rmse;
-
-    int		isOutput = 0;
-    //float	cluster_timing, io_timing;
-
-    /* obtain command line arguments and change appropriate options */
-    while ( (opt=getopt(argc,argv,"i:t:m:n:l:o:br"))!= EOF) {
-        switch (opt) {
-        case 'i':
-            filename=optarg;
-            break;
-        case 'b':
-            isBinaryFile = 1;
-            break;
-        case 't':
-            threshold=atof(optarg);
-            break;
-        case 'm':
-            max_nclusters = atoi(optarg);
-            break;
-        case 'n':
-            min_nclusters = atoi(optarg);
-            break;
-        case 'r':
-            isRMSE = 1;
-            break;
-        case 'o':
-            isOutput = 1;
-            outfilename = optarg;
-            break;
-        case 'l':
-            nloops = atoi(optarg);
-            break;
-        case '?':
-            usage(argv[0]);
-            break;
-        default:
-            usage(argv[0]);
-            break;
-        }
-    }
-
-    if (filename == 0) usage(argv[0]);
-
-    /* ============== I/O begin ==============*/
-    /* get nfeatures and npoints */
-    //io_timing = omp_get_wtime();
-    if (isBinaryFile) {		//Binary file input
-        int infile;
-        if ((infile = open(filename, O_RDONLY, "0600")) == -1) {
-            fprintf(stderr, "Error: no such file (%s)\n", filename);
-            exit(1);
-        }
-        read(infile, &npoints,   sizeof(int));
-        read(infile, &nfeatures, sizeof(int));
-
-        /* allocate space for features[][] and read attributes of all objects */
-        buf         = (float*) malloc(npoints*nfeatures*sizeof(float));
-        features    = (float**)malloc(npoints*          sizeof(float*));
-        features[0] = (float*) malloc(npoints*nfeatures*sizeof(float));
-        for (i=1; i<npoints; i++)
-            features[i] = features[i-1] + nfeatures;
-        
-
-        read(infile, buf, npoints*nfeatures*sizeof(float));
-
-        close(infile);
-    }
-    else {
-        FILE *infile;
-        if ((infile = fopen(filename, "r")) == NULL) {
-            fprintf(stderr, "Error: no such file (%s)\n", filename);
-            exit(1);
-        }
-        while (fgets(line, 1024, infile) != NULL)
-            if (strtok(line, " \t\n") != 0)
-                npoints++;
-        rewind(infile);
-        while (fgets(line, 1024, infile) != NULL) {
-            if (strtok(line, " \t\n") != 0) {
-                /* ignore the id (first attribute): nfeatures = 1; */
-                while (strtok(NULL, " ,\t\n") != NULL) nfeatures++;
-                break;
-            }
-        }
-
-        /* allocate space for features[] and read attributes of all objects */
-        buf         = (float*) malloc(npoints*nfeatures*sizeof(float));
-        features    = (float**)malloc(npoints*          sizeof(float*));
-        features[0] = (float*) malloc(npoints*nfeatures*sizeof(float));
-        for (i=1; i<npoints; i++)
-            features[i] = features[i-1] + nfeatures;
-        rewind(infile);
-        i = 0;
-        while (fgets(line, 1024, infile) != NULL) {
-            if (strtok(line, " \t\n") == NULL) continue;
-            for (j=0; j<nfeatures; j++) {
-                buf[i] = atof(strtok(NULL, " ,\t\n"));
-                i++;
-            }
-        }
-        fclose(infile);
-    }
-    //io_timing = omp_get_wtime() - io_timing;
-    llvm_visc_track_mem(features[0], npoints*nfeatures*sizeof(float));
-
-    printf("\nI/O completed\n");
-    printf("\nNumber of objects: %d\n", npoints);
-    printf("Number of features: %d\n", nfeatures);
-    /* ============== I/O end ==============*/
-
-    // error check for clusters
-    if (npoints < min_nclusters)
-    {
-        printf("Error: min_nclusters(%d) > npoints(%d) -- cannot proceed\n", min_nclusters, npoints);
-        exit(0);
-    }
-
-    srand(7);												/* seed for future random number generator */
-    memcpy(features[0], buf, npoints*nfeatures*sizeof(float)); /* now features holds 2-dimensional array of features */
-    free(buf);
-
-    /* ======================= core of the clustering ===================*/
-
-    //cluster_timing = omp_get_wtime();		/* Total clustering time */
-    cluster_centres = NULL;
-    index = cluster(npoints,				/* number of data points */
-                    nfeatures,				/* number of features for each point */
-                    features,				/* array: [npoints][nfeatures] */
-                    min_nclusters,			/* range of min to max number of clusters */
-                    max_nclusters,
-                    threshold,				/* loop termination factor */
-                    &best_nclusters,			/* return: number between min and max */
-                    &cluster_centres,		/* return: [best_nclusters][nfeatures] */
-                    &rmse,					/* Root Mean Squared Error */
-                    isRMSE,					/* calculate RMSE */
-                    nloops);				/* number of iteration for each number of clusters */
-
-    //cluster_timing = omp_get_wtime() - cluster_timing;
-
-
-    /* =============== Command Line Output =============== */
-
-    /* cluster center coordinates
-       :displayed only for when k=1*/
-
-    //printf("Input file = %s\n", filename);
-    //printf("Output file = %s\n", outfilename);
-    if((min_nclusters == max_nclusters) && (isOutput == 1)) {
-        FILE *outfile;
-        if ((outfile = fopen(outfilename, "w")) == NULL) {
-            fprintf(stderr, "Error: no such file (%s)\n", outfilename);
-            exit(1);
-        }
-        fwrite(&max_nclusters, sizeof(int), 1, outfile);
-        fwrite(&nfeatures, sizeof(int), 1, outfile);
-        fwrite(&cluster_centres[0][0], sizeof(float), max_nclusters*nfeatures, outfile);
-        fclose(outfile);
-        /*printf("\n================= Centroid Coordinates =================\n");*/
-        /*for(i = 0; i < max_nclusters; i++) {*/
-            /*printf("%d:", i);*/
-            /*for(j = 0; j < nfeatures; j++) {*/
-                /*printf(" %.2f", cluster_centres[i][j]);*/
-            /*}*/
-            /*printf("\n\n");*/
-        /*}*/
-    }
-
-    len = (float) ((max_nclusters - min_nclusters + 1)*nloops);
-
-    //printf("Time for I/O: %.5fsec\n", io_timing);
-    //printf("Time for Entire Clustering: %.5fsec\n", cluster_timing);
-
-    if(min_nclusters != max_nclusters) {
-        if(nloops != 1) {									//range of k, multiple iteration
-            //printf("Average Clustering Time: %fsec\n",
-            //		cluster_timing / len);
-            printf("Best number of clusters is %d\n", best_nclusters);
-        }
-        else {												//range of k, single iteration
-            //printf("Average Clustering Time: %fsec\n",
-            //		cluster_timing / len);
-            printf("Best number of clusters is %d\n", best_nclusters);
-        }
-    }
-    else {
-        if(nloops != 1) {									// single k, multiple iteration
-            //printf("Average Clustering Time: %.5fsec\n",
-            //		cluster_timing / nloops);
-            if(isRMSE)										// if calculated RMSE
-                printf("Number of trials to approach the best RMSE of %.3f is %d\n", rmse, index + 1);
-        }
-        else {												// single k, single iteration
-            if(isRMSE)										// if calculated RMSE
-                printf("Root Mean Squared Error: %.3f\n", rmse);
-        }
-    }
-
-
-    /* free up memory */
-    free(features[0]);
-    free(features);
-    return(0);
-}
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/rmse.c b/hpvm/test/parboil/benchmarks/kmeans/src/visc/rmse.c
deleted file mode 100644
index 0029c64dc7..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/rmse.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*************************************************************************/
-/**   File:         rmse.c												**/
-/**   Description:  calculate root mean squared error of particular     **/
-/**                 clustering.											**/
-/**   Author:  Sang-Ha Lee												**/
-/**            University of Virginia.									**/
-/**																		**/
-/**   Note: euclid_dist_2() and find_nearest_point() adopted from       **/
-/**			Minebench code.												**/
-/**                                                                     **/
-/*************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <float.h>
-#include <math.h>
-
-
-#include "kmeans.h"
-
-extern double wtime(void);
-
-/*----< euclid_dist_2() >----------------------------------------------------*/
-/* multi-dimensional spatial Euclid distance square */
-__inline
-float euclid_dist_2(float *pt1,
-                    float *pt2,
-                    int    numdims)
-{
-    int i;
-    float ans=0.0;
-
-    for (i=0; i<numdims; i++)
-        ans += (pt1[i]-pt2[i]) * (pt1[i]-pt2[i]);
-
-    return(ans);
-}
-
-/*----< find_nearest_point() >-----------------------------------------------*/
-__inline
-int find_nearest_point(float  *pt,          /* [nfeatures] */
-                       int     nfeatures,
-                       float  **pts,         /* [npts][nfeatures] */
-                       int     npts)
-{
-    int index, i;
-    float max_dist=FLT_MAX;
-
-    /* find the cluster center id with min distance to pt */
-    for (i=0; i<npts; i++) {
-        float dist;
-        dist = euclid_dist_2(pt, pts[i], nfeatures);  /* no need square root */
-        if (dist < max_dist) {
-            max_dist = dist;
-            index    = i;
-        }
-    }
-    return(index);
-}
-
-/*----< rms_err(): calculates RMSE of clustering >-------------------------------------*/
-float rms_err	(float **feature,         /* [npoints][nfeatures] */
-                 int     nfeatures,
-                 int     npoints,
-                 float **cluster_centres, /* [nclusters][nfeatures] */
-                 int     nclusters)
-{
-    int    i;
-    int	   nearest_cluster_index;	/* cluster center id with min distance to pt */
-    float  sum_euclid = 0.0;		/* sum of Euclidean distance squares */
-    float  ret;						/* return value */
-
-    /* calculate and sum the sqaure of euclidean distance*/
-    #pragma omp parallel for \
-    shared(feature,cluster_centres) \
-    firstprivate(npoints,nfeatures,nclusters) \
-    private(i, nearest_cluster_index) \
-    schedule (static)
-    for (i=0; i<npoints; i++) {
-        nearest_cluster_index = find_nearest_point(feature[i],
-                                nfeatures,
-                                cluster_centres,
-                                nclusters);
-
-        sum_euclid += euclid_dist_2(feature[i],
-                                    cluster_centres[nearest_cluster_index],
-                                    nfeatures);
-
-    }
-    /* divide by n, then take sqrt */
-    ret = sqrt(sum_euclid / npoints);
-
-    return(ret);
-}
-
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/run b/hpvm/test/parboil/benchmarks/kmeans/src/visc/run
deleted file mode 100755
index 9fcc65554d..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/run
+++ /dev/null
@@ -1 +0,0 @@
-./kmeans -o -i ../../../../datasets/kmeans/kdd_cup 
diff --git a/hpvm/test/parboil/benchmarks/kmeans/src/visc/unistd.h b/hpvm/test/parboil/benchmarks/kmeans/src/visc/unistd.h
deleted file mode 100644
index ff334e56eb..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/src/visc/unistd.h
+++ /dev/null
@@ -1,945 +0,0 @@
-/* IBM_PROLOG_BEGIN_TAG                                                   */
-/* This is an automatically generated prolog.                             */
-/*                                                                        */
-/* bos53H src/bos/usr/include/unistd.h 1.38.4.46                          */
-/*                                                                        */
-/* Licensed Materials - Property of IBM                                   */
-/*                                                                        */
-/* (C) COPYRIGHT International Business Machines Corp. 1985,1995          */
-/* All Rights Reserved                                                    */
-/*                                                                        */
-/* US Government Users Restricted Rights - Use, duplication or            */
-/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.      */
-/*                                                                        */
-/* IBM_PROLOG_END_TAG                                                     */
-/* @(#)82     1.38.4.46  src/bos/usr/include/unistd.h, incstd, bos53H, h2006_17B8 4/25/06 11:53:09 */
-/*
- * COMPONENT_NAME: (INCSTD) Standard Include Files
- *
- * FUNCTIONS:
- *
- * ORIGINS: 3 27
- *
- * (C) COPYRIGHT International Business Machines Corp. 1985, 2006
- * All Rights Reserved
- * Licensed Materials - Property of IBM
- *
- * US Government Users Restricted Rights - Use, duplication or
- * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
- *
- * Copyright (c) 1984 AT&T
- * All Rights Reserved
- *
- * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T
- * The copyright notice above does not evidence any
- * actual or intended publication of such source code.
- */
-
-#ifndef _H_UNISTD
-#define _H_UNISTD
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef _H_STANDARDS
-#include <standards.h>
-#endif
-
-#include <strict_stdtypes.h>
-
-#ifndef _H_TYPES
-#include <sys/types.h>
-#endif
-
-#include <end_strict_stdtypes.h>
-
-#ifndef _H_ACCESS
-#include <sys/access.h>	/* for the "access" function */
-#endif
-
-/*
- * POSIX requires that certain values be included in unistd.h.  It also
- * requires that when _POSIX_SOURCE is defined only those standard
- * specific values are present.  This header includes all the POSIX
- * required entries.
- */
-
-#ifdef _POSIX_SOURCE
-#ifdef _LARGE_FILES
-#define lseek lseek64
-#endif
-
-
-/* Symbolic constants for the "lseek" function: */
-#ifndef SEEK_SET
-#define SEEK_SET 0	/* Set file pointer to "offset" */
-#define SEEK_CUR 1	/* Set file pointer to current plus "offset" */
-#define SEEK_END 2	/* Set file pointer to EOF plus "offset" */
-#endif /* SEEK_SET */
-
-#ifdef _NO_PROTO
-
-#ifndef _KERNEL
-extern int access();
-extern unsigned int alarm();
-extern int chdir();
-extern int chown();
-extern int close();
-extern char *ctermid();
-extern int dup();
-extern int dup2();
-extern int execl();
-extern int execv();
-extern int execle();
-extern int execve();
-extern int execlp();
-extern int execvp();
-extern void _exit();
-extern pid_t fork();
-extern long fpathconf();
-extern char *getcwd();
-extern gid_t getegid();
-extern uid_t geteuid();
-extern gid_t getgid();
-extern int getgroups();
-extern char *getlogin();
-extern pid_t getpgrp();
-extern pid_t getpid();
-extern pid_t getppid();
-extern uid_t getuid();
-extern int isatty();
-extern int link();
-extern off_t lseek();
-extern long pathconf();
-extern int pause();
-extern int pipe();
-#if defined(_XOPEN_SOURCE) && ( _XOPEN_SOURCE >= 500 )
-extern int pthread_atfork();
-#endif
-extern int read();
-extern int rmdir();
-extern int setgid();
-extern int setpgid();
-extern int setsid();
-extern int setuid();
-extern unsigned int sleep();
-extern long sysconf();
-extern pid_t tcgetpgrp();
-extern int tcsetpgrp();
-extern char *ttyname();
-extern int unlink();
-extern int write();
-#endif		/* !_KERNEL	*/
-
-#else		/* POSIX required prototypes */
-
-#ifndef _KERNEL
-extern int access(const char *, int);
-extern unsigned int alarm(unsigned int);
-extern int chdir(const char *);
-extern int chown(const char *, uid_t, gid_t);
-extern int close(int);
-extern char *ctermid(char *);
-extern int dup(int);
-extern int dup2(int, int);
-extern int execl(const char *, const char *, ...);
-extern int execv(const char *, char *const []);
-extern int execle(const char *, const char *, ...);
-extern int execve(const char *, char *const [], char *const []);
-extern int execlp(const char *, const char *, ...);
-extern int execvp(const char *, char *const []);
-extern void _exit(int);
-extern pid_t fork(void);
-extern long fpathconf(int, int);
-extern char *getcwd(char *, size_t);
-extern gid_t getegid(void);
-extern uid_t geteuid(void);
-extern gid_t getgid(void);
-extern int getgroups(int, gid_t []);
-extern char *getlogin(void);
-#ifndef _BSD
-extern pid_t getpgrp(void);
-#endif /* _BSD */
-extern pid_t getpid(void);
-extern pid_t getppid(void);
-extern uid_t getuid(void);
-extern int isatty(int);
-extern int link(const char *, const char *);
-extern off_t lseek(int, off_t, int);
-#ifdef _LARGE_FILE_API
-extern off64_t	lseek64(int, off64_t, int);
-#endif
-extern long pathconf(const char *, int);
-extern int pause(void);
-extern int pipe(int []);
-#if defined(_XOPEN_SOURCE) && ( _XOPEN_SOURCE >= 500 )
-extern int pthread_atfork(void (*)(void), void (*)(void), void (*)(void));
-#endif
-extern ssize_t read(int, void *, size_t);
-extern int rmdir(const char *);
-extern int setgid(gid_t);
-extern int setpgid(pid_t, pid_t);
-extern pid_t setsid(void);
-extern int setuid(uid_t);
-extern unsigned int sleep(unsigned int);
-extern long sysconf(int);
-extern pid_t tcgetpgrp(int);
-extern int tcsetpgrp(int, pid_t);
-extern char *ttyname(int);
-extern int unlink(const char *);
-extern ssize_t write(int, const void *, size_t);
-#endif		/* !_KERNEL	*/
-#endif		/* !_NO_PROTO	*/
-
-#define STDIN_FILENO	0
-#define STDOUT_FILENO	1
-#define STDERR_FILENO	2
-
-#define _POSIX_JOB_CONTROL	1
-#define _POSIX_SAVED_IDS	1
-
-#define _POSIX_VERSION		200112L
-#define _POSIX2_VERSION		200112L
-#define _POSIX2_C_VERSION	200112L
-
-
-#ifdef _XOPEN_SOURCE
-
-#define _XOPEN_VERSION		600
-#define _XOPEN_XCU_VERSION	4
-#define _XOPEN_XPG3		1
-#define _XOPEN_XPG4		1
-#define _XOPEN_UNIX		1
-
-#define _XOPEN_REALTIME		(-1)
-#define _XOPEN_REALTIME_THREADS	(-1)
-
-#if (_XOPEN_SOURCE >= 600)
-#define _XOPEN_STREAMS		1
-#endif
-
-#define _XBS5_ILP32_OFF32	1
-#define _XBS5_ILP32_OFFBIG	1
-#define _XBS5_LP64_OFF64	1
-#define _XBS5_LPBIG_OFFBIG	1
-
-#define _POSIX2_C_BIND		200112L
-#define _POSIX2_C_DEV		200112L
-#define _POSIX2_CHAR_TERM	1
-#define _POSIX2_LOCALEDEF	200112L
-#define _POSIX2_UPE		200112L
-#define _POSIX2_FORT_DEV	(-1)
-#define _POSIX2_FORT_RUN	(-1)
-#define _POSIX2_SW_DEV		(-1)
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _POSIX_REGEXP         1
-#define _POSIX_SHELL          1
-#define _POSIX2_PBS           (-1)
-#define _POSIX2_PBS_ACCOUNTING        (-1)
-#define _POSIX2_PBS_CHECKPOINT        (-1)
-#define _POSIX2_PBS_LOCATE    (-1)
-#define _POSIX2_PBS_MESSAGE   (-1)
-#define _POSIX2_PBS_TRACK     (-1)
-#define _V6_ILP32_OFF32               1
-#define _V6_ILP32_OFFBIG      1
-#define _V6_LP64_OFF64                1
-#define _V6_LPBIG_OFFBIG      1
-
-#define _POSIX_ADVISORY_INFO   200112L
-#define _POSIX_BARRIERS        200112L
-#define _POSIX_CLOCK_SELECTION 200112L
-#define _POSIX_CPUTIME         200112L
-#define _POSIX_MONOTONIC_CLOCK 200112L
-
-#ifdef _POSIX_RAW_SOCKETS
-#undef _POSIX_RAW_SOCKETS
-#endif
-
-#define _POSIX_SPAWN           200112L
-#define _POSIX_SPIN_LOCKS      200112L
-#define _POSIX_SPORADIC_SERVER (-1)
-#define _POSIX_THREAD_CPUTIME  200112L
-#define _POSIX_THREAD_SPORADIC_SERVER (-1)
-#define _POSIX_TIMEOUTS	200112L
-#define _POSIX_TRACE           (-1)
-#define _POSIX_TRACE_EVENT_FILTER     (-1)
-#define _POSIX_TRACE_INHERIT   (-1)
-#define _POSIX_TRACE_LOG       (-1)
-#define _POSIX_TYPED_MEMORY_OBJECTS   (-1)
-
-#endif /* _POSIX_C_SOURCE >= 200112L */
-
-#define _XOPEN_CRYPT		1
-#define _XOPEN_SHM		1
-#define _XOPEN_ENH_I18N		1
-#define _XOPEN_LEGACY		(-1)
-#ifndef __64BIT__
-#define _UNIX_ABI		(-1)
-#define _UNIX_ABI_IA64		(-1)
-#define _UNIX_ABI_BIG_ENDIAN	(-1)
-#define _UNIX_ABI_LITTLE_ENDIAN	(-1)
-#endif /* __64BIT__ */
-
-extern  char    *optarg;
-extern  int     optind, opterr, optopt;
-
-#ifdef _NO_PROTO
-extern	size_t	confstr();
-extern  char    *crypt();
-extern  void    encrypt();
-extern  int     fsync();
-extern	int	getopt();
-extern	int	nice();
-extern  void    swab();
-#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE<200112L) || defined(_ALL_SOURCE)
-extern  char    *getpass();
-extern  int     chroot();
-#endif
-#else
-extern	size_t	confstr(int, char*, size_t);
-extern  char    *crypt(const char *, const char *);
-extern  void    encrypt(char *, int);
-extern  int     fsync(int);
-extern	int	getopt(int, char* const*, const char*);
-extern	int	nice(int);
-extern  void    swab(const void *, void *, ssize_t);
-extern int	fdatasync(int);
-#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE<200112L) || defined(_ALL_SOURCE)
-extern  char    *getpass(const char *);
-extern  int     chroot(const char *);
-#endif
-#endif
-
-#endif /* _XOPEN _SOURCE */
-
-/* Threads options for 1003.1c and XPG UNIX98 */
-#define _POSIX_THREADS				200112L
-#define _POSIX_THREAD_ATTR_STACKADDR            200112L
-#define _POSIX_THREAD_ATTR_STACKSIZE		200112L
-#define _POSIX_THREAD_PROCESS_SHARED		200112L
-#define _POSIX_THREAD_SAFE_FUNCTIONS		200112L
-#ifdef _ALL_SOURCE
-#define _POSIX_REENTRANT_FUNCTIONS		_POSIX_THREAD_SAFE_FUNCTIONS
-#endif
-
-/* Realtime threads options for 1003.1c and XPG UNIX98 */
-#define	 _POSIX_THREAD_PRIORITY_SCHEDULING	(-1)
-#define	 _POSIX_THREAD_PRIO_INHERIT		(-1)
-#define	 _POSIX_THREAD_PRIO_PROTECT		(-1)
-
-#undef  _POSIX_THREAD_FORKALL
-
-/* Realtime options for 1003.1c and XPG UNIX98 */
-#define _POSIX_ASYNCHRONOUS_IO			200112L
-#define _POSIX_FSYNC				200112L
-#define _POSIX_MAPPED_FILES			200112L
-#define _POSIX_MEMLOCK			        200112L
-#define _POSIX_MEMLOCK_RANGE		        200112L
-#define _POSIX_MEMORY_PROTECTION		200112L
-#define _POSIX_MESSAGE_PASSING			200112L
-#define _POSIX_PRIORITIZED_IO			200112L
-#define _POSIX_PRIORITY_SCHEDULING		200112L
-#define _POSIX_REALTIME_SIGNALS			200112L
-#define _POSIX_SEMAPHORES			200112L
-#define _POSIX_SHARED_MEMORY_OBJECTS            200112L
-#define _POSIX_SYNCHRONIZED_IO			200112L
-#define _POSIX_TIMERS				200112L
-
-#define _POSIX_ASYNC_IO				(-1)
-#undef	_POSIX_SYNC_IO
-#define _POSIX_PRIO_IO				(-1)
-
-#define _POSIX_CHOWN_RESTRICTED	 0
-#define _POSIX_VDISABLE		 0xFF
-#define _POSIX_NO_TRUNC		 0
-
-/* UNIX03 and POSIX01 */
-/* Always enabled */
-#define _POSIX_IPV6				200112L
-#define _POSIX_RAW_SOCKETS			200112L
-
-
-#ifndef NULL
-#define NULL	0
-#endif
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _POSIX_READER_WRITER_LOCKS            200112L
-#endif
-
-/* arguments for the confstr() function */
-
-#define _CS_PATH	1
-
-/* compile,link,lib,lint flags for 32bit, no_LARGE_FILES system */
-#define _CS_XBS5_ILP32_OFF32_CFLAGS	2
-#define _CS_XBS5_ILP32_OFF32_LDFLAGS	3
-#define _CS_XBS5_ILP32_OFF32_LIBS	4
-#define _CS_XBS5_ILP32_OFF32_LINTFLAGS	5
-
-/* compile,link,lib,lint flags for 32bit, _LARGE_FILES system */
-#define _CS_XBS5_ILP32_OFFBIG_CFLAGS	6
-#define _CS_XBS5_ILP32_OFFBIG_LDFLAGS	7
-#define _CS_XBS5_ILP32_OFFBIG_LIBS	8
-#define _CS_XBS5_ILP32_OFFBIG_LINTFLAGS	9
-
-/* compile,link,lib,lint flags for LP64 64bit system */
-#define _CS_XBS5_LP64_OFF64_CFLAGS	10
-#define _CS_XBS5_LP64_OFF64_LDFLAGS	11
-#define _CS_XBS5_LP64_OFF64_LIBS	12
-#define _CS_XBS5_LP64_OFF64_LINTFLAGS	13
-
-/* compile,link,lib,lint flags for ILP64 64bit system */
-/* AIX does not currently support this */
-#define _CS_XBS5_LPBIG_OFFBIG_CFLAGS	14
-#define _CS_XBS5_LPBIG_OFFBIG_LDFLAGS	15
-#define _CS_XBS5_LPBIG_OFFBIG_LIBS	16
-#define _CS_XBS5_LPBIG_OFFBIG_LINTFLAGS	17
-
-#define _CS_AIX_BOOTDEV				24
-#define _CS_AIX_MODEL_CODE			25
-#define _CS_AIX_ARCHITECTURE			26
-#define _CS_AIX_MODEL_CLASS			40
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _CS_POSIX_V6_ILP32_OFF32_CFLAGS		27
-#define _CS_POSIX_V6_ILP32_OFF32_LDFLAGS	28
-#define _CS_POSIX_V6_ILP32_OFF32_LIBS		29
-#define _CS_POSIX_V6_ILP32_OFFBIG_CFLAGS	30
-#define _CS_POSIX_V6_ILP32_OFFBIG_LDFLAGS	31
-#define _CS_POSIX_V6_ILP32_OFFBIG_LIBS		32
-#define _CS_POSIX_V6_LP64_OFF64_CFLAGS		33
-#define _CS_POSIX_V6_LP64_OFF64_LDFLAGS		34
-#define _CS_POSIX_V6_LP64_OFF64_LIBS		35
-#define _CS_POSIX_V6_LPBIG_OFFBIG_CFLAGS	36
-#define _CS_POSIX_V6_LPBIG_OFFBIG_LDFLAGS	37
-#define _CS_POSIX_V6_LPBIG_OFFBIG_LIBS		38
-#define _CS_POSIX_V6_WIDTH_RESTRICTED_ENVS      39
-#endif
-
-/* Values for the above */
-#define _CSPATH		"/usr/bin:/usr/vac/bin"
-
-/* ILP32_OFF32 */
-#define _CSPOSIX_V6_ILP32_OFF32_CFLAGS	"-q32"
-#define _CSXBS5_ILP32_OFF32_CFLAGS	_CSPOSIX_V6_ILP32_OFF32_CFLAGS
-
-#ifdef __ia64
-#define _CSXBS5_ILP32_OFF32_LDFLAGS	""
-#else /* POWER */
-#define _CSPOSIX_V6_ILP32_OFF32_LDFLAGS "-b32"
-#define _CSXBS5_ILP32_OFF32_LDFLAGS	_CSPOSIX_V6_ILP32_OFF32_LDFLAGS
-#endif
-
-#define _CSPOSIX_V6_ILP32_OFF32_LIBS	"-lc -lpthread -lm"
-#define _CSXBS5_ILP32_OFF32_LIBS	_CSPOSIX_V6_ILP32_OFF32_LIBS
-
-#define _CSXBS5_ILP32_OFF32_LINTFLAGS	""
-
-/* ILP32_OFFOFFBIG */
-#define _CSPOSIX_V6_ILP32_OFFBIG_CFLAGS "-q32 -D_LARGE_FILES -qlonglong"
-#define _CSXBS5_ILP32_OFFBIG_CFLAGS	_CSPOSIX_V6_ILP32_OFFBIG_CFLAGS
-
-#ifdef __ia64
-#define _CSXBS5_ILP32_OFFBIG_LDFLAGS	""
-#else /* POWER */
-#define _CSPOSIX_V6_ILP32_OFFBIG_LDFLAGS "-b32"
-#define _CSXBS5_ILP32_OFFBIG_LDFLAGS	_CSPOSIX_V6_ILP32_OFFBIG_LDFLAGS
-#endif
-
-#define _CSPOSIX_V6_ILP32_OFFBIG_LIBS	"-lc -lpthread -lm"
-#define _CSXBS5_ILP32_OFFBIG_LIBS	_CSPOSIX_V6_ILP32_OFFBIG_LIBS
-
-#define _CSXBS5_ILP32_OFFBIG_LINTFLAGS	"-D_LARGE_FILES -qlonglong"
-
-/* LP64_OFF64 */
-#define _CSPOSIX_V6_LP64_OFF64_CFLAGS	"-q64"
-#define _CSXBS5_LP64_OFF64_CFLAGS	_CSPOSIX_V6_LP64_OFF64_CFLAGS
-
-#ifdef __ia64
-#define _CSXBS5_LP64_OFF64_LDFLAGS	""
-#else /* POWER */
-#define _CSPOSIX_V6_LP64_OFF64_LDFLAGS	"-b64"
-#define _CSXBS5_LP64_OFF64_LDFLAGS	_CSPOSIX_V6_LP64_OFF64_LDFLAGS
-#endif
-
-#define _CSPOSIX_V6_LP64_OFF64_LIBS	"-lc -lpthread -lm"
-#define _CSXBS5_LP64_OFF64_LIBS		_CSPOSIX_V6_LP64_OFF64_LIBS
-
-#define _CSXBS5_LP64_OFF64_LINTFLAGS	"-D__64BIT__"
-
-/* LPBIG_OFFBIG */
-#define _CSPOSIX_V6_LPBIG_OFFBIG_CFLAGS "-q64"
-#define _CSXBS5_LPBIG_OFFBIG_CFLAGS	_CSPOSIX_V6_LPBIG_OFFBIG_CFLAGS
-
-#ifdef __ia64
-#define _CSXBS5_LPBIG_OFFBIG_LDFLAGS	""
-#else /* POWER */
-#define _CSPOSIX_V6_LPBIG_OFFBIG_LDFLAGS "-b64"
-#define _CSXBS5_LPBIG_OFFBIG_LDFLAGS	_CSPOSIX_V6_LPBIG_OFFBIG_LDFLAGS
-#endif
-
-#define _CSPOSIX_V6_LPBIG_OFFBIG_LIBS	"-lc -lpthread -lm"
-#define _CSXBS5_LPBIG_OFFBIG_LIBS	_CSPOSIX_V6_LPBIG_OFFBIG_LIBS
-
-#define _CSXBS5_LPBIG_OFFBIG_LINTFLAGS	"-D__64BIT__"
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _CSPOSIX_V6_WIDTH_RESTRICTED_ENVS \
-		"POSIX_V6_ILP32_OFF32\n"  \
-		"POSIX_V6_ILP32_OFFBIG\n" \
-		"POSIX_V6_LP64_OFF64\n"  \
-		"POSIX_V6_LPBIG_OFFBIG"
-#endif
-
-/* arguments for the pathconf() function */
-
-#define _PC_CHOWN_RESTRICTED	10
-#define _PC_LINK_MAX		11
-#define _PC_MAX_CANON		12
-#define _PC_MAX_INPUT		13
-#define _PC_NAME_MAX		14
-#define _PC_NO_TRUNC		15
-#define _PC_PATH_MAX		16
-#define _PC_PIPE_BUF		17
-#define _PC_VDISABLE		18
-#define _PC_ASYNC_IO		19
-#define _PC_SYNC_IO		20
-#define _PC_PRIO_IO		21
-#define _PC_FILESIZEBITS	22  /* # bits needed to hold offset */
-#define _PC_AIX_DISK_PARTITION	23
-#define _PC_AIX_DISK_SIZE	24
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _PC_SYMLINK_MAX         25
-#define _PC_ALLOC_SIZE_MIN      26
-#define _PC_REC_INCR_XFER_SIZE  27
-#define _PC_REC_MAX_XFER_SIZE   28
-#define _PC_REC_MIN_XFER_SIZE   29
-#define _PC_REC_XFER_ALIGN      30
-#define _PC_2_SYMLINKS          31
-#endif
-
-/* arguments for the sysconf() function, the defined numbers are used as
- * array index in sysconf().
- *
- * POSIX.1(1990), Table 4-2
- */
-#define _SC_ARG_MAX			0
-#define _SC_CHILD_MAX			1
-#define _SC_CLK_TCK			2
-#define _SC_NGROUPS_MAX			3
-#define _SC_OPEN_MAX			4
-#define _SC_STREAM_MAX			5
-#define _SC_TZNAME_MAX			6
-#define _SC_JOB_CONTROL			7
-#define _SC_SAVED_IDS			8
-#define _SC_VERSION			9
-
-/* POSIX.1(1990), Table 2-3, required by command getconf */
-
-#define _SC_POSIX_ARG_MAX		10
-#define _SC_POSIX_CHILD_MAX		11
-#define _SC_POSIX_LINK_MAX		12
-#define _SC_POSIX_MAX_CANON		13
-#define _SC_POSIX_MAX_INPUT		14
-#define _SC_POSIX_NAME_MAX		15
-#define _SC_POSIX_NGROUPS_MAX		16
-#define _SC_POSIX_OPEN_MAX		17
-#define _SC_POSIX_PATH_MAX		18
-#define _SC_POSIX_PIPE_BUF		19
-#define _SC_POSIX_SSIZE_MAX		20
-#define _SC_POSIX_STREAM_MAX		21
-#define _SC_POSIX_TZNAME_MAX		22
-
-/* POSIX.2 (Draft 10), Table 41)	*/
-
-#define _SC_BC_BASE_MAX			23
-#define _SC_BC_DIM_MAX			24
-#define _SC_BC_SCALE_MAX		25
-#define _SC_BC_STRING_MAX		26
-#define _SC_EQUIV_CLASS_MAX		27
-#define _SC_EXPR_NEST_MAX		28
-#define _SC_LINE_MAX			29
-#define _SC_RE_DUP_MAX			30
-#define _SC_2_VERSION			31
-#define _SC_2_C_DEV			32
-#define _SC_2_FORT_DEV			33
-#define _SC_2_FORT_RUN			34
-#define _SC_2_LOCALEDEF			35
-#define _SC_2_SW_DEV			36
-
-/* POSIX.2 (Draft 10), Table 13)	*/
-
-#define _SC_POSIX2_BC_BASE_MAX		37
-#define _SC_POSIX2_BC_DIM_MAX		38
-#define _SC_POSIX2_BC_SCALE_MAX		39
-#define _SC_POSIX2_BC_STRING_MAX	40
-#define _SC_POSIX2_EQUIV_CLASS_MAX	41
-#define _SC_POSIX2_EXPR_NEST_MAX	42
-#define _SC_POSIX2_LINE_MAX		43
-#define _SC_POSIX2_RE_DUP_MAX		44
-#define _SC_PASS_MAX			45
-#define _SC_XOPEN_VERSION		46
-#define _SC_ATEXIT_MAX			47
-#if _XOPEN_SOURCE_EXTENDED==1
-#define _SC_PAGE_SIZE			48
-#endif /* _XOPEN_SOURCE_EXTENDED */
-#define _SC_AES_OS_VERSION		49
-#define _SC_COLL_WEIGHTS_MAX		50
-#define _SC_2_C_BIND			51
-#define _SC_2_C_VERSION			52
-#define _SC_2_UPE			53
-#define _SC_2_CHAR_TERM			54
-#define _SC_XOPEN_SHM			55
-#define _SC_XOPEN_CRYPT			56
-#define _SC_XOPEN_ENH_I18N		57
-#if _XOPEN_SOURCE_EXTENDED==1
-#define _SC_PAGESIZE			_SC_PAGE_SIZE
-#define _SC_IOV_MAX			58
-#endif /* _XOPEN_SOURCE_EXTENDED */
-#define _SC_THREAD_SAFE_FUNCTIONS	59
-#define _SC_THREADS			60
-#define _SC_THREAD_ATTR_STACKADDR	61
-#define _SC_THREAD_ATTR_STACKSIZE	62
-#define _SC_THREAD_FORKALL		63
-#define _SC_THREAD_PRIORITY_SCHEDULING	64
-#define _SC_THREAD_PRIO_INHERIT		65
-#define _SC_THREAD_PRIO_PROTECT		66
-#define _SC_THREAD_PROCESS_SHARED	67
-#define _SC_THREAD_KEYS_MAX		68
-#define _SC_THREAD_DATAKEYS_MAX		_SC_THREAD_KEYS_MAX
-#define _SC_THREAD_STACK_MIN		69
-#define _SC_THREAD_THREADS_MAX		70
-#ifdef _ALL_SOURCE
-#define _SC_NPROCESSORS_CONF		71
-#define _SC_NPROCESSORS_ONLN		72
-#endif /* _ALL_SOURCE */
-#define _SC_XOPEN_UNIX			73
-
-#if (_XOPEN_SOURCE >= 500)
-
-/* POSIX 1003.1c and XPG UNIX98 */
-/* look to defines above for meanings */
-#define _SC_AIO_LISTIO_MAX			75
-#define _SC_AIO_MAX				76
-#define _SC_AIO_PRIO_DELTA_MAX			77
-#define _SC_ASYNCHRONOUS_IO			78
-#define _SC_DELAYTIMER_MAX			79
-#define _SC_FSYNC				80
-#define _SC_GETGR_R_SIZE_MAX			81
-#define _SC_GETPW_R_SIZE_MAX			82
-#define _SC_LOGIN_NAME_MAX			83
-#define _SC_MAPPED_FILES			84
-#define _SC_MEMLOCK				85
-#define _SC_MEMLOCK_RANGE			86
-#define _SC_MEMORY_PROTECTION			87
-#define _SC_MESSAGE_PASSING			88
-#define _SC_MQ_OPEN_MAX				89
-#define _SC_MQ_PRIO_MAX				90
-#define _SC_PRIORITIZED_IO			91
-#define _SC_PRIORITY_SCHEDULING			92
-#define _SC_REALTIME_SIGNALS			93
-#define _SC_RTSIG_MAX				94
-#define _SC_SEMAPHORES				95
-#define _SC_SEM_NSEMS_MAX			96
-#define _SC_SEM_VALUE_MAX			97
-#define _SC_SHARED_MEMORY_OBJECTS		98
-#define _SC_SIGQUEUE_MAX			99
-#define _SC_SYNCHRONIZED_IO			100
-#define _SC_THREAD_DESTRUCTOR_ITERATIONS	101
-#define _SC_TIMERS				102
-#define _SC_TIMER_MAX				103
-#define _SC_TTY_NAME_MAX			104
-#define _SC_XBS5_ILP32_OFF32			105
-#define _SC_XBS5_ILP32_OFFBIG			106
-#define _SC_XBS5_LP64_OFF64			107
-#define _SC_XBS5_LPBIG_OFFBIG			108
-#define _SC_XOPEN_XCU_VERSION			109
-#define _SC_XOPEN_REALTIME			110
-#define _SC_XOPEN_REALTIME_THREADS		111
-#define _SC_XOPEN_LEGACY			112
-#endif /* _XOPEN_SOURCE >= 500 */
-
-#ifdef _ALL_SOURCE
-#define _SC_REENTRANT_FUNCTIONS		_SC_THREAD_SAFE_FUNCTIONS
-#define _SC_PHYS_PAGES				113
-#define _SC_AVPHYS_PAGES			114
-#define _SC_LPAR_ENABLED			115
-#define _SC_LARGE_PAGESIZE			116
-#endif /* _ALL_SOURCE */
-
-#define _SC_AIX_KERNEL_BITMODE			117
-#define _SC_AIX_REALMEM				118
-#define _SC_AIX_HARDWARE_BITMODE		119
-#define _SC_AIX_MP_CAPABLE			120
-
-#define _SC_V6_ILP32_OFF32			121
-#define _SC_V6_ILP32_OFFBIG			122
-#define _SC_V6_LP64_OFF64			123
-#define _SC_V6_LPBIG_OFFBIG			124
-
-#define _SC_XOPEN_STREAMS			125
-
-#if (_POSIX_C_SOURCE >= 200112L)
-#define _SC_HOST_NAME_MAX			126
-#define _SC_REGEXP				127
-#define _SC_SHELL				128
-#define _SC_SYMLOOP_MAX				129
-#define _SC_ADVISORY_INFO			130
-#define _SC_FILE_LOCKING			131
-#define _SC_2_PBS				132
-#define _SC_2_PBS_ACCOUNTING			133
-#define _SC_2_PBS_CHECKPOINT			134
-#define _SC_2_PBS_LOCATE			135
-#define _SC_2_PBS_MESSAGE			136
-#define _SC_2_PBS_TRACK				137
-#define _SC_BARRIERS				138
-#define _SC_CLOCK_SELECTION			139
-#define _SC_CPUTIME				140
-#define _SC_MONOTONIC_CLOCK			141
-#define _SC_READER_WRITER_LOCKS			142
-#define _SC_SPAWN				143
-#define _SC_SPIN_LOCKS				144
-#define _SC_SPORADIC_SERVER			145
-#define _SC_THREAD_CPUTIME			146
-#define _SC_THREAD_SPORADIC_SERVER              147
-#define _SC_TIMEOUTS				148
-#define _SC_TRACE				149
-#define _SC_TRACE_EVENT_FILTER			150
-#define _SC_TRACE_INHERIT			151
-#define _SC_TRACE_LOG				152
-#define _SC_TYPED_MEMORY_OBJECTS		153
-#define _SC_IPV6				154
-#define _SC_RAW_SOCKETS				155
-#define _SC_SS_REPL_MAX				156
-#define _SC_TRACE_EVENT_NAME_MAX		157
-#define _SC_TRACE_NAME_MAX			158
-#define _SC_TRACE_SYS_MAX			159
-#define _SC_TRACE_USER_EVENT_MAX		160
-#endif /* _POSIX_C_SOURCE >= 200112L */
-
-#ifdef _ALL_SOURCE
-#define _SC_AIX_UKEYS				161
-#endif /* _ALL_SOURCE */
-
-#endif /* _POSIX_SOURCE */
-
-
-#if _XOPEN_SOURCE_EXTENDED==1
-#ifdef _LARGE_FILES
-#define	ftruncate	ftruncate64
-#define	truncate	truncate64
-#endif
-
-#ifndef _H_LOCKF
-#include <sys/lockf.h>		/* lockf definitions for portability	*/
-#endif
-
-#ifdef _NO_PROTO
-#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE<200112L) || defined(_ALL_SOURCE)
-extern int		brk();
-extern int              getpagesize();
-#ifndef _MSGQSUPPORT
-extern int		__fd_getdtablesize();
-static int		getdtablesize()
-{
-    return __fd_getdtablesize();
-}
-#else
-extern int              getdtablesize();
-#endif /* _MSGQSUPPORT */
-
-extern void             *sbrk();
-#endif /* _POSIX_C_SOURCE<200112L */
-extern int		fchdir();
-extern int		fchown();
-extern int		ftruncate();
-extern long		gethostid();
-extern int		gethostname();
-extern pid_t		getpgid();
-extern pid_t		getsid();
-extern char		*getwd();
-extern int		lchown();
-extern int		readlink();
-extern pid_t		setpgrp();
-extern int		setregid();
-extern int		setreuid();
-extern int		symlink();
-extern void		sync();
-extern int		truncate();
-extern useconds_t	ualarm();
-extern int		usleep();
-extern pid_t		vfork();
-#else /* _NO_PROTO */
-#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE<200112L) || defined(_ALL_SOURCE)
-extern int		brk(void *);
-extern int              getpagesize(void);
-#ifndef _MSGQSUPPORT
-extern int		__fd_getdtablesize(void);
-static int		getdtablesize()
-{
-    return __fd_getdtablesize();
-}
-#else
-extern int              getdtablesize(void);
-#endif /* _MSGQSUPPORT */
-#ifdef _LINUX_SOURCE_COMPAT
-extern void             *sbrk(ptrdiff_t);
-#elif (_XOPEN_SOURCE >= 500) || defined(__64BIT__)
-extern void             *sbrk(intptr_t);
-#else
-extern void             *sbrk(int);
-#endif
-#endif /* _POSIX_C_SOURCE<200112L */
-extern int		fchdir(int);
-extern int		fchown(int, uid_t, gid_t);
-extern int		ftruncate(int, off_t);
-#ifdef _LARGE_FILE_API
-extern int		ftruncate64(int, off64_t);
-#endif
-extern int		gethostname(char *, size_t);
-extern long		gethostid(void);
-extern pid_t		getpgid(pid_t);
-extern pid_t		getsid(pid_t);
-extern char		*getwd(char *);
-extern int		lchown(const char *, uid_t, gid_t);
-
-#if (defined(_SUSV3_READLINK) || \
-     (!defined(_ALL_SOURCE) && (_POSIX_C_SOURCE >= 200112L)))
-/* If SUSV3 readlink specifically requested or if strict SUSv3
- * environment requested */
-#ifdef __64BIT__
-static ssize_t readlink(const char *__restrict__ __path,
-                        char *__restrict__ __buf, size_t __bufsize)
-{
-    extern ssize_t __readlink64(const char *__restrict__, char *__restrict__, size_t);
-    return __readlink64(__path, __buf, __bufsize);
-}
-#else
-extern ssize_t readlink(const char *__restrict__, char *__restrict__, size_t);
-#endif /* __64BIT__ */
-#else
-extern int readlink(const char *, char *, size_t);
-#endif /* _SUSV3_READLINK || !_ALL_SOURCE && _POSIX_C_SOURCE >= 200112L */
-
-#ifndef _BSD
-extern pid_t		setpgrp(void);
-#endif /* _BSD */
-extern int		setregid(gid_t, gid_t);
-extern int		setreuid(uid_t, uid_t);
-extern int		symlink(const char *, const char *);
-extern void		sync(void);
-extern int		truncate(const char *, off_t);
-#ifdef _LARGE_FILE_API
-extern int		truncate64(const char *, off64_t);
-#endif
-extern useconds_t	ualarm(useconds_t, useconds_t);
-extern int		usleep(useconds_t);
-extern pid_t		vfork(void);
-#if _XOPEN_SOURCE>=500
-extern int		getlogin_r(char *, size_t);
-extern int		ttyname_r(int, char *, size_t);
-
-#ifdef _LARGE_FILES
-#define pread		pread64
-#define pwrite		pwrite64
-#endif /* _LARGE_FILES */
-
-extern ssize_t		pread(int, void *, size_t, off_t);
-extern ssize_t		pwrite(int, const void *, size_t, off_t);
-#ifdef _LARGE_FILE_API
-extern ssize_t		pread64(int, void *, size_t, off64_t);
-extern ssize_t		pwrite64(int, const void *, size_t, off64_t);
-#endif /* _LARGE_FILE_API */
-#endif /* _XOPEN_SOURCE>=500 */
-
-#endif /* _NO_PROTO */
-
-#endif /* _XOPEN_SOURCE_EXTENDED */
-
-#ifdef _ALL_SOURCE
-
-extern char **environ;
-
-#ifndef _KERNEL
-#ifdef _NO_PROTO
-extern pid_t		f_fork();
-#else /* _NO_PROTO */
-extern pid_t		f_fork(void);
-#endif /* _NO_PROTO */
-#endif	/* _KERNEL */
-
-#ifdef _NO_PROTO
-extern char *		cuserid();
-extern int		ioctl();
-#ifdef __64BIT__
-extern int		ioctlx();
-extern int		ioctl32();
-extern int		ioctl32x();
-#endif /* __64BIT__ */
-extern int		readx();
-extern int		setgroups();
-extern int		writex();
-extern int		setegid();
-extern int		seteuid();
-extern int		setrgid();
-extern int		setruid();
-extern offset_t		llseek();
-extern char *		getusershell();
-extern void		setusershell();
-extern void		endusershell();
-extern char *		get_current_dir_name();
-extern int		sysfs();
-#else
-extern char *		cuserid(char *);
-extern int		setegid(gid_t);
-extern int		seteuid(uid_t);
-extern int		setrgid(gid_t);
-extern int		setruid(uid_t);
-#ifndef _BSD
-extern int		ioctl(int, int, ...);
-#endif /* _BSD */
-#ifdef __64BIT__
-extern int		ioctlx(int, int, void *, long);
-extern int		ioctl32(int, int, ...);
-extern int		ioctl32x(int, int, unsigned int, unsigned int);
-#endif /* __64BIT__ */
-extern int		setgroups(int, gid_t []);
-#ifndef _KERNEL
-extern int	readx(int, char*, unsigned, long);
-extern int	writex(int, char*, unsigned, long);
-
-#ifdef _LARGE_FILES
-#define fclear fclear64
-#define	fsync_range	fsync_range64
-#endif
-extern off_t	fclear(int, off_t);
-extern int	fsync_range(int, int, off_t, off_t);
-#ifdef _LARGE_FILE_API
-extern off64_t	fclear64(int, off64_t);
-extern int	fsync_range64(int, int, off64_t, off64_t);
-#endif
-extern offset_t llseek(int, offset_t, int);
-extern char *	getusershell(void);
-extern void	setusershell(void);
-extern void	endusershell(void);
-extern char *	get_current_dir_name(void);
-extern int	sysfs(int, ...);
-extern int	finfo(const char *, int, void *, int32long64_t);
-extern int	ffinfo(int, int, void *, int32long64_t);
-
-#endif /* ndef _KERNEL */
-
-#endif /* _NO_PROTO */
-
-#define _AES_OS_VERSION 1               /* OSF, AES version */
-
-#endif /* _ALL_SOURCE */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _H_UNISTD */
diff --git a/hpvm/test/parboil/benchmarks/kmeans/tools/compare-output b/hpvm/test/parboil/benchmarks/kmeans/tools/compare-output
deleted file mode 100755
index 54e9f69609..0000000000
--- a/hpvm/test/parboil/benchmarks/kmeans/tools/compare-output
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import sys
-import struct
-
-tol_diff = 0.001
-tol_ratio = 0.002
-
-def Exit(b):
-  if b:
-    print "Pass"
-    sys.exit(0)
-  else:
-    print "Mismatch"
-    sys.exit(1)
-
-def Run():
-  try:
-    hx = open(sys.argv[1], 'rb')
-    hy = open(sys.argv[2], 'rb')
-  except:
-    Exit(False)
-
-  try:
-    # size (int)
-    dx = hx.read(8)
-    dy = hy.read(8)
-
-    lx = struct.unpack("i", dx[0:4])[0]
-    ly = struct.unpack("i", dy[0:4])[0]
-    fx = struct.unpack("i", dx[4:8])[0]
-    fy = struct.unpack("i", dy[4:8])[0]
-  except:
-    Exit(False)
-  
-  data_r = hx.read()
-  data_c = hy.read()
-
-  hx.close()
-  hy.close()
-
-  if lx != ly or fx != fy:
-    print "Reference and compare are different in size"
-    Exit(False)
-  if len(data_r) != 4 * lx * fx:
-    print "Reference: sanity check failed"
-    Exit(False)
-  if len(data_c) != 4 * ly * fy:
-    print "Compare: sanity check failed"
-    Exit(False)
-
-  r = struct.unpack('<'+'f'*(lx*fx), data_r)
-  c = struct.unpack('<'+'f'*(lx*fx), data_c)
-  for i in range(0, lx*fx):
-
-    diff = abs(r[i] - c[i])
-    if not (diff <= tol_diff or diff < tol_ratio * abs(r[i])):
-      print i/fx, i%fx, ":" , r[i] , c[i]
-      Exit(False)
-
-  Exit(True)
-
-Run()
-
diff --git a/hpvm/test/parboil/benchmarks/linear-svm/Makefile b/hpvm/test/parboil/benchmarks/linear-svm/Makefile
deleted file mode 100644
index 2584e27835..0000000000
--- a/hpvm/test/parboil/benchmarks/linear-svm/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil
-APP = linear-svm
-
-# Default compile visc
-ifeq ($(VERSION),)
-  VERSION = visc_cm
-endif
-
-# Default use small test case
-ifeq ($(TEST),)
-  TEST = small
-endif
-
-ifeq ($(PLATFORM),)
-PLATFORM=default
-endif
-
-BIN = $(addsuffix -$(VERSION), $(APP))
-
-SRCDIR = src/$(VERSION)
-BUILDDIR = build/$(VERSION)_$(PLATFORM)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
-
-MATRIX1 = $(DATASET_DIR)/$(TEST)/input/matrix1.txt
-MATRIX2 = $(DATASET_DIR)/$(TEST)/input/matrix2.txt
-REF_OUTPUT = $(DATASET_DIR)/$(TEST)/output/matrix3.txt
-RUNDIR = run/$(VERSION)/$(TEST)
-OUTPUT = $(RUNDIR)/matrix3.txt
-
-ARGS = -i $(MATRIX1),$(MATRIX2) -o $(OUTPUT)
-TOOL = tools/compare-output
-#TOOL=echo
-include $(PARBOIL_ROOT)/common/mk/Makefile
diff --git a/hpvm/test/parboil/benchmarks/linear-svm/linear-svm.visc.ll b/hpvm/test/parboil/benchmarks/linear-svm/linear-svm.visc.ll
deleted file mode 100644
index 1fcd463e46..0000000000
--- a/hpvm/test/parboil/benchmarks/linear-svm/linear-svm.visc.ll
+++ /dev/null
@@ -1,501 +0,0 @@
-; ModuleID = 'build/visc_cm_default/main.ll'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%"class.std::ios_base::Init" = type { i8 }
-%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
-%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
-%struct.RootIn = type <{ float*, i64, float*, i64, float*, i64, i32 }>
-%emptyStruct = type <{}>
-%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] }
-%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* }
-%struct.pb_Timer = type { i32, i64, i64 }
-%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* }
-%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* }
-%"class.std::vector" = type { %"struct.std::_Vector_base" }
-%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" }
-%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* }
-%struct.pb_Parameters = type { i8*, i8** }
-
-@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1
-@__dso_handle = external global i8
-@stderr = external global %struct._IO_FILE*
-@.str = private unnamed_addr constant [31 x i8] c"Expecting two input filenames\0A\00", align 1
-@.str1 = private unnamed_addr constant [51 x i8] c"Xcol == Wcol && \22Width of X and W should be equal\22\00", align 1
-@.str2 = private unnamed_addr constant [20 x i8] c"src/visc_cm/main.cc\00", align 1
-@__PRETTY_FUNCTION__.main = private unnamed_addr constant [23 x i8] c"int main(int, char **)\00", align 1
-@.str3 = private unnamed_addr constant [47 x i8] c"Wrow == 1 && \22Number of rows of W should be 1\22\00", align 1
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
-@viscTimerSet_GenVISC = common global i8* null
-@0 = internal constant [14 x i8] c"GenVISC_Timer\00"
-
-declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0
-
-declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0
-
-; Function Attrs: nounwind
-declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1
-
-; Function Attrs: nounwind uwtable
-define void @_Z8packDataP6RootInPfmS1_mS1_mi(%struct.RootIn* nocapture %args, float* %X, i64 %bytesX, float* %W, i64 %bytesW, float* %Y, i64 %bytesY, i32 %nrows) #2 {
-entry:
-  %X1 = getelementptr inbounds %struct.RootIn* %args, i64 0, i32 0
-  store float* %X, float** %X1, align 1, !tbaa !1
-  %bytesX2 = getelementptr inbounds %struct.RootIn* %args, i64 0, i32 1
-  store i64 %bytesX, i64* %bytesX2, align 1, !tbaa !4
-  %W3 = getelementptr inbounds %struct.RootIn* %args, i64 0, i32 2
-  store float* %W, float** %W3, align 1, !tbaa !1
-  %bytesW4 = getelementptr inbounds %struct.RootIn* %args, i64 0, i32 3
-  store i64 %bytesW, i64* %bytesW4, align 1, !tbaa !4
-  %Y5 = getelementptr inbounds %struct.RootIn* %args, i64 0, i32 4
-  store float* %Y, float** %Y5, align 1, !tbaa !1
-  %bytesY6 = getelementptr inbounds %struct.RootIn* %args, i64 0, i32 5
-  store i64 %bytesY, i64* %bytesY6, align 1, !tbaa !4
-  %nrows7 = getelementptr inbounds %struct.RootIn* %args, i64 0, i32 6
-  store i32 %nrows, i32* %nrows7, align 1, !tbaa !5
-  ret void
-}
-
-; Function Attrs: nounwind uwtable
-define %emptyStruct @_Z10LinearRootPimS_mS_mi(i32* in %X, i64 %bytesX, i32* in %W, i64 %bytesW, i32* out %Y, i64 %bytesY, i32 %nrows) #2 {
-entry:
-  %VL = call i32 @__visc__getVectorLength(i32 4) #1
-  %div = lshr i64 %bytesW, 2
-  %conv = trunc i64 %div to i32
-  %0 = zext i32 %VL to i64
-  %vla = alloca i32, i64 %0, align 16
-  %cmp30 = icmp sgt i32 %nrows, 0
-  br i1 %cmp30, label %for.body.lr.ph, label %for.end11
-
-for.body.lr.ph:                                   ; preds = %entry
-  %div2 = sdiv i32 %conv, %VL
-  %cmp327 = icmp sgt i32 %div2, 0
-  br i1 %cmp327, label %for.body.lr.ph.split.us, label %for.end
-
-for.body.lr.ph.split.us:                          ; preds = %for.body.lr.ph
-  %sext = shl i64 %div, 32
-  %1 = ashr exact i64 %sext, 32
-  br label %for.body4.lr.ph.us
-
-for.end.us:                                       ; preds = %for.body4.us
-  %2 = trunc i64 %indvars.iv32 to i32
-  %div8.us = sdiv i32 %2, %conv
-  %idxprom.us = sext i32 %div8.us to i64
-  %arrayidx.us = getelementptr inbounds i32* %Y, i64 %idxprom.us
-  store i32 %add.us, i32* %arrayidx.us, align 4, !tbaa !5
-  %indvars.iv.next33 = add i64 %indvars.iv32, %1
-  %3 = trunc i64 %indvars.iv.next33 to i32
-  %cmp.us = icmp slt i32 %3, %nrows
-  br i1 %cmp.us, label %for.body4.lr.ph.us, label %for.end11
-
-for.body4.us:                                     ; preds = %for.body4.lr.ph.us, %for.body4.us
-  %indvars.iv = phi i64 [ 0, %for.body4.lr.ph.us ], [ %indvars.iv.next, %for.body4.us ]
-  %dot.028.us = phi i32 [ 0, %for.body4.lr.ph.us ], [ %add.us, %for.body4.us ]
-  %4 = trunc i64 %indvars.iv to i32
-  %mul.us = mul nsw i32 %4, %VL
-  %idx.ext5.us = sext i32 %mul.us to i64
-  %add.ptr.sum.us = add i64 %idx.ext5.us, %indvars.iv32
-  %add.ptr6.us = getelementptr inbounds i32* %X, i64 %add.ptr.sum.us
-  %Xi.vec = load <%VL x i32> %add.ptr6.us
-  %W.vec = load <%VL x i32> %W
-  %temp = fmul <%VL x i32> %Xi.vec, %W.vec
-  %call7.us = call i32 @__visc__reduction_sum(<%VL x i32> %temp)
-  %add.us = add nsw i32 %call7.us, %dot.028.us
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %div2
-  br i1 %exitcond, label %for.end.us, label %for.body4.us
-
-for.body4.lr.ph.us:                               ; preds = %for.end.us, %for.body.lr.ph.split.us
-  %indvars.iv32 = phi i64 [ %indvars.iv.next33, %for.end.us ], [ 0, %for.body.lr.ph.split.us ]
-  br label %for.body4.us
-
-for.end:                                          ; preds = %for.end, %for.body.lr.ph
-  %rowID.031 = phi i32 [ %add10, %for.end ], [ 0, %for.body.lr.ph ]
-  %div8 = sdiv i32 %rowID.031, %conv
-  %idxprom = sext i32 %div8 to i64
-  %arrayidx = getelementptr inbounds i32* %Y, i64 %idxprom
-  store i32 0, i32* %arrayidx, align 4, !tbaa !5
-  %add10 = add nsw i32 %rowID.031, %conv
-  %cmp = icmp slt i32 %add10, %nrows
-  br i1 %cmp, label %for.end, label %for.end11
-
-for.end11:                                        ; preds = %for.end, %for.end.us, %entry
-  ret %emptyStruct undef
-}
-
-declare i32 @__visc__getVectorLength(i32) #0
-
-declare void @__visc__vector_mul(i32*, i32*, i32*, i32) #0
-
-declare i32 @__visc__reduction_sum(i32*, i32) #0
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** %argv) #2 {
-entry:
-  %argc.addr = alloca i32, align 4
-  %timers = alloca %struct.pb_TimerSet, align 8
-  %Xrow = alloca i32, align 4
-  %Xcol = alloca i32, align 4
-  %Wrow = alloca i32, align 4
-  %Wcol = alloca i32, align 4
-  %X = alloca %"class.std::vector", align 8
-  %W = alloca %"class.std::vector", align 8
-  %Y = alloca %"class.std::vector", align 8
-  store i32 %argc, i32* %argc.addr, align 4, !tbaa !5
-  %0 = bitcast %struct.pb_TimerSet* %timers to i8*
-  call void @llvm.lifetime.start(i64 800, i8* %0) #1
-  %1 = bitcast %"class.std::vector"* %X to i8*
-  call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1
-  %2 = bitcast %"class.std::vector"* %W to i8*
-  call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1
-  %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1
-  %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1
-  %3 = load i8*** %inpFiles, align 8, !tbaa !1
-  %4 = load i8** %3, align 8, !tbaa !1
-  %cmp = icmp eq i8* %4, null
-  br i1 %cmp, label %if.then, label %lor.lhs.false
-
-lor.lhs.false:                                    ; preds = %entry
-  %arrayidx2 = getelementptr inbounds i8** %3, i64 1
-  %5 = load i8** %arrayidx2, align 8, !tbaa !1
-  %cmp3 = icmp eq i8* %5, null
-  br i1 %cmp3, label %if.then, label %lor.lhs.false4
-
-lor.lhs.false4:                                   ; preds = %lor.lhs.false
-  %arrayidx6 = getelementptr inbounds i8** %3, i64 2
-  %6 = load i8** %arrayidx6, align 8, !tbaa !1
-  %cmp7 = icmp eq i8* %6, null
-  br i1 %cmp7, label %if.end, label %if.then
-
-if.then:                                          ; preds = %lor.lhs.false4, %lor.lhs.false, %entry
-  %7 = load %struct._IO_FILE** @stderr, align 8, !tbaa !1
-  %8 = call i64 @fwrite(i8* getelementptr inbounds ([31 x i8]* @.str, i64 0, i64 0), i64 30, i64 1, %struct._IO_FILE* %7)
-  call void @exit(i32 -1) #6
-  unreachable
-
-if.end:                                           ; preds = %lor.lhs.false4
-  %call11 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %4, i32* %Xrow, i32* %Xcol, %"class.std::vector"* %X) #1
-  %9 = load i8*** %inpFiles, align 8, !tbaa !1
-  %arrayidx13 = getelementptr inbounds i8** %9, i64 1
-  %10 = load i8** %arrayidx13, align 8, !tbaa !1
-  %call14 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %10, i32* %Wrow, i32* %Wcol, %"class.std::vector"* %W) #1
-  %11 = load i32* %Xcol, align 4, !tbaa !5
-  %12 = load i32* %Wcol, align 4, !tbaa !5
-  %cmp15 = icmp eq i32 %11, %12
-  br i1 %cmp15, label %cond.end, label %cond.false
-
-cond.false:                                       ; preds = %if.end
-  call void @__assert_fail(i8* getelementptr inbounds ([51 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([20 x i8]* @.str2, i64 0, i64 0), i32 130, i8* getelementptr inbounds ([23 x i8]* @__PRETTY_FUNCTION__.main, i64 0, i64 0)) #6
-  unreachable
-
-cond.end:                                         ; preds = %if.end
-  %13 = load i32* %Wrow, align 4, !tbaa !5
-  %cmp16 = icmp eq i32 %13, 1
-  br i1 %cmp16, label %cond.end19, label %cond.false18
-
-cond.false18:                                     ; preds = %cond.end
-  call void @__assert_fail(i8* getelementptr inbounds ([47 x i8]* @.str3, i64 0, i64 0), i8* getelementptr inbounds ([20 x i8]* @.str2, i64 0, i64 0), i32 131, i8* getelementptr inbounds ([23 x i8]* @__PRETTY_FUNCTION__.main, i64 0, i64 0)) #6
-  unreachable
-
-cond.end19:                                       ; preds = %cond.end
-  call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  %14 = call i8* @llvm_visc_initializeTimerSet()
-  store i8* %14, i8** @viscTimerSet_GenVISC
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  call void @llvm.visc.init()
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %15 = load i32* %Xrow, align 4, !tbaa !5
-  %16 = load i32* %Xcol, align 4, !tbaa !5
-  %mul = mul nsw i32 %16, %15
-  %conv = sext i32 %mul to i64
-  %mul20 = shl nsw i64 %conv, 2
-  %17 = load i32* %Wcol, align 4, !tbaa !5
-  %conv21 = sext i32 %17 to i64
-  %mul22 = shl nsw i64 %conv21, 2
-  %conv23 = sext i32 %15 to i64
-  %mul24 = shl nsw i64 %conv23, 2
-  %18 = bitcast %"class.std::vector"* %Y to i8*
-  call void @llvm.memset.p0i8.i64(i8* %18, i8 0, i64 24, i32 8, i1 false) #1
-  %cmp.i.i.i.i = icmp eq i32 %15, 0
-  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i
-
-cond.true.i.i.i.i:                                ; preds = %cond.end19
-  %cmp.i.i.i.i.i = icmp slt i32 %15, 0
-  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !6
-
-if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
-  call void @_ZSt17__throw_bad_allocv() #6
-  unreachable
-
-_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i
-  %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul24) #1
-  %19 = bitcast i8* %call2.i.i.i.i.i to float*
-  br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-
-_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i:    ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %cond.end19
-  %cond.i.i.i.i = phi float* [ %19, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %cond.end19 ]
-  %_M_start.i.i.i68 = getelementptr inbounds %"class.std::vector"* %Y, i64 0, i32 0, i32 0, i32 0
-  store float* %cond.i.i.i.i, float** %_M_start.i.i.i68, align 8, !tbaa !1
-  %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %Y, i64 0, i32 0, i32 0, i32 1
-  store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !1
-  %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv23
-  %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %Y, i64 0, i32 0, i32 0, i32 2
-  store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !1
-  br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i
-
-for.body.lr.ph.i.i.i.i.i.i.i.i:                   ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-  %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv23, 7
-  %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv23, %n.mod.vf.i.i.i.i.i.i.i.i
-  %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv23
-  %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i
-  br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
-
-vector.body.i.i.i.i.i.i.i.i:                      ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
-  %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ]
-  %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i
-  %20 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>*
-  store <4 x float> zeroinitializer, <4 x float>* %20, align 4
-  %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4
-  %21 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i
-  %22 = bitcast float* %21 to <4 x float>*
-  store <4 x float> zeroinitializer, <4 x float>* %22, align 4
-  %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8
-  %23 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i
-  br i1 %23, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
-
-middle.block.i.i.i.i.i.i.i.i:                     ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
-  %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv23, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv23
-  br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader
-
-for.body.i.i.i.i.i.i.i.i.preheader:               ; preds = %middle.block.i.i.i.i.i.i.i.i
-  %resume.val.i.i.i.i.i.i.i.i86 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8*
-  %24 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2
-  call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i86, i8 0, i64 %24, i32 4, i1 false)
-  br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-
-_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit:            ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-  store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !1
-  %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %X, i64 0, i32 0, i32 0, i32 0
-  %25 = load float** %_M_start.i.i, align 8, !tbaa !1
-  %26 = bitcast float* %25 to i8*
-  call void @llvm_visc_track_mem(i8* %26, i64 %mul20) #1
-  %_M_start.i.i78 = getelementptr inbounds %"class.std::vector"* %W, i64 0, i32 0, i32 0, i32 0
-  %27 = load float** %_M_start.i.i78, align 8, !tbaa !1
-  %28 = bitcast float* %27 to i8*
-  call void @llvm_visc_track_mem(i8* %28, i64 %mul22) #1
-  %29 = load float** %_M_start.i.i.i68, align 8, !tbaa !1
-  %30 = bitcast float* %29 to i8*
-  call void @llvm_visc_track_mem(i8* %30, i64 %mul24) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %31 = load float** %_M_finish.i.i.i, align 8, !tbaa !1
-  %32 = load float** %_M_start.i.i.i68, align 8, !tbaa !1
-  %cmp3184 = icmp eq float* %31, %32
-  br i1 %cmp3184, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-  %sub.ptr.lhs.cast.i = ptrtoint float* %31 to i64
-  %sub.ptr.rhs.cast.i = ptrtoint float* %32 to i64
-  %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i
-  %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %i.085 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %add.ptr.i = getelementptr inbounds float* %32, i64 %i.085
-  store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !7
-  %inc = add i64 %i.085, 1
-  %cmp31 = icmp ult i64 %inc, %sub.ptr.div.i
-  br i1 %cmp31, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-  %33 = phi float* [ %31, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit ], [ %32, %for.body ]
-  %call33 = call noalias i8* @malloc(i64 52) #1
-  %34 = load float** %_M_start.i.i, align 8, !tbaa !1
-  %35 = load float** %_M_start.i.i78, align 8, !tbaa !1
-  %36 = load i32* %Xrow, align 4, !tbaa !5
-  %X1.i = bitcast i8* %call33 to float**
-  store float* %34, float** %X1.i, align 1, !tbaa !1
-  %bytesX2.i = getelementptr inbounds i8* %call33, i64 8
-  %37 = bitcast i8* %bytesX2.i to i64*
-  store i64 %mul20, i64* %37, align 1, !tbaa !4
-  %W3.i = getelementptr inbounds i8* %call33, i64 16
-  %38 = bitcast i8* %W3.i to float**
-  store float* %35, float** %38, align 1, !tbaa !1
-  %bytesW4.i = getelementptr inbounds i8* %call33, i64 24
-  %39 = bitcast i8* %bytesW4.i to i64*
-  store i64 %mul22, i64* %39, align 1, !tbaa !4
-  %Y5.i = getelementptr inbounds i8* %call33, i64 32
-  %40 = bitcast i8* %Y5.i to float**
-  store float* %33, float** %40, align 1, !tbaa !1
-  %bytesY6.i = getelementptr inbounds i8* %call33, i64 40
-  %41 = bitcast i8* %bytesY6.i to i64*
-  store i64 %mul24, i64* %41, align 1, !tbaa !4
-  %nrows7.i = getelementptr inbounds i8* %call33, i64 48
-  %42 = bitcast i8* %nrows7.i to i32*
-  store i32 %36, i32* %42, align 1, !tbaa !5
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 21) #1
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%emptyStruct (i32*, i64, i32*, i64, i32*, i64, i32)* @_Z10LinearRootPimS_mS_mi to i8*), i8* %call33, i1 false)
-  call void @llvm.visc.wait(i8* %graphID)
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1
-  %43 = load float** %_M_start.i.i.i68, align 8, !tbaa !1
-  %44 = bitcast float* %43 to i8*
-  call void @llvm_visc_request_mem(i8* %44, i64 %mul24) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %45 = load float** %_M_start.i.i, align 8, !tbaa !1
-  %46 = bitcast float* %45 to i8*
-  call void @llvm_visc_untrack_mem(i8* %46) #1
-  %47 = load float** %_M_start.i.i78, align 8, !tbaa !1
-  %48 = bitcast float* %47 to i8*
-  call void @llvm_visc_untrack_mem(i8* %48) #1
-  %49 = load float** %_M_start.i.i.i68, align 8, !tbaa !1
-  %50 = bitcast float* %49 to i8*
-  call void @llvm_visc_untrack_mem(i8* %50) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0
-  call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr)
-  call void @llvm.visc.cleanup()
-  %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0
-  %51 = load i8** %outFile, align 8, !tbaa !1
-  %tobool = icmp eq i8* %51, null
-  br i1 %tobool, label %if.end45, label %if.then42
-
-if.then42:                                        ; preds = %for.end
-  %52 = load i32* %Xrow, align 4, !tbaa !5
-  %call44 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %51, i32 %52, i32 1, %"class.std::vector"* %Y) #1
-  br label %if.end45
-
-if.end45:                                         ; preds = %if.then42, %for.end
-  call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1
-  %53 = load float** %_M_start.i.i.i68, align 8, !tbaa !1
-  %tobool.i.i.i.i65 = icmp eq float* %53, null
-  br i1 %tobool.i.i.i.i65, label %_ZNSt6vectorIfSaIfEED1Ev.exit67, label %if.then.i.i.i.i66
-
-if.then.i.i.i.i66:                                ; preds = %if.end45
-  %54 = bitcast float* %53 to i8*
-  call void @_ZdlPv(i8* %54) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit67
-
-_ZNSt6vectorIfSaIfEED1Ev.exit67:                  ; preds = %if.then.i.i.i.i66, %if.end45
-  %55 = load float** %_M_start.i.i78, align 8, !tbaa !1
-  %tobool.i.i.i.i61 = icmp eq float* %55, null
-  br i1 %tobool.i.i.i.i61, label %_ZNSt6vectorIfSaIfEED1Ev.exit63, label %if.then.i.i.i.i62
-
-if.then.i.i.i.i62:                                ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit67
-  %56 = bitcast float* %55 to i8*
-  call void @_ZdlPv(i8* %56) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit63
-
-_ZNSt6vectorIfSaIfEED1Ev.exit63:                  ; preds = %if.then.i.i.i.i62, %_ZNSt6vectorIfSaIfEED1Ev.exit67
-  %57 = load float** %_M_start.i.i, align 8, !tbaa !1
-  %tobool.i.i.i.i = icmp eq float* %57, null
-  br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i
-
-if.then.i.i.i.i:                                  ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit63
-  %58 = bitcast float* %57 to i8*
-  call void @_ZdlPv(i8* %58) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit
-
-_ZNSt6vectorIfSaIfEED1Ev.exit:                    ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit63
-  call void @llvm.lifetime.end(i64 800, i8* %0) #1
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-
-declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0
-
-; Function Attrs: noreturn nounwind
-declare void @exit(i32) #3
-
-declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0
-
-; Function Attrs: noreturn nounwind
-declare void @__assert_fail(i8*, i8*, i32, i8*) #3
-
-declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0
-
-declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0
-
-declare void @llvm_visc_track_mem(i8*, i64) #0
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #4
-
-declare void @llvm_visc_request_mem(i8*, i64) #0
-
-declare void @llvm_visc_untrack_mem(i8*) #0
-
-declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0
-
-declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0
-
-declare void @pb_FreeParameters(%struct.pb_Parameters*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
-
-; Function Attrs: noreturn
-declare void @_ZSt17__throw_bad_allocv() #5
-
-declare noalias i8* @_Znwm(i64) #0
-
-; Function Attrs: nounwind
-declare void @_ZdlPv(i8*) #4
-
-; Function Attrs: nounwind
-define internal void @_GLOBAL__I_a() #1 section ".text.startup" {
-entry:
-  tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1
-  %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
-
-; Function Attrs: nounwind
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
-
-declare i8* @llvm_visc_initializeTimerSet()
-
-declare void @llvm_visc_switchToTimer(i8**, i32)
-
-declare void @llvm_visc_printTimerSet(i8**, i8*)
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #3 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #4 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #6 = { noreturn nounwind }
-
-!visc_hint_gpu = !{!0}
-
-!0 = metadata !{%emptyStruct (i32*, i64, i32*, i64, i32*, i64, i32)* @_Z10LinearRootPimS_mS_mi}
-!1 = metadata !{metadata !"any pointer", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA"}
-!4 = metadata !{metadata !"long", metadata !2}
-!5 = metadata !{metadata !"int", metadata !2}
-!6 = metadata !{metadata !"branch_weights", i32 4, i32 64}
-!7 = metadata !{metadata !"float", metadata !2}
diff --git a/hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/Makefile b/hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/Makefile
deleted file mode 100644
index 870b501b17..0000000000
--- a/hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-APP_OPTFLAGS=-unroll-threshold=300 -loop-unroll -scalarrepl
diff --git a/hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/io.cc b/hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/main.cc b/hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/main.cc
deleted file mode 100644
index 1877c6f0d8..0000000000
--- a/hpvm/test/parboil/benchmarks/linear-svm/src/visc_cm/main.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-typedef struct __attribute__((__packed__)) {
-    float *X;
-    size_t bytesX;
-    float *W;
-    size_t bytesW;
-    float *Y;
-    size_t bytesY;
-    int nrows;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *X, size_t bytesX,
-              float *W, size_t bytesW,
-              float *Y, size_t bytesY,
-              int nrows
-              ) {
-    args->X = X;
-    args->bytesX = bytesX;
-    args->W = W;
-    args->bytesW = bytesW;
-    args->Y = Y;
-    args->bytesY = bytesY;
-    args->nrows = nrows;
-}
-
-void LinearRoot( float* X, size_t bytesX, float* W, size_t bytesW, float* Y, size_t bytesY, int nrows)
-{
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(2, X, W, 1, Y);
-
-    //void* thisNode = __visc__getNode();
-
-    //int rowID = __visc__getNodeInstanceID_x(thisNode);
-
-    int length = __visc__getVectorLength();
-    float temp[length];
-    for(int rowID = 0; rowID < nrows; rowID++) {
-      int offsetX = rowID*length;
-      __visc__vector_mul(X+offsetX, W, temp, length);
-      Y[rowID] = __visc__reduction_sum(temp, length);
-    }
-}
-
-// Root node for linear SVM
-//void LinearRoot(float *X, size_t bytesX,
-               //float *W, size_t bytesW,
-               //float *Y, size_t bytesY,
-               //int nrows
-               //) {
-    //__visc__hint(visc::DEVICE);
-    //__visc__attributes(2, X, W, 1, Y);
-    //void* LinearLeafNode = __visc__createNode1D(LinearLeaf, nrows);
-
-    //Bind edges
-    //__visc__bindIn(LinearLeafNode, 0, 0, 0); // Bind X
-    //__visc__bindIn(LinearLeafNode, 1, 1, 0); // Bind bytesX
-    //__visc__bindIn(LinearLeafNode, 2, 2, 0); // Bind W
-    //__visc__bindIn(LinearLeafNode, 3, 3, 0); // Bind bytesW
-    //__visc__bindIn(LinearLeafNode, 4, 4, 0); // Bind Y
-    //__visc__bindIn(LinearLeafNode, 5, 5, 0); // Bind bytesY
-
-//}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t X_sz, Y_sz, W_sz;
-    int Xrow, Xcol, Wrow, Wcol;
-    std::vector<float> X, W;
-
-    /* Read command line. Expect 2 inputs: X, W
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] != NULL))
-    {
-        fprintf(stderr, "Expecting two input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load X
-    readColMajorMatrixFile(params->inpFiles[0],
-                           Xrow, Xcol, X);
-
-    // load W
-    readColMajorMatrixFile(params->inpFiles[1],
-                           Wrow, Wcol, W);
-
-    assert (Xcol == Wcol && "Width of X and W should be equal");
-    assert (Wrow == 1 && "Number of rows of W should be 1");
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    X_sz = Xrow*Xcol*sizeof(float);
-    W_sz = Wcol*sizeof(float);
-
-    Y_sz = Xrow*sizeof(float);
-
-    std::vector<float> Y(Xrow);
-
-    llvm_visc_track_mem(&X.front(), X_sz);
-    llvm_visc_track_mem(&W.front(), W_sz);
-    llvm_visc_track_mem(&Y.front(), Y_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<Y.size(); i++)
-        Y[i] = 0.0f;
-
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             &X.front(), X_sz,
-             &W.front(), W_sz,
-             &Y.front(), Y_sz,
-             Xrow
-            );
-
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION );
-    void* linearDFG = __visc__launch(0, LinearRoot, (void*) args);
-
-    __visc__wait(linearDFG);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE );
-
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&Y.front(), Y_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&X.front());
-    llvm_visc_untrack_mem(&W.front());
-    llvm_visc_untrack_mem(&Y.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write Y to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                Xrow, 1, Y);
-    }
-
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/llvm-40-34.py b/hpvm/test/parboil/benchmarks/llvm-40-34.py
deleted file mode 100644
index 79cbc0e760..0000000000
--- a/hpvm/test/parboil/benchmarks/llvm-40-34.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import sys
-import re
-
-regexSourceFile = re.compile(r"^source_filename.*")
-regexMetadataLine = re.compile(r"^![0-9].*")
-regexGEP = re.compile(r"(?P<prefix>.*)getelementptr(?P<attr>[A-Za-z ]*)? [<0-9 ]*[A-Zx>a-z 0-9%\.]*,")
-regexGEPArr = re.compile(r"(?P<prefix>.*)getelementptr(?P<attr>[A-Za-z ]*)? \[.*],")
-regexLoad = re.compile(r"(?P<prefix>.*)= load [<0-9 ]*[A-zx> 0-9%\.]*,")
-regexFast = re.compile(r"(?P<prefix>.*)fast")
-regexWriteOnly = re.compile(r"(?P<prefix>.*)writeonly")
-regexArgmemonly = re.compile(r"(?P<prefix>.*)argmemonly")
-regexNonnull = re.compile(r"(?P<prefix>.*)nonnull")
-
-with open(sys.argv[1], 'r') as file_LLVM_40:
-    with open(sys.argv[2], "w") as file_LLVM_34:
-        for line in file_LLVM_40:
-            newLine = line
-            if (regexSourceFile.match(line)):
-                newLine = regexSourceFile.sub("", line)
-            elif (regexMetadataLine.match(line)):
-                newLine = "".join((line[0], line[1:].replace(line[0], "metadata !")))
-                newLine = newLine.replace("distinct", "")
-            elif (regexGEPArr.match(line)):
-                newLine = regexGEPArr.sub("\g<prefix>getelementptr\g<attr>", line)
-            elif (regexGEP.match(line)):
-                newLine = regexGEP.sub("\g<prefix>getelementptr\g<attr>", line)
-            elif (regexLoad.match(line)):
-                newLine = regexLoad.sub("\g<prefix>= load", line)
-            elif (regexFast.match(line)):
-                newLine = regexFast.sub("\g<prefix>", line)
-            elif (regexWriteOnly.match(line)):
-                newLine = regexWriteOnly.sub("\g<prefix>", line)
-            elif (regexArgmemonly.match(line)):
-                newLine = regexArgmemonly.sub("\g<prefix>", line)
-            elif (regexNonnull.match(line)):
-                newLine = regexNonnull.sub("\g<prefix>", line)
-            file_LLVM_34.write(newLine)
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/DESCRIPTION b/hpvm/test/parboil/benchmarks/merge-tests/DESCRIPTION
deleted file mode 100644
index 3ecb1184ad..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/DESCRIPTION
+++ /dev/null
@@ -1,10 +0,0 @@
-A register-tiled matrix-matrix multiplication, with default column-majored
-layout on matrix A and C, but B is transposed.
-
-src/cuda/
-        CUDA version of sgemm('N', 'T', ...)
-
-See also:
-
-Volkov, V., and Demmel, J. W. 2008. Benchmarking GPUs to tune dense linear      
-algebra, 2008 ACM/IEEE Conference on Supercomputing (SC08)
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/Makefile
deleted file mode 100644
index 6f0f7a5942..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/Makefile
+++ /dev/null
@@ -1,34 +0,0 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil
-APP = merge-test
-
-# Default compile visc
-ifeq ($(VERSION),)
-  VERSION = 2ILeaf
-endif
-
-# Default use small test case
-ifeq ($(TEST),)
-  TEST = small
-endif
-
-ifeq ($(PLATFORM),)
-PLATFORM=default
-endif
-
-BIN = $(addsuffix -$(VERSION), $(APP))
-
-SRCDIR = src/$(VERSION)
-BUILDDIR = build/$(VERSION)_$(PLATFORM)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
-
-MATRIX1 = $(DATASET_DIR)/$(TEST)/input/matrix1.txt
-MATRIX2 = $(DATASET_DIR)/$(TEST)/input/matrix2.txt
-MATRIX2T = $(DATASET_DIR)/$(TEST)/input/matrix2t.txt
-REF_OUTPUT = $(DATASET_DIR)/$(TEST)/output/matrix3.txt
-RUNDIR = run/$(VERSION)/$(TEST)
-OUTPUT = $(RUNDIR)/matrix3.txt
-
-ARGS = -i $(MATRIX1),$(MATRIX2),$(MATRIX2T) -o $(OUTPUT)
-TOOL = tools/compare-output
-#TOOL=echo
-include $(PARBOIL_ROOT)/common/mk/Makefile
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/Makefile
deleted file mode 100644
index 2d80f39a97..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/io.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/main.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/main.cc
deleted file mode 100644
index cca5003cde..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2DLeaf/main.cc
+++ /dev/null
@@ -1,286 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-
-typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    float *B;
-    size_t bytesB;
-    float *C;
-    size_t bytesC;
-    float *D;
-    size_t bytesD;
-    int grid_x;
-    int grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              float *B, size_t bytesB,
-              float *C, size_t bytesC,
-              float *D, size_t bytesD,
-              int grid_x,
-              int grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->D = D;
-    args->bytesD = bytesD;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
-}
-
-void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    C[lx+ly*dimx] = C[lx+ly*dimx] + A[lx+ly*dimx] * B[lx+ly*dimx];
-    __visc__return(bytesA);
-}
-
-void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-
-    void* thisNode = __visc__getNode();
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    D[lx+ly*dimx] = D[lx+ly*dimx] + A[lx+ly*dimx] + B[lx+ly*dimx];
-    __visc__return(bytesA);
-}
-
-// Root node for sgemm - Creates thread block node
-void Root(float *A, size_t bytesA,
-               float *B, size_t bytesB,
-               float *C, size_t bytesC,
-               float *D, size_t bytesD,
-               int grid_x,
-               int grid_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, A, B, C, D, 2, C, D);
-    void* LeafMulNode = __visc__createNode2D(LeafMul, grid_x, grid_y);
-    void* LeafSumNode = __visc__createNode2D(LeafSum, grid_x, grid_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
-
-    // Bind inputs
-    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
-    //__visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA [Pass by edge]
-    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafSumNode, 6, 4, 0); // Bind D
-    __visc__bindIn(LeafSumNode, 7, 5, 0); // Bind bytesD
-
-    // Edges
-    __visc__edge(LeafMulNode, LeafSumNode, 0, 0, 1, 0); // edge bytesA
-
-    //TODO: bindOut : for now with out attribute
-    __visc__bindOut(LeafSumNode, 0, 0, 0); // bind output bytesA
-}
-
-// Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
-
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
-
-//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-
-    int grid_x = m;
-    int grid_y = n;
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             B, bytesB,
-             C, bytesC,
-             D, bytesD,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* DFG = __visc__launch(0, Root, (void*) args);
-
-    __visc__wait(DFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
-}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz, D_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
-
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
-    D_sz = matArow*matBcol*sizeof(float);
-
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
-    std::vector<float> matD(matArow*matBcol);
-
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    llvm_visc_track_mem(&matD.front(), D_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
-
-    for(size_t i=0; i<matD.size(); i++)
-        matD[i] = 0.0f;
-
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
-    llvm_visc_request_mem(&matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    llvm_visc_untrack_mem(&matD.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
-
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/Makefile
deleted file mode 100644
index 2d80f39a97..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/io.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/main.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/main.cc
deleted file mode 100644
index 3e9eef583e..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeaf/main.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-
-typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    float *B;
-    size_t bytesB;
-    float *C;
-    size_t bytesC;
-    float *D;
-    size_t bytesD;
-    int grid_x;
-    int grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              float *B, size_t bytesB,
-              float *C, size_t bytesC,
-              float *D, size_t bytesD,
-              int grid_x,
-              int grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->D = D;
-    args->bytesD = bytesD;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
-}
-
-void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    C[lx+ly*dimx] = C[lx+ly*dimx] + A[lx+ly*dimx] * B[lx+ly*dimx];
-    __visc__return(bytesC);
-}
-
-void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-
-    void* thisNode = __visc__getNode();
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    D[lx+ly*dimx] = D[lx+ly*dimx] + A[lx+ly*dimx] + B[lx+ly*dimx];
-    __visc__return(bytesD);
-}
-
-// Root node for sgemm - Creates thread block node
-void Root(float *A, size_t bytesA,
-               float *B, size_t bytesB,
-               float *C, size_t bytesC,
-               float *D, size_t bytesD,
-               int grid_x,
-               int grid_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, A, B, C, D, 2, C, D);
-    void* LeafMulNode = __visc__createNode2D(LeafMul, grid_x, grid_y);
-    void* LeafSumNode = __visc__createNode2D(LeafSum, grid_x, grid_y);
-
-    // Bind edges
-    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
-
-    // Bind edges
-    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafSumNode, 6, 4, 0); // Bind D
-    __visc__bindIn(LeafSumNode, 7, 5, 0); // Bind bytesD
-
-    //TODO: bindOut : for now with out attribute
-    __visc__bindOut(LeafMulNode, 0, 0, 0); // bind output bytesC
-    __visc__bindOut(LeafSumNode, 0, 1, 0); // bind output bytesD
-}
-
-// Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
-
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
-
-//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-
-    int grid_x = m;
-    int grid_y = n;
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             B, bytesB,
-             C, bytesC,
-             D, bytesD,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* DFG = __visc__launch(0, Root, (void*) args);
-
-    __visc__wait(DFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
-}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz, D_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
-
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
-    D_sz = matArow*matBcol*sizeof(float);
-
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
-    std::vector<float> matD(matArow*matBcol);
-
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    llvm_visc_track_mem(&matD.front(), D_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
-
-    for(size_t i=0; i<matD.size(); i++)
-        matD[i] = 0.0f;
-
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
-    llvm_visc_request_mem(&matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    llvm_visc_untrack_mem(&matD.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
-
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/Makefile
deleted file mode 100644
index 2d80f39a97..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/io.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/main.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/main.cc
deleted file mode 100644
index ca1449a3df..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafD/main.cc
+++ /dev/null
@@ -1,296 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-
-typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    float *B;
-    size_t bytesB;
-    float *C;
-    size_t bytesC;
-    float *D;
-    size_t bytesD;
-    int grid_x;
-    int grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              float *B, size_t bytesB,
-              float *C, size_t bytesC,
-              float *D, size_t bytesD,
-              int grid_x,
-              int grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->D = D;
-    args->bytesD = bytesD;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
-}
-
-void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    C[lx+ly*dimx] = C[lx+ly*dimx] + A[lx+ly*dimx] * B[lx+ly*dimx];
-    __visc__return(bytesC);
-}
-
-void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-
-    void* thisNode = __visc__getNode();
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    D[lx+ly*dimx] = D[lx+ly*dimx] + A[lx+ly*dimx] + B[lx+ly*dimx];
-    __visc__return(bytesD);
-}
-
-void LeafDest(size_t bytesC, size_t bytesD) {
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(0, 0);
-
-    __visc__return(bytesC, bytesD);
-}
-
-// Root node for sgemm - Creates thread block node
-void Root(float *A, size_t bytesA,
-               float *B, size_t bytesB,
-               float *C, size_t bytesC,
-               float *D, size_t bytesD,
-               int grid_x,
-               int grid_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, A, B, C, D, 2, C, D);
-    void* LeafMulNode = __visc__createNode2D(LeafMul, grid_x, grid_y);
-    void* LeafSumNode = __visc__createNode2D(LeafSum, grid_x, grid_y);
-    void* LeafDestNode = __visc__createNode(LeafDest);
-
-    // Bind edges
-    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
-
-    // Bind edges
-    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafSumNode, 6, 4, 0); // Bind D
-    __visc__bindIn(LeafSumNode, 7, 5, 0); // Bind bytesD
-
-    // Bind Edges
-    __visc__edge(LeafMulNode, LeafDestNode, 1, 0, 0, 0); // Bind bytesC
-    __visc__edge(LeafSumNode, LeafDestNode, 1, 0, 1, 0); // Bind bytesD
-
-    //TODO: bindOut : for now with out attribute
-    __visc__bindOut(LeafDestNode, 0, 0, 0); // bind output bytesC
-    __visc__bindOut(LeafDestNode, 1, 1, 0); // bind output bytesD
-}
-
-// Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
-
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
-
-//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-
-    int grid_x = m;
-    int grid_y = n;
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             B, bytesB,
-             C, bytesC,
-             D, bytesD,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* DFG = __visc__launch(0, Root, (void*) args);
-
-    __visc__wait(DFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
-}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz, D_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
-
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
-    D_sz = matArow*matBcol*sizeof(float);
-
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
-    std::vector<float> matD(matArow*matBcol);
-
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    llvm_visc_track_mem(&matD.front(), D_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
-
-    for(size_t i=0; i<matD.size(); i++)
-        matD[i] = 0.0f;
-
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
-    llvm_visc_request_mem(&matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    llvm_visc_untrack_mem(&matD.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
-
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/Makefile
deleted file mode 100644
index 2d80f39a97..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/io.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/main.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/main.cc
deleted file mode 100644
index 4211cf6fb0..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2ILeafS/main.cc
+++ /dev/null
@@ -1,296 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-
-typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    float *B;
-    size_t bytesB;
-    float *C;
-    size_t bytesC;
-    float *D;
-    size_t bytesD;
-    int grid_x;
-    int grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              float *B, size_t bytesB,
-              float *C, size_t bytesC,
-              float *D, size_t bytesD,
-              int grid_x,
-              int grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->D = D;
-    args->bytesD = bytesD;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
-}
-
-void LeafSrc( size_t bytesC, size_t bytesD) {
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(0, 0);
-
-    __visc__return(bytesC, bytesD);
-}
-
-void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    C[lx+ly*dimx] = C[lx+ly*dimx] + A[lx+ly*dimx] * B[lx+ly*dimx];
-    __visc__return(bytesC);
-}
-
-void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-
-    void* thisNode = __visc__getNode();
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    D[lx+ly*dimx] = D[lx+ly*dimx] + A[lx+ly*dimx] + B[lx+ly*dimx];
-    __visc__return(bytesD);
-}
-
-// Root node for sgemm - Creates thread block node
-void Root(float *A, size_t bytesA,
-               float *B, size_t bytesB,
-               float *C, size_t bytesC,
-               float *D, size_t bytesD,
-               int grid_x,
-               int grid_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, A, B, C, D, 2, C, D);
-    void* LeafSrcNode = __visc__createNode(LeafSrc);
-    void* LeafMulNode = __visc__createNode2D(LeafMul, grid_x, grid_y);
-    void* LeafSumNode = __visc__createNode2D(LeafSum, grid_x, grid_y);
-
-    // Bind edges
-    __visc__edge(LeafSrcNode, LeafMulNode, 1, 0, 5, 0); // Bind bytesC
-    __visc__edge(LeafSrcNode, LeafSumNode, 1, 1, 5, 0); // Bind bytesD
-
-    // Bind edges
-    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
-    //__visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
-
-    // Bind edges
-    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafSumNode, 6, 4, 0); // Bind D
-    //__visc__bindIn(LeafSumNode, 7, 5, 0); // Bind bytesD
-
-    //TODO: bindOut : for now with out attribute
-    __visc__bindOut(LeafMulNode, 0, 0, 0); // bind output bytesC
-    __visc__bindOut(LeafSumNode, 0, 1, 0); // bind output bytesD
-}
-
-// Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
-
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
-
-//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-
-    int grid_x = m;
-    int grid_y = n;
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             B, bytesB,
-             C, bytesC,
-             D, bytesD,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* DFG = __visc__launch(0, Root, (void*) args);
-
-    __visc__wait(DFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
-}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz, D_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
-
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
-    D_sz = matArow*matBcol*sizeof(float);
-
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
-    std::vector<float> matD(matArow*matBcol);
-
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    llvm_visc_track_mem(&matD.front(), D_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
-
-    for(size_t i=0; i<matD.size(); i++)
-        matD[i] = 0.0f;
-
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
-    llvm_visc_request_mem(&matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    llvm_visc_untrack_mem(&matD.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
-
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/Makefile
deleted file mode 100644
index 2d80f39a97..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/io.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/main.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/main.cc
deleted file mode 100644
index 3f7df181b1..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACAC/main.cc
+++ /dev/null
@@ -1,400 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-
-typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    float *B;
-    size_t bytesB;
-    float *C;
-    size_t bytesC;
-    float *D;
-    size_t bytesD;
-    int block_x;
-    int block_y;
-    int grid_x;
-    int grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              float *B, size_t bytesB,
-              float *C, size_t bytesC,
-              float *D, size_t bytesD,
-              int block_x,
-              int block_y,
-              int grid_x,
-              int grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->D = D;
-    args->bytesD = bytesD;
-    args->block_x = block_x;
-    args->block_y = block_y;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
-}
-
-void AllocationMul(int block_x, int block_y) {
-  void* localMem = __visc__malloc(block_x*block_y*sizeof(float));
-
-  __visc__return(localMem, block_x*block_y*sizeof(float));
-}
-
-void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC,
-              float* localMem, size_t bytesLocalMem) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    C[x+y*dimx] = C[x+y*dimx] + A[x+y*dimx] * B[x+y*dimx];
-    __visc__return(bytesA);
-}
-
-void InternalMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC,
-                  int block_x, int block_y ) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-    void* AllocationNode = __visc__createNode(AllocationMul);
-    void* LeafMulNode = __visc__createNode2D(LeafMul, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
-
-    __visc__bindIn(AllocationNode, 6, 0, 0); // Bind block_x
-    __visc__bindIn(AllocationNode, 7, 1, 0); // Bind block_y
-
-    // Edge
-    __visc__edge(AllocationNode, LeafMulNode, 1, 0, 6, 0);
-    __visc__edge(AllocationNode, LeafMulNode, 1, 1, 7, 0);
-
-    // Bind outputs
-    __visc__bindOut(LeafMulNode, 0, 0, 0); // Bind bytesA
-
-}
-
-void AllocationSum(int block_x, int block_y) {
-  void* localMem = __visc__malloc(block_x*block_y*sizeof(float));
-
-  __visc__return(localMem, block_x*block_y*sizeof(float));
-}
-
-void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD,
-              float* localMem, size_t bytesLocalMem) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    D[x+y*dimx] = D[x+y*dimx] + A[x+y*dimx] + B[x+y*dimx];
-    __visc__return(bytesA);
-}
-
-void InternalSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD,
-                  int block_x, int block_y ) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-    void* AllocationNode = __visc__createNode(AllocationSum);
-    void* LeafSumNode = __visc__createNode2D(LeafSum, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafSumNode, 4, 4, 0); // Bind D
-    __visc__bindIn(LeafSumNode, 5, 5, 0); // Bind bytesD
-
-    __visc__bindIn(AllocationNode, 6, 0, 0); // Bind block_x
-    __visc__bindIn(AllocationNode, 7, 1, 0); // Bind block_y
-
-    // Edge
-    __visc__edge(AllocationNode, LeafSumNode, 1, 0, 6, 0);
-    __visc__edge(AllocationNode, LeafSumNode, 1, 1, 7, 0);
-
-    // Bind outputs
-    __visc__bindOut(LeafSumNode, 0, 0, 0); // Bind bytesA
-}
-
-//void LeafDest(size_t bytesC, size_t bytesD) {
-    //__visc__hint(visc::DEVICE);
-    //__visc__attributes(0, 0);
-
-    //__visc__return(bytesC, bytesD);
-//}
-
-// Root node for sgemm - Creates thread block node
-void Root(float *A, size_t bytesA,
-               float *B, size_t bytesB,
-               float *C, size_t bytesC,
-               float *D, size_t bytesD,
-               int block_x,
-               int block_y,
-               int grid_x,
-               int grid_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, A, B, C, D, 2, C, D);
-    void* InternalMulNode = __visc__createNode2D(InternalMul, grid_x, grid_y);
-    void* InternalSumNode = __visc__createNode2D(InternalSum, grid_x, grid_y);
-    //void* LeafDestNode = __visc__createNode(LeafDest);
-
-    // Bind inputs
-    __visc__bindIn(InternalMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(InternalMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(InternalMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(InternalMulNode, 5, 5, 0); // Bind bytesC
-    __visc__bindIn(InternalMulNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalMulNode, 9, 7, 0); // Bind block_y
-
-    // Bind inputs
-    __visc__bindIn(InternalSumNode, 0, 0, 0); // Bind A
-    //__visc__bindIn(InternalSumNode, 1, 1, 0); // Bind bytesA [Pass as edge]
-    __visc__bindIn(InternalSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalSumNode, 6, 4, 0); // Bind D
-    __visc__bindIn(InternalSumNode, 7, 5, 0); // Bind bytesD
-    __visc__bindIn(InternalSumNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalSumNode, 9, 7, 0); // Bind block_y
-
-    // Bind Edges
-    __visc__edge(InternalMulNode, InternalSumNode, 0, 0, 1, 0); // Bind bytesA
-
-    //TODO: bindOut : for now with out attribute
-    __visc__bindOut(InternalSumNode, 0, 0, 0); // bind output bytesA
-}
-
-// Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
-
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
-
-//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-
-    int block_x = 16;
-    int block_y = 16;
-
-    int grid_x = m/block_x;
-    int grid_y = n/block_y;
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             B, bytesB,
-             C, bytesC,
-             D, bytesD,
-             block_x,
-             block_y,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* DFG = __visc__launch(0, Root, (void*) args);
-
-    __visc__wait(DFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
-}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz, D_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
-
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
-    D_sz = matArow*matBcol*sizeof(float);
-
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
-    std::vector<float> matD(matArow*matBcol);
-
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    llvm_visc_track_mem(&matD.front(), D_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
-
-    for(size_t i=0; i<matD.size(); i++)
-        matD[i] = 0.0f;
-
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
-    llvm_visc_request_mem(&matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    llvm_visc_untrack_mem(&matD.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
-
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/Makefile
deleted file mode 100644
index 2d80f39a97..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/io.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/main.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/main.cc
deleted file mode 100644
index 3205e11b41..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelACC/main.cc
+++ /dev/null
@@ -1,378 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-
-typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    float *B;
-    size_t bytesB;
-    float *C;
-    size_t bytesC;
-    float *D;
-    size_t bytesD;
-    int block_x;
-    int block_y;
-    int grid_x;
-    int grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              float *B, size_t bytesB,
-              float *C, size_t bytesC,
-              float *D, size_t bytesD,
-              int block_x,
-              int block_y,
-              int grid_x,
-              int grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->D = D;
-    args->bytesD = bytesD;
-    args->block_x = block_x;
-    args->block_y = block_y;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
-}
-
-void AllocationMul(int block_x, int block_y) {
-  void* localMem = __visc__malloc(block_x*block_y*sizeof(float));
-
-  __visc__return(localMem, block_x*block_y*sizeof(float));
-}
-
-void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC,
-              float* localMem, size_t bytesLocalMem) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    C[x+y*dimx] = C[x+y*dimx] + A[x+y*dimx] * B[x+y*dimx];
-    __visc__return(bytesA);
-}
-
-void InternalMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC,
-                  int block_x, int block_y ) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-    void* AllocationNode = __visc__createNode(AllocationMul);
-    void* LeafMulNode = __visc__createNode2D(LeafMul, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
-
-    __visc__bindIn(AllocationNode, 6, 0, 0); // Bind block_x
-    __visc__bindIn(AllocationNode, 7, 1, 0); // Bind block_y
-
-    // Edge
-    __visc__edge(AllocationNode, LeafMulNode, 1, 0, 6, 0);
-    __visc__edge(AllocationNode, LeafMulNode, 1, 1, 7, 0);
-
-    // Bind outputs
-    __visc__bindOut(LeafMulNode, 0, 0, 0); // Bind bytesA
-
-}
-
-void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    D[x+y*dimx] = D[x+y*dimx] + A[x+y*dimx] + B[x+y*dimx];
-    __visc__return(bytesA);
-}
-
-void InternalSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD,
-                  int block_x, int block_y ) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-    void* LeafSumNode = __visc__createNode2D(LeafSum, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafSumNode, 4, 4, 0); // Bind D
-    __visc__bindIn(LeafSumNode, 5, 5, 0); // Bind bytesD
-
-    // Bind outputs
-    __visc__bindOut(LeafSumNode, 0, 0, 0); // Bind bytesA
-}
-
-// Root node for sgemm - Creates thread block node
-void Root(float *A, size_t bytesA,
-               float *B, size_t bytesB,
-               float *C, size_t bytesC,
-               float *D, size_t bytesD,
-               int block_x,
-               int block_y,
-               int grid_x,
-               int grid_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, A, B, C, D, 2, C, D);
-    void* InternalMulNode = __visc__createNode2D(InternalMul, grid_x, grid_y);
-    void* InternalSumNode = __visc__createNode2D(InternalSum, grid_x, grid_y);
-    //void* LeafDestNode = __visc__createNode(LeafDest);
-
-    // Bind inputs
-    __visc__bindIn(InternalMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(InternalMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(InternalMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(InternalMulNode, 5, 5, 0); // Bind bytesC
-    __visc__bindIn(InternalMulNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalMulNode, 9, 7, 0); // Bind block_y
-
-    // Bind inputs
-    __visc__bindIn(InternalSumNode, 0, 0, 0); // Bind A
-    //__visc__bindIn(InternalSumNode, 1, 1, 0); // Bind bytesA [Pass as edge]
-    __visc__bindIn(InternalSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalSumNode, 6, 4, 0); // Bind D
-    __visc__bindIn(InternalSumNode, 7, 5, 0); // Bind bytesD
-    __visc__bindIn(InternalSumNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalSumNode, 9, 7, 0); // Bind block_y
-
-    // Bind Edges
-    __visc__edge(InternalMulNode, InternalSumNode, 0, 0, 1, 0); // Bind bytesA
-
-    //TODO: bindOut : for now with out attribute
-    __visc__bindOut(InternalSumNode, 0, 0, 0); // bind output bytesA
-}
-
-// Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
-
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
-
-//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-
-    int block_x = 16;
-    int block_y = 16;
-
-    int grid_x = m/block_x;
-    int grid_y = n/block_y;
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             B, bytesB,
-             C, bytesC,
-             D, bytesD,
-             block_x,
-             block_y,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* DFG = __visc__launch(0, Root, (void*) args);
-
-    __visc__wait(DFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
-}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz, D_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
-
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
-    D_sz = matArow*matBcol*sizeof(float);
-
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
-    std::vector<float> matD(matArow*matBcol);
-
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    llvm_visc_track_mem(&matD.front(), D_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
-
-    for(size_t i=0; i<matD.size(); i++)
-        matD[i] = 0.0f;
-
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
-    llvm_visc_request_mem(&matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    llvm_visc_untrack_mem(&matD.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
-
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/Makefile
deleted file mode 100644
index 2d80f39a97..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/io.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/main.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/main.cc
deleted file mode 100644
index 49ca20840f..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCAC/main.cc
+++ /dev/null
@@ -1,378 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-
-typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    float *B;
-    size_t bytesB;
-    float *C;
-    size_t bytesC;
-    float *D;
-    size_t bytesD;
-    int block_x;
-    int block_y;
-    int grid_x;
-    int grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              float *B, size_t bytesB,
-              float *C, size_t bytesC,
-              float *D, size_t bytesD,
-              int block_x,
-              int block_y,
-              int grid_x,
-              int grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->D = D;
-    args->bytesD = bytesD;
-    args->block_x = block_x;
-    args->block_y = block_y;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
-}
-
-void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    C[x+y*dimx] = C[x+y*dimx] + A[x+y*dimx] * B[x+y*dimx];
-    __visc__return(bytesA);
-}
-
-void InternalMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC,
-                  int block_x, int block_y ) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-    void* LeafMulNode = __visc__createNode2D(LeafMul, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
-
-    // Bind outputs
-    __visc__bindOut(LeafMulNode, 0, 0, 0); // Bind bytesA
-
-}
-
-void AllocationSum(int block_x, int block_y) {
-  void* localMem = __visc__malloc(block_x*block_y*sizeof(float));
-
-  __visc__return(localMem, block_x*block_y*sizeof(float));
-}
-
-void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD,
-              float* localMem, size_t bytesLocalMem) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    D[x+y*dimx] = D[x+y*dimx] + A[x+y*dimx] + B[x+y*dimx];
-    __visc__return(bytesA);
-}
-
-void InternalSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD,
-                  int block_x, int block_y ) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-    void* AllocationNode = __visc__createNode(AllocationSum);
-    void* LeafSumNode = __visc__createNode2D(LeafSum, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafSumNode, 4, 4, 0); // Bind D
-    __visc__bindIn(LeafSumNode, 5, 5, 0); // Bind bytesD
-
-    __visc__bindIn(AllocationNode, 6, 0, 0); // Bind block_x
-    __visc__bindIn(AllocationNode, 7, 1, 0); // Bind block_y
-
-    // Edge
-    __visc__edge(AllocationNode, LeafSumNode, 1, 0, 6, 0);
-    __visc__edge(AllocationNode, LeafSumNode, 1, 1, 7, 0);
-
-    // Bind outputs
-    __visc__bindOut(LeafSumNode, 0, 0, 0); // Bind bytesA
-}
-
-// Root node for sgemm - Creates thread block node
-void Root(float *A, size_t bytesA,
-               float *B, size_t bytesB,
-               float *C, size_t bytesC,
-               float *D, size_t bytesD,
-               int block_x,
-               int block_y,
-               int grid_x,
-               int grid_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, A, B, C, D, 2, C, D);
-    void* InternalMulNode = __visc__createNode2D(InternalMul, grid_x, grid_y);
-    void* InternalSumNode = __visc__createNode2D(InternalSum, grid_x, grid_y);
-    //void* LeafDestNode = __visc__createNode(LeafDest);
-
-    // Bind inputs
-    __visc__bindIn(InternalMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(InternalMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(InternalMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(InternalMulNode, 5, 5, 0); // Bind bytesC
-    __visc__bindIn(InternalMulNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalMulNode, 9, 7, 0); // Bind block_y
-
-    // Bind inputs
-    __visc__bindIn(InternalSumNode, 0, 0, 0); // Bind A
-    //__visc__bindIn(InternalSumNode, 1, 1, 0); // Bind bytesA [Pass as edge]
-    __visc__bindIn(InternalSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalSumNode, 6, 4, 0); // Bind D
-    __visc__bindIn(InternalSumNode, 7, 5, 0); // Bind bytesD
-    __visc__bindIn(InternalSumNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalSumNode, 9, 7, 0); // Bind block_y
-
-    // Bind Edges
-    __visc__edge(InternalMulNode, InternalSumNode, 0, 0, 1, 0); // Bind bytesA
-
-    //TODO: bindOut : for now with out attribute
-    __visc__bindOut(InternalSumNode, 0, 0, 0); // bind output bytesA
-}
-
-// Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
-
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
-
-//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-
-    int block_x = 16;
-    int block_y = 16;
-
-    int grid_x = m/block_x;
-    int grid_y = n/block_y;
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             B, bytesB,
-             C, bytesC,
-             D, bytesD,
-             block_x,
-             block_y,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* DFG = __visc__launch(0, Root, (void*) args);
-
-    __visc__wait(DFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
-}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz, D_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
-
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
-    D_sz = matArow*matBcol*sizeof(float);
-
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
-    std::vector<float> matD(matArow*matBcol);
-
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    llvm_visc_track_mem(&matD.front(), D_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
-
-    for(size_t i=0; i<matD.size(); i++)
-        matD[i] = 0.0f;
-
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
-    llvm_visc_request_mem(&matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    llvm_visc_untrack_mem(&matD.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
-
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/Makefile
deleted file mode 100644
index 2d80f39a97..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/io.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/main.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/main.cc
deleted file mode 100644
index ff9ab76212..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelCC/main.cc
+++ /dev/null
@@ -1,381 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-extern"C" {
-typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    float *B;
-    size_t bytesB;
-    float *C;
-    size_t bytesC;
-    float *D;
-    size_t bytesD;
-    int block_x;
-    int block_y;
-    int grid_x;
-    int grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              float *B, size_t bytesB,
-              float *C, size_t bytesC,
-              float *D, size_t bytesD,
-              int block_x,
-              int block_y,
-              int grid_x,
-              int grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->D = D;
-    args->bytesD = bytesD;
-    args->block_x = block_x;
-    args->block_y = block_y;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
-}
-
-void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    C[x+y*dimx] = C[x+y*dimx] + A[x+y*dimx] * B[x+y*dimx];
-    __visc__return(dimx);
-}
-
-void InternalMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC,
-                  int block_x, int block_y ) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-    void* LeafMulNode = __visc__createNode2D(LeafMul, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
-
-    // Bind outputs
-    __visc__bindOut(LeafMulNode, 0, 0, 0); // Bind dimx
-
-}
-
-void LeafSum( float* A, size_t bytesA,
-              float* B, size_t bytesB,
-              float* D, size_t bytesD,
-              int dummy)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    D[x+y*dimx] = D[x+y*dimx] + A[x+y*dimx] + B[x+y*dimx];
-    __visc__return(dummy);
-}
-
-void InternalSum( float* A, size_t bytesA,
-                  float* B, size_t bytesB,
-                  float* D, size_t bytesD,
-                  int block_x, int block_y,
-                  int dummy) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-    void* LeafSumNode = __visc__createNode2D(LeafSum, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafSumNode, 4, 4, 0); // Bind D
-    __visc__bindIn(LeafSumNode, 5, 5, 0); // Bind bytesD
-    __visc__bindIn(LeafSumNode, 8, 6, 0); // Bind dummy
-
-    // Bind outputs
-    __visc__bindOut(LeafSumNode, 0, 0, 0); // Bind dummy
-}
-
-//void LeafDest(size_t bytesC, size_t bytesD) {
-    //__visc__hint(visc::DEVICE);
-    //__visc__attributes(0, 0);
-
-    //__visc__return(bytesC, bytesD);
-//}
-
-// Root node for sgemm - Creates thread block node
-void Root(float *A, size_t bytesA,
-               float *B, size_t bytesB,
-               float *C, size_t bytesC,
-               float *D, size_t bytesD,
-               int block_x,
-               int block_y,
-               int grid_x,
-               int grid_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, A, B, C, D, 2, C, D);
-    void* InternalMulNode = __visc__createNode2D(InternalMul, grid_x, grid_y);
-    void* InternalSumNode = __visc__createNode2D(InternalSum, grid_x, grid_y);
-    //void* LeafDestNode = __visc__createNode(LeafDest);
-
-    // Bind inputs
-    __visc__bindIn(InternalMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(InternalMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(InternalMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(InternalMulNode, 5, 5, 0); // Bind bytesC
-    __visc__bindIn(InternalMulNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalMulNode, 9, 7, 0); // Bind block_y
-
-    // Bind inputs
-    __visc__bindIn(InternalSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(InternalSumNode, 1, 1, 0); // Bind bytesA [Pass as edge]
-    __visc__bindIn(InternalSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalSumNode, 6, 4, 0); // Bind D
-    __visc__bindIn(InternalSumNode, 7, 5, 0); // Bind bytesD
-    __visc__bindIn(InternalSumNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalSumNode, 9, 7, 0); // Bind block_y
-
-    // Bind Edges
-    __visc__edge(InternalMulNode, InternalSumNode, 0, 0, 8, 0); // Bind dummy
-
-    //TODO: bindOut : for now with out attribute
-    __visc__bindOut(InternalSumNode, 0, 0, 0); // bind output bytesA
-}
-
-} // extern C
-
-// Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
-
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
-
-//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-
-    int block_x = 16;
-    int block_y = 16;
-
-    int grid_x = m/block_x;
-    int grid_y = n/block_y;
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             B, bytesB,
-             C, bytesC,
-             D, bytesD,
-             block_x,
-             block_y,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* DFG = __visc__launch(0, Root, (void*) args);
-
-    __visc__wait(DFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
-}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz, D_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
-
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
-    D_sz = matArow*matBcol*sizeof(float);
-
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
-    std::vector<float> matD(matArow*matBcol);
-
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    llvm_visc_track_mem(&matD.front(), D_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
-
-    for(size_t i=0; i<matD.size(); i++)
-        matD[i] = 0.0f;
-
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
-    llvm_visc_request_mem(&matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    llvm_visc_untrack_mem(&matD.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
-
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/Makefile b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/Makefile
deleted file mode 100644
index 2d80f39a97..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/io.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/main.cc b/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/main.cc
deleted file mode 100644
index ce1fda4cf8..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/src/2LevelICC/main.cc
+++ /dev/null
@@ -1,373 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <parboil.h>
-#include <visc.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-
-typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    float *B;
-    size_t bytesB;
-    float *C;
-    size_t bytesC;
-    float *D;
-    size_t bytesD;
-    int block_x;
-    int block_y;
-    int grid_x;
-    int grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              float *B, size_t bytesB,
-              float *C, size_t bytesC,
-              float *D, size_t bytesD,
-              int block_x,
-              int block_y,
-              int grid_x,
-              int grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->D = D;
-    args->bytesD = bytesD;
-    args->block_x = block_x;
-    args->block_y = block_y;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
-}
-
-void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    C[x+y*dimx] = C[x+y*dimx] + A[x+y*dimx] * B[x+y*dimx];
-    __visc__return(bytesA);
-}
-
-void InternalMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC,
-                  int block_x, int block_y ) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, C, 1, C);
-    void* LeafMulNode = __visc__createNode2D(LeafMul, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
-
-    // Bind outputs
-    __visc__bindOut(LeafMulNode, 0, 0, 0); // Bind bytesA
-
-}
-
-void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD)
-{
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
-    int gridx = __visc__getNumNodeInstances_x(parentNode);
-    int gridy = __visc__getNumNodeInstances_y(parentNode);
-    //int dimy = __visc__getNumNodeInstances_y(thisNode);
-
-    int x = gx*gridx+lx;
-    int y = gy*gridy+ly;
-    int dimx = blockDimx*gridx;
-
-    D[x+y*dimx] = D[x+y*dimx] + A[x+y*dimx] + B[x+y*dimx];
-    __visc__return(bytesA);
-}
-
-void InternalSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD,
-                  int block_x, int block_y) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    // TODO: shB is not an in or out attribute
-    __visc__attributes(3, A, B, D, 1, D);
-    void* LeafSumNode = __visc__createNode2D(LeafSum, block_x, block_y);
-
-    // Bind inputs
-    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(LeafSumNode, 4, 4, 0); // Bind D
-    __visc__bindIn(LeafSumNode, 5, 5, 0); // Bind bytesD
-
-    // Bind outputs
-    __visc__bindOut(LeafSumNode, 0, 0, 0); // Bind bytesA
-}
-
-//void LeafDest(size_t bytesC, size_t bytesD) {
-    //__visc__hint(visc::DEVICE);
-    //__visc__attributes(0, 0);
-
-    //__visc__return(bytesC, bytesD);
-//}
-
-// Root node for sgemm - Creates thread block node
-void Root(float *A, size_t bytesA,
-               float *B, size_t bytesB,
-               float *C, size_t bytesC,
-               float *D, size_t bytesD,
-               int block_x,
-               int block_y,
-               int grid_x,
-               int grid_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, A, B, C, D, 2, C, D);
-    void* InternalMulNode = __visc__createNode2D(InternalMul, grid_x, grid_y);
-    void* InternalSumNode = __visc__createNode2D(InternalSum, grid_x, grid_y);
-    //void* LeafDestNode = __visc__createNode(LeafDest);
-
-    // Bind inputs
-    __visc__bindIn(InternalMulNode, 0, 0, 0); // Bind A
-    __visc__bindIn(InternalMulNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(InternalMulNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalMulNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalMulNode, 4, 4, 0); // Bind C
-    __visc__bindIn(InternalMulNode, 5, 5, 0); // Bind bytesC
-    __visc__bindIn(InternalMulNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalMulNode, 9, 7, 0); // Bind block_y
-
-    // Bind inputs
-    __visc__bindIn(InternalSumNode, 0, 0, 0); // Bind A
-    __visc__bindIn(InternalSumNode, 1, 1, 0); // Bind bytesA [Pass as edge]
-    __visc__bindIn(InternalSumNode, 2, 2, 0); // Bind B
-    __visc__bindIn(InternalSumNode, 3, 3, 0); // Bind bytesB
-    __visc__bindIn(InternalSumNode, 6, 4, 0); // Bind D
-    __visc__bindIn(InternalSumNode, 7, 5, 0); // Bind bytesD
-    __visc__bindIn(InternalSumNode, 8, 6, 0); // Bind block_x
-    __visc__bindIn(InternalSumNode, 9, 7, 0); // Bind block_y
-
-    // Bind Edges
-    //__visc__edge(InternalMulNode, InternalSumNode, 0, 0, 1, 0); // Bind bytesA
-
-    //TODO: bindOut : for now with out attribute
-    __visc__bindOut(InternalMulNode, 0, 0, 0); // bind output bytesA
-    __visc__bindOut(InternalSumNode, 0, 1, 0); // bind output bytesA
-}
-
-// Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
-
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
-
-//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-
-    int block_x = 16;
-    int block_y = 16;
-
-    int grid_x = m/block_x;
-    int grid_y = n/block_y;
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             B, bytesB,
-             C, bytesC,
-             D, bytesD,
-             block_x,
-             block_y,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* DFG = __visc__launch(0, Root, (void*) args);
-
-    __visc__wait(DFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
-}
-
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz, D_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
-
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
-    D_sz = matArow*matBcol*sizeof(float);
-
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
-    std::vector<float> matD(matArow*matBcol);
-
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    llvm_visc_track_mem(&matD.front(), D_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
-
-    for(size_t i=0; i<matD.size(); i++)
-        matD[i] = 0.0f;
-
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
-    llvm_visc_request_mem(&matD.front(), D_sz);
-
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    llvm_visc_untrack_mem(&matD.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-
-    if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
-
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
-
-    return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/merge-tests/tools/compare-output b/hpvm/test/parboil/benchmarks/merge-tests/tools/compare-output
deleted file mode 100755
index a951db7684..0000000000
--- a/hpvm/test/parboil/benchmarks/merge-tests/tools/compare-output
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python
-
-# (c) Copyright 2010 The Board of Trustees of the University of Illinois.
-
-import sys
-sys.path.insert(0, '../../common/python')
-
-import itertools
-
-import filecompare as fc
-import textfilecompare as tfc
-
-def compare_floats(ref_list, cmp_list):
-
-	# Lists should be the same length
-	if len(ref_list) != len(cmp_list):
-	  print "Different in length"
-	  print "ref=" + str(len(ref_list)) +" "+ str(ref_list[-2])
-	  print "cmp=" + str(len(cmp_list)) +" "+ str(cmp_list[-1])
-	  return False
-
-	return True
-
-	# Numbers should be equal with a tolerance of 1%
-	# or 0.01, whichever is greater.
-	for (r, c) in zip(ref_list, cmp_list):
-		diff = abs(r - c)
-		if not (diff < 0.01 or diff < 0.01 * abs(r)):
-			# Floats mismatch
-			return False
-
-	# All numbers are within tolerance
-	return True
-
-err = "Computed values do not match the expected values\n"
-
-comparison = fc.Then(
-	fc.Compare(tfc.floats, equal=compare_floats, message=err),
-	fc.Compare(tfc.eof)
-	)
-
-fc.default_main(comparison)
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/DESCRIPTION b/hpvm/test/parboil/benchmarks/mri-gridding/DESCRIPTION
deleted file mode 100644
index 74fa00dfd7..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/DESCRIPTION
+++ /dev/null
@@ -1,4 +0,0 @@
-MRI-Gridding maps a non-uniform input data in 3-D space onto a regular 3-D grid of the same space by computing the contribution of every data point onto its neighboring grid points. The contributions are computed using the Kaiser-Bessel function on the distance between the input-output point pair.
-
-Original code written by Nady Obeid <nady.obeid@gmail.com>
-Maintain by Daniel Liu <gengliu2@illinois.edu>
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/CPU_kernels.c b/hpvm/test/parboil/benchmarks/mri-gridding/src/base/CPU_kernels.c
deleted file mode 100644
index b81d24a268..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/CPU_kernels.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "UDTypes.h"
-
-#define max(x,y) ((x<y)?y:x)
-#define min(x,y) ((x>y)?y:x)
-
-#define PI 3.14159265359
-
-float kernel_value_CPU(float v){
-
-  float rValue = 0;
-
-  const float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-  (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-   0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-   0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-   0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-   0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-   0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-   0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-   0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = -num/den;
-
-  return rValue;
-}
-
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT){
-  float v;
-  float cutoff2 = (width*width)/4.0;
-
-  unsigned int size;
-
-  if(width > 0){
-    // compute size of LUT based on kernel width
-    size = (unsigned int)(10000*width);
-
-    // allocate memory
-    (*LUT) = (float*) malloc (size*sizeof(float));
-
-    unsigned int k;
-    for(k=0; k<size; ++k){
-      // compute value to evaluate kernel at
-      // v in the range 0:(_width/2)^2
-      v = (((float)k)/((float)size))*cutoff2;
-
-      // compute kernel value and store
-      (*LUT)[k] = kernel_value_CPU(beta*sqrt(1.0-(v/cutoff2)));
-    }
-    (*sizeLUT) = size;
-  }
-}
-
-float kernel_value_LUT(float v, float* LUT, int sizeLUT, float _1overCutoff2)
-{
-  unsigned int k0;
-  float v0;
-
-  v *= (float)sizeLUT;
-  k0=(unsigned int)(v*_1overCutoff2);
-  v0 = ((float)k0)/_1overCutoff2;
-  return  LUT[k0] + ((v-v0)*(LUT[k0+1]-LUT[k0])/_1overCutoff2);
-}
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity){
-
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  float cutoff = ((float)(params.kernelWidth))/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                    // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;                  // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int i;
-  for (i=0; i < n; i++){
-    ReconstructionSample pt = sample[i];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0.0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0.0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0.0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* kernel weighting value */
-                  if (params.useLUT){
-        		    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-		          } else {
-		            w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-		          }
-
-                  /* grid data */
-                  gridData[idx].real += (w*pt.real);
-                  gridData[idx].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-                  sampleDensity[idx] += 1.0;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/CPU_kernels.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/base/CPU_kernels.h
deleted file mode 100644
index 42d40a0373..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/CPU_kernels.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "UDTypes.h"
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT);
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity);
-
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/Makefile b/hpvm/test/parboil/benchmarks/mri-gridding/src/base/Makefile
deleted file mode 100644
index 953147064f..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=c
-SRCDIR_OBJS=CPU_kernels.o main.o
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/UDTypes.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/base/UDTypes.h
deleted file mode 100644
index 687fb50157..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/UDTypes.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef _UDTYPES_H__
-#define _UDTYPES_H__
-
-typedef struct{
-  int numSamples;
-  int aquisitionMatrixSize[3];
-  int reconstructionMatrixSize[3];
-  float kMax[3];
-  int gridSize[3];
-  float oversample;
-  float kernelWidth;
-  int binsize;
-  int useLUT;
-}parameters;
-
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-typedef struct{
-  float real;
-  float imag;
-} cmplx;
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/main.c b/hpvm/test/parboil/benchmarks/mri-gridding/src/base/main.c
deleted file mode 100644
index 12447e56fe..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/base/main.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <parboil.h>
-
-#include "UDTypes.h"
-#include "CPU_kernels.h"
-
-#define PI 3.14159265
-
-/************************************************************ 
- * This function reads the parameters from the file provided
- * as a comman line argument.
- ************************************************************/
-void setParameters(FILE* file, parameters* p){
-  fscanf(file,"aquisition.numsamples=%d\n",&(p->numSamples));
-  fscanf(file,"aquisition.kmax=%f %f %f\n",&(p->kMax[0]), &(p->kMax[1]), &(p->kMax[2]));
-  fscanf(file,"aquisition.matrixSize=%d %d %d\n", &(p->aquisitionMatrixSize[0]), &(p->aquisitionMatrixSize[1]), &(p->aquisitionMatrixSize[2]));
-  fscanf(file,"reconstruction.matrixSize=%d %d %d\n", &(p->reconstructionMatrixSize[0]), &(p->reconstructionMatrixSize[1]), &(p->reconstructionMatrixSize[2]));
-  fscanf(file,"gridding.matrixSize=%d %d %d\n", &(p->gridSize[0]), &(p->gridSize[1]), &(p->gridSize[2]));
-  fscanf(file,"gridding.oversampling=%f\n", &(p->oversample));
-  fscanf(file,"kernel.width=%f\n", &(p->kernelWidth));
-  fscanf(file,"kernel.useLUT=%d\n", &(p->useLUT));
-
-  printf("  Number of samples = %d\n", p->numSamples);
-  printf("  Grid Size = %dx%dx%d\n", p->gridSize[0], p->gridSize[1], p->gridSize[2]);
-  printf("  Input Matrix Size = %dx%dx%d\n", p->aquisitionMatrixSize[0], p->aquisitionMatrixSize[1], p->aquisitionMatrixSize[2]);
-  printf("  Recon Matrix Size = %dx%dx%d\n", p->reconstructionMatrixSize[0], p->reconstructionMatrixSize[1], p->reconstructionMatrixSize[2]);
-  printf("  Kernel Width = %f\n", p->kernelWidth);
-  printf("  KMax = %.2f %.2f %.2f\n", p->kMax[0], p->kMax[1], p->kMax[2]);
-  printf("  Oversampling = %f\n", p->oversample);
-  printf("  GPU Binsize = %d\n", p->binsize);
-  printf("  Use LUT = %s\n", (p->useLUT)?"Yes":"No");
-}
-
-/************************************************************ 
- * This function reads the sample point data from the kspace
- * and klocation files (and sdc file if provided) into the
- * sample array.
- * Returns the number of samples read successfully.
- ************************************************************/
-unsigned int readSampleData(parameters params, FILE* uksdata_f, ReconstructionSample* samples){
-  unsigned int i;
-
-  for(i=0; i<params.numSamples; i++){
-    if (feof(uksdata_f)){
-      break;
-    }
-    fread((void*) &(samples[i]), sizeof(ReconstructionSample), 1, uksdata_f);
-  }
-
-  float kScale[3];
-  kScale[0] = (float)(params.aquisitionMatrixSize[0])/((float)(params.reconstructionMatrixSize[0])*(float)(params.kMax[0]));
-  kScale[1] = (float)(params.aquisitionMatrixSize[1])/((float)(params.reconstructionMatrixSize[1])*(float)(params.kMax[1]));
-  kScale[2] = (float)(params.aquisitionMatrixSize[2])/((float)(params.reconstructionMatrixSize[2])*(float)(params.kMax[2]));
-
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-
-  float ax = (kScale[0]*(size_x-1))/2.0;
-  float bx = (float)(size_x-1)/2.0;
-
-  float ay = (kScale[1]*(size_y-1))/2.0;
-  float by = (float)(size_y-1)/2.0;
-
-  float az = (kScale[2]*(size_z-1))/2.0;
-  float bz = (float)(size_z-1)/2.0;
-
-  int n;
-  for(n=0; n<i; n++){
-    samples[n].kX = floor((samples[n].kX*ax)+bx);
-    samples[n].kY = floor((samples[n].kY*ay)+by);
-    samples[n].kZ = floor((samples[n].kZ*az)+bz);
-  }
-
-  return i;
-}
-
-
-int main (int argc, char* argv[]){
-  struct pb_Parameters* prms;
-  struct pb_TimerSet timers;
-
-  prms = pb_ReadParameters(&argc,argv);
-  pb_InitializeTimerSet(&timers);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  char uksdata[250];
-  parameters params;
-
-  FILE* uksfile_f = NULL;
-  FILE* uksdata_f = NULL;
-
-  strcpy(uksdata,prms->inpFiles[0]);
-  strcat(uksdata,".data");
-
-  uksfile_f = fopen(prms->inpFiles[0],"r");
-  if (uksfile_f == NULL){
-    printf("ERROR: Could not open %s\n",prms->inpFiles[0]);
-    exit(1);
-  }
-
-  printf("\nReading parameters\n");
-
-  if (argc >= 2){
-    params.binsize = atoi(argv[1]);
-  } else { //default binsize value;
-    params.binsize = 128;
-  }
-
-  setParameters(uksfile_f, &params);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  ReconstructionSample* samples = (ReconstructionSample*) malloc (params.numSamples*sizeof(ReconstructionSample)); //Input Data
-  float* LUT; //use look-up table for faster execution on CPU (intermediate data)
-  unsigned int sizeLUT; //set in the function calculateLUT (intermediate data)
-
-  int gridNumElems = params.gridSize[0]*params.gridSize[1]*params.gridSize[2];
-
-  cmplx* gridData = (cmplx*) calloc (gridNumElems, sizeof(cmplx)); //Output Data
-  float* sampleDensity = (float*) calloc (gridNumElems, sizeof(float)); //Output Data
-
-  if (samples == NULL){
-    printf("ERROR: Unable to allocate memory for input data\n");
-    exit(1);
-  }
-
-  if (sampleDensity == NULL || gridData == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  uksdata_f = fopen(uksdata,"rb");
-
-  if(uksdata_f == NULL){
-    printf("ERROR: Could not open data file\n");
-    exit(1);
-  }
-
-  printf("Reading input data from files\n");
-
-  unsigned int n = readSampleData(params, uksdata_f, samples);
-  fclose(uksdata_f);
-
-  if (params.useLUT){
-    printf("Generating Look-Up Table\n");
-    float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-    calculateLUT(beta, params.kernelWidth, &LUT, &sizeLUT);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  gridding_Gold(n, params, samples, LUT, sizeLUT, gridData, sampleDensity);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  int passed=1;
-
-  FILE* outfile;
-  if(!(outfile=fopen(prms->outFile,"w")))
-  {
-        printf("Cannot open output file!\n");
-  } else {
-        fwrite(&passed,sizeof(int),1,outfile);
-        fclose(outfile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  if (params.useLUT){
-    free(LUT);
-  }
-  free(samples);
-  free(gridData);
-  free(sampleDensity);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(prms);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CPU_kernels.c b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CPU_kernels.c
deleted file mode 100644
index 43614d9a5f..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CPU_kernels.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "UDTypes.h"
-
-#define max(x,y) ((x<y)?y:x)
-#define min(x,y) ((x>y)?y:x)
-
-#define PI 3.14159265359
-
-float kernel_value_CPU(float v){
-
-  float rValue = 0;
-
-  const float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-  (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-   0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-   0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-   0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-   0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-   0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-   0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-   0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = -num/den;
-
-  return rValue;
-}
-
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT){
-  float v;
-  float cutoff2 = (width*width)/4.0;
-
-  unsigned int size;
-
-  if(width > 0){
-    // compute size of LUT based on kernel width
-    size = (unsigned int)(10000*width);
-
-    // allocate memory
-    (*LUT) = (float*) malloc (size*sizeof(float));
-
-    unsigned int k;
-    for(k=0; k<size; ++k){
-      // compute value to evaluate kernel at
-      // v in the range 0:(_width/2)^2
-      v = (((float)k)/((float)size))*cutoff2;
-
-      // compute kernel value and store
-      (*LUT)[k] = kernel_value_CPU(beta*sqrt(1.0-(v/cutoff2)));
-    }
-    (*sizeLUT) = size;
-  }
-}
-
-float kernel_value_LUT(float v, float* LUT, int sizeLUT, float _1overCutoff2)
-{
-  unsigned int k0;
-  float v0;
-
-  v *= (float)sizeLUT;
-  k0=(unsigned int)(v*_1overCutoff2);
-  v0 = ((float)k0)/_1overCutoff2;
-  return  LUT[k0] + ((v-v0)*(LUT[k0+1]-LUT[k0])/_1overCutoff2);
-}
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity){
-
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  float cutoff = (float)(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int i;
-  for (i=0; i < n; i++){
-    ReconstructionSample pt = sample[i];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0.0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0.0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0.0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* kernel weighting value */
-                  if (params.useLUT){
-        		    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-		          } else {
-		            w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-		          }
-
-                  /* grid data */
-                  gridData[idx].real += (w*pt.real);
-                  gridData[idx].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-                  sampleDensity[idx] += 1.0;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CPU_kernels.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CPU_kernels.h
deleted file mode 100644
index 1d883f00f7..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CPU_kernels.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "UDTypes.h"
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT);
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity);
-
-int gridding_CPU(unsigned int n, parameters params, ReconstructionSample* sample, int* CPUbin, int CPUbin_size,
-                 float* LUT, int sizeLUT, cmplx* gridData[], float* sampleDensity[], int* indeces[]);
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CUDA_interface.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CUDA_interface.cu
deleted file mode 100644
index fec5270f58..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CUDA_interface.cu
+++ /dev/null
@@ -1,268 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <cuda_runtime.h>
-#include <cuda_runtime_api.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "scanLargeArray.h"
-#include "GPU_kernels.cu"
-#include "CPU_kernels.h"
-
-#define USE_CUDPP 0
-#if USE_CUDPP
-#include "cudpp.h"
-#else
-#include "sort.h"
-#include "scanLargeArray.h"
-#endif
-
-#define BLOCKSIZE 512
-#define PI 3.14159265359
-#define CUERR \
-  do { \
-    cudaError_t err; \
-    if ((err = cudaGetLastError()) != cudaSuccess) { \
-      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-      return; \
-    } \
-  } while (0)
-
-/***********************************************************************
- * CUDA_interface is the main function for GPU execution. This
- * implementation uses compact binning to distribute input elements
- * into unit-cubed sized bins. The bins are then visited by GPU
- * threads, where every thread computes the value of one (or small set)
- * of output elements by computing the contributions of elements in 
- * neighboring bins to these output elements.
- *
- * The bins have a limited bin size and everything beyond that bin size
- * is offloaded to the CPU to be computed in parallel with the GPU
- * gridding.
- ***********************************************************************/
-void CUDA_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-){
-
-  /* Initializing all variables */
-  dim3 dims (8,4,2); //size of a gridding block on the GPU
-
-  /* x, y, z dimensions of the output grid (gridData) */
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-  int size_xy = size_y*size_x;
-
-  int gridNumElems = size_x * size_y * size_z;  // Total number of grid points
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  float cutoff = float(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  /* Declarations of device data structures */
-  ReconstructionSample* sample_d = NULL;    // Device array for original input array
-  ReconstructionSample* sortedSample_d = NULL;             // Device array of the sorted (into bins) input elements.
-                                            // This array is accessed by sortedSampleSoA_d in a structure
-                                            //   of arrays manner.
-  float2* gridData_d = NULL;                // Device array for output grid
-  float* sampleDensity_d = NULL;            // Device array for output sample density
-  unsigned int* idxKey_d = NULL;            // Array of bin indeces generated in the binning kernel
-                                            //   and used to sort the input elements into their
-                                            //   corresponding bins
-  unsigned int* idxValue_d = NULL;          // This array holds the indices of input elements in the
-                                            //   the original array. This array is sorted using the
-                                            //   the idxKey_d array, and once sorted, it is used in
-                                            //   the reorder kernel to move the actual elements into
-                                            //   their corresponding bins.
-  unsigned int* binCount_d = NULL;          // Zero-initialized array which counts the number of elements
-                                            //   put in each bin. Based on this array, we determine which
-                                            //   elements get offloaded to the CPU
-  unsigned int* binStartAddr_d = NULL;      // Array of start offset of each of the compact bins
-
-  /* Allocating device memory */
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  cudaMalloc((void**)&sortedSample_d, n*sizeof(ReconstructionSample));
-  cudaMalloc((void**)&binStartAddr_d, (gridNumElems+1)*sizeof(unsigned int));
-  cudaMalloc((void**)&sample_d, n*sizeof(ReconstructionSample));
-  cudaMalloc((void**)&idxKey_d, (((n+3)/4)*4)*sizeof(unsigned int));   //Pad to nearest multiple of 4 to 
-  cudaMalloc((void**)&idxValue_d, (((n+3)/4)*4)*sizeof(unsigned int)); //satisfy a property of the sorting kernel.
-
-/*The CUDPP library features highly optimizes implementations for radix sort
-  and prefix sum. However for portability reasons, we implemented our own,
-  slightly less optimized versions of these operations. When performing
-  prefix sum using CUDPP, the output array has to be different from the input
-  array, which is why we would allocate an array for binCount_d. For our
-  implementation, we allow the input and output arrays to be the same,
-  therefore we reuse the binCount_d array to get the starting offset of each
-  bin. */
-#if USE_CUDPP
-  cudaMalloc((void**)&binCount_d, (gridNumElems+1)*sizeof(unsigned int));
-#else
-  binCount_d = binStartAddr_d;
-#endif
-  CUERR;
-
-  /* Transfering data from Host to Device */
-  cudaMemcpyToSymbol(cutoff2_c, &cutoff2, sizeof(float), 0);
-  cudaMemcpyToSymbol(cutoff_c, &cutoff, sizeof(float), 0);
-  cudaMemcpyToSymbol(gridSize_c, params.gridSize, 3*sizeof(int), 0);
-  cudaMemcpyToSymbol(size_xy_c, &size_xy, sizeof(int), 0);
-  cudaMemcpyToSymbol(_1overCutoff2_c, &_1overCutoff2, sizeof(float), 0);
-  cudaMemcpy(sample_d, sample, n*sizeof(ReconstructionSample), cudaMemcpyHostToDevice);
-  cudaMemset(binCount_d, 0, (gridNumElems+1)*sizeof(unsigned int));
-
-  // Initialize padding to max integer value, so that when sorted,
-  // these elements get pushed to the end of the array.
-  cudaMemset(idxKey_d+n, 0xFF, (((n+3)&~(3))-n)*sizeof(unsigned int));
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 1: Perform binning. This kernel determines which output bin each input element
-   * goes into. Any excess (beyond binsize) is put in the CPU bin
-   */
-  dim3 block1 (BLOCKSIZE);
-  dim3 grid1 ((n+BLOCKSIZE-1)/BLOCKSIZE);
-
-  binning_kernel<<<grid1, block1>>>(n, sample_d, idxKey_d, idxValue_d, binCount_d, params.binsize, gridNumElems);
-
-  /* STEP 2: Sort the index-value pair generate in the binning kernel */
-#if USE_CUDPP
-  CUDPPConfiguration config;
-  config.datatype = CUDPP_UINT;
-  config.algorithm = CUDPP_SORT_RADIX;
-  config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
-
-  CUDPPHandle sortplan = 0;
-  CUDPPResult result = cudppPlan(&sortplan, config, n, 1, 0);
-
-  int precision = 0;
-  int numElems = gridNumElems;
-  while (numElems > 0){
-    numElems >>= 1;
-    precision++;
-  }
-
-  cudppSort(sortplan, idxKey_d, idxValue_d, int(precision), n);
-  result = cudppDestroyPlan(sortplan);
-#else
-  sort(n, gridNumElems+1, idxKey_d, idxValue_d);
-#endif
-
-  /* STEP 3: Reorder the input data, based on the sorted values from Step 2.
-   * this step also involves changing the data from array of structs to a struct
-   * of arrays. Also in this kernel, we populate an array with the starting index
-   * of every output bin features in the input array, based on the sorted indices 
-   * from Step 2.
-   * At the end of this step, we copy the start address and list of input elements
-   * that will be computed on the CPU.
-   */
-  reorder_kernel<<<grid1,block1>>>(n, idxValue_d, sample_d, sortedSample_d);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  cudaFree(idxValue_d);
-  cudaFree(idxKey_d);
-  cudaFree(sample_d);
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 4: In this step we generate the ADD scan of the array of starting indices
-   * of the output bins. The result is an array that contains the starting address of
-   * every output bin.
-   */
-#if USE_CUDPP
-  config.datatype = CUDPP_UINT;
-  config.algorithm = CUDPP_SCAN;
-  config.options = CUDPP_OPTION_EXCLUSIVE;
-  config.op=CUDPP_ADD;
-
-  CUDPPHandle scanplan = 0;
-  result = cudppPlan(&scanplan, config, gridNumElems+1, 1, 0);
-
-  cudppScan(scanplan, binCount_d, binStartAddr_d, gridNumElems+1);
-  result = cudppDestroyPlan(scanplan);
-#else
-  scanLargeArray(gridNumElems+1, binCount_d);
-#endif
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  // Copy back to the CPU the indices of the input elements that will be processed on the CPU
-  int cpuStart;
-  cudaMemcpy(&cpuStart, binCount_d+gridNumElems, sizeof(unsigned int), cudaMemcpyDeviceToHost);
-
-  int CPUbin_size = int(n)-int(cpuStart);
-
-  ReconstructionSample* CPUbin;
-  cudaMallocHost((void**)&CPUbin,CPUbin_size*sizeof(ReconstructionSample));
-  cudaMemcpy(CPUbin, sortedSample_d+cpuStart, CPUbin_size*sizeof(ReconstructionSample), cudaMemcpyDeviceToHost);
-
-#if USE_CUDPP
-  cudaFree(binCount_d);
-#endif
-
-  /* STEP 5: Perform the binning on the GPU. The results are computed in a gather fashion
-   * where each thread computes the value of one output element by reading the relevant
-   * bins.
-   */
-  cudaMalloc((void**)&gridData_d, gridNumElems*sizeof(float2));
-  cudaMalloc((void**)&sampleDensity_d, gridNumElems*sizeof(float));
-  CUERR;
-
-  cudaMemset(gridData_d, 0, gridNumElems*sizeof(float2));
-  cudaMemset(sampleDensity_d, 0, gridNumElems*sizeof(float));
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  dim3 block2 (dims.x,dims.y,dims.z);
-  dim3 grid2 (size_x/dims.x, (size_y*size_z)/(dims.y*dims.z));
-
-  gridding_GPU<<<grid2, block2>>>(sortedSample_d, binStartAddr_d, gridData_d, sampleDensity_d, beta);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  /* Copying the results from the Device to the Host */
-  cudaMemcpy(sampleDensity, sampleDensity_d, gridNumElems*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(gridData, gridData_d, gridNumElems*sizeof(float2),cudaMemcpyDeviceToHost);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-
-  /* STEP 6: Computing the contributions of the sample points handled by the Host
-   * and adding those to the GPU results.
-   */
-  gridding_Gold(CPUbin_size, params, CPUbin, LUT, sizeLUT, gridData, sampleDensity);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  cudaFreeHost(CPUbin);
-  cudaFree(gridData_d);
-  cudaFree(sampleDensity_d);
-  cudaFree(binCount_d);
-  cudaFree(sortedSample_d);
-
-  pb_SwitchToTimer(timers, pb_TimerID_NONE);
-
-  return;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CUDA_interface.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CUDA_interface.h
deleted file mode 100644
index 401759f186..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/CUDA_interface.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-void CUDA_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/GPU_kernels.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/GPU_kernels.cu
deleted file mode 100644
index afd0a3d69f..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/GPU_kernels.cu
+++ /dev/null
@@ -1,164 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "UDTypes.h"
-
-#define TILE 64
-#define LOG_TILE 6
-
-__constant__ float cutoff2_c;
-__constant__ float cutoff_c;
-__constant__ int gridSize_c[3];
-__constant__ int size_xy_c;
-__constant__ float _1overCutoff2_c;
-
-__global__ void binning_kernel (unsigned int n, ReconstructionSample* sample_g, unsigned int* idxKey_g,
-                                unsigned int* idxValue_g, unsigned int* binCount_g, unsigned int binsize, unsigned int gridNumElems){
-  unsigned int key;
-  unsigned int sampleIdx = blockIdx.x*blockDim.x+threadIdx.x;
-  ReconstructionSample pt;
-  unsigned int binIdx;
-  unsigned int count;
-
-  if (sampleIdx < n){
-    pt = sample_g[sampleIdx];
-
-    binIdx = (unsigned int)(pt.kZ)*size_xy_c + (unsigned int)(pt.kY)*gridSize_c[0] + (unsigned int)(pt.kX);
-
-    count = atomicAdd(binCount_g+binIdx, 1);
-    if (count < binsize){
-      key = binIdx;
-    } else {
-      atomicSub(binCount_g+binIdx, 1);
-      key = gridNumElems;
-    }
-
-    idxKey_g[sampleIdx] = key;
-    idxValue_g[sampleIdx] = sampleIdx;
-  }
-}
-
-__global__ void reorder_kernel(int n, unsigned int* idxValue_g, ReconstructionSample* samples_g, ReconstructionSample* sortedSample_g){
-  unsigned int index = blockIdx.x*blockDim.x + threadIdx.x;
-  unsigned int old_index;
-  ReconstructionSample pt;
-
-  if (index < n){
-    old_index = idxValue_g[index];
-    pt = samples_g[old_index];
-    sortedSample_g[index] = pt;
-  }
-}
-
-__device__ float kernel_value(float v){
-
-  float rValue = 0;
-
-  float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-                (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-                 0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-                 0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-                 0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-                 0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-                 0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-                 0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-                 0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = __fdividef(-num,den);
-
-  return rValue;
-}
-
-__global__ void gridding_GPU (ReconstructionSample* sample_g, unsigned int* binStartAddr_g, float2* gridData_g, float* sampleDensity_g, float beta){
-  __shared__ ReconstructionSample sharedBin[TILE];
-
-  const int flatIdx = threadIdx.z*blockDim.y*blockDim.x+threadIdx.y*blockDim.x+threadIdx.x;
-
-  // figure out starting point of the tile
-  const int z0 = blockDim.z*(blockIdx.y/(gridSize_c[1]/blockDim.y));
-  const int y0 = blockDim.y*(blockIdx.y%(gridSize_c[1]/blockDim.y));
-  const int x0 = blockIdx.x*blockDim.x;
-
-  const int X  = x0+threadIdx.x;
-  const int Y  = y0+threadIdx.y;
-  const int Z  = z0+threadIdx.z;
-
-  const int xl = x0-ceil(cutoff_c);
-  const int xL = (xl < 0) ? 0 : xl;
-  const int xh = x0+blockDim.x+cutoff_c;
-  const int xH = (xh >= gridSize_c[0]) ? gridSize_c[0]-1 : xh;
-
-  const int yl = y0-ceil(cutoff_c);
-  const int yL = (yl < 0) ? 0 : yl;
-  const int yh = y0+blockDim.y+cutoff_c;
-  const int yH = (yh >= gridSize_c[1]) ? gridSize_c[1]-1 : yh;
-
-  const int zl = z0-ceil(cutoff_c);
-  const int zL = (zl < 0) ? 0 : zl;
-  const int zh = z0+blockDim.z+cutoff_c;
-  const int zH = (zh >= gridSize_c[2]) ? gridSize_c[2]-1 : zh;
-
-  const int idx = Z*size_xy_c + Y*gridSize_c[0] + X;
-
-  float2 pt;
-  pt.x = 0.0;
-  pt.y = 0.0;
-  float density = 0.0;
-
-  for (int z = zL; z <= zH; z++){
-    for (int y = yL; y <= yH; y++){
-      const unsigned int *addr = binStartAddr_g+z*size_xy_c+ y*gridSize_c[0];
-      const unsigned int start = *(addr+xL);
-      const unsigned int end   = *(addr+xH+1);
-      const unsigned int delta = end-start;
-      for (int x = 0; x < ((delta+TILE-1)>>LOG_TILE); x++){
-        int tileSize = ((delta-(x<<LOG_TILE)) > TILE) ? TILE : (delta-(x<<LOG_TILE));
-        int globalIdx = flatIdx+(x<<LOG_TILE);
-        __syncthreads();
-        if(flatIdx < tileSize){
-          sharedBin[flatIdx] = sample_g[start+globalIdx];
-        }
-        __syncthreads();
-
-        for (int j=0; j< tileSize; j++){
-          const float real = sharedBin[j].real;
-          const float imag = sharedBin[j].imag;
-          const float sdc = sharedBin[j].sdc;
-
-          if((real != 0.0 || imag != 0.0) && sdc != 0.0){
-            float v = (sharedBin[j].kX-X)*(sharedBin[j].kX-X);
-            v += (sharedBin[j].kY-Y)*(sharedBin[j].kY-Y);
-            v += (sharedBin[j].kZ-Z)*(sharedBin[j].kZ-Z);
-            if(v<cutoff2_c){
-              const float w = kernel_value(beta*sqrtf(1.0-(v*_1overCutoff2_c))) *sdc;
-              pt.x += w*real;
-              pt.y += w*imag;
-              density += 1.0;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  gridData_g[idx] = pt;
-  sampleDensity_g[idx] = density;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/Makefile b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/Makefile
deleted file mode 100644
index 670cc5a632..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=cuda
-SRCDIR_OBJS=main.o CPU_kernels.o CUDA_interface.o scanLargeArray.o sort.o
-
-APP_CUDACFLAGS=-arch=compute_20
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/UDTypes.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/UDTypes.h
deleted file mode 100644
index 687fb50157..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/UDTypes.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef _UDTYPES_H__
-#define _UDTYPES_H__
-
-typedef struct{
-  int numSamples;
-  int aquisitionMatrixSize[3];
-  int reconstructionMatrixSize[3];
-  float kMax[3];
-  int gridSize[3];
-  float oversample;
-  float kernelWidth;
-  int binsize;
-  int useLUT;
-}parameters;
-
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-typedef struct{
-  float real;
-  float imag;
-} cmplx;
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/main.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/main.cu
deleted file mode 100644
index 95f393a35f..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/main.cu
+++ /dev/null
@@ -1,245 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <cuda.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "CUDA_interface.h"
-#include "CPU_kernels.h"
-
-#define PI 3.14159265
-#define CUERR \
-  do { \
-    cudaError_t err; \
-    if ((err = cudaGetLastError()) != cudaSuccess) { \
-      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-      return 0; \
-    } \
-  } while (0)
-
-/************************************************************ 
- * This function reads the parameters from the file provided
- * as a comman line argument.
- ************************************************************/
-void setParameters(FILE* file, parameters* p){
-  fscanf(file,"aquisition.numsamples=%d\n",&(p->numSamples));
-  fscanf(file,"aquisition.kmax=%f %f %f\n",&(p->kMax[0]), &(p->kMax[1]), &(p->kMax[2]));
-  fscanf(file,"aquisition.matrixSize=%d %d %d\n", &(p->aquisitionMatrixSize[0]), &(p->aquisitionMatrixSize[1]), &(p->aquisitionMatrixSize[2]));
-  fscanf(file,"reconstruction.matrixSize=%d %d %d\n", &(p->reconstructionMatrixSize[0]), &(p->reconstructionMatrixSize[1]), &(p->reconstructionMatrixSize[2]));
-  fscanf(file,"gridding.matrixSize=%d %d %d\n", &(p->gridSize[0]), &(p->gridSize[1]), &(p->gridSize[2]));
-  fscanf(file,"gridding.oversampling=%f\n", &(p->oversample));
-  fscanf(file,"kernel.width=%f\n", &(p->kernelWidth));
-  fscanf(file,"kernel.useLUT=%d\n", &(p->useLUT));
-
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  printf("  Total amount of GPU memory: %llu bytes\n", (unsigned long long) deviceProp.totalGlobalMem);
-  printf("  Number of samples = %d\n", p->numSamples);
-  if (p->numSamples > 10000000 && deviceProp.totalGlobalMem/1024/1024 < 3000) {
-    printf("  Need at least 3GB of GPU memory for large dataset\n");
-    exit(1);
-  }
-  printf("  Grid Size = %dx%dx%d\n", p->gridSize[0], p->gridSize[1], p->gridSize[2]);
-  printf("  Input Matrix Size = %dx%dx%d\n", p->aquisitionMatrixSize[0], p->aquisitionMatrixSize[1], p->aquisitionMatrixSize[2]);
-  printf("  Recon Matrix Size = %dx%dx%d\n", p->reconstructionMatrixSize[0], p->reconstructionMatrixSize[1], p->reconstructionMatrixSize[2]);
-  printf("  Kernel Width = %f\n", p->kernelWidth);
-  printf("  KMax = %.2f %.2f %.2f\n", p->kMax[0], p->kMax[1], p->kMax[2]);
-  printf("  Oversampling = %f\n", p->oversample);
-  printf("  GPU Binsize = %d\n", p->binsize);
-  printf("  Use LUT = %s\n", (p->useLUT)?"Yes":"No");
-}
-
-/************************************************************ 
- * This function reads the sample point data from the kspace
- * and klocation files (and sdc file if provided) into the
- * sample array.
- * Returns the number of samples read successfully.
- ************************************************************/
-unsigned int readSampleData(parameters params, FILE* uksdata_f, ReconstructionSample* samples){
-  unsigned int i;
-
-  for(i=0; i<params.numSamples; i++){
-    if (feof(uksdata_f)){
-      break;
-    }
-    fread((void*) &(samples[i]), sizeof(ReconstructionSample), 1, uksdata_f);
-  }
-
-  float kScale[3];
-  kScale[0] = float(params.aquisitionMatrixSize[0])/(float(params.reconstructionMatrixSize[0])*float(params.kMax[0]));
-  kScale[1] = float(params.aquisitionMatrixSize[1])/(float(params.reconstructionMatrixSize[1])*float(params.kMax[1]));
-  kScale[2] = float(params.aquisitionMatrixSize[2])/(float(params.reconstructionMatrixSize[2])*float(params.kMax[2]));
-
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-
-  float ax = (kScale[0]*(size_x-1))/2.0;
-  float bx = (float)(size_x-1)/2.0;
-
-  float ay = (kScale[1]*(size_y-1))/2.0;
-  float by = (float)(size_y-1)/2.0;
-
-  float az = (kScale[2]*(size_z-1))/2.0;
-  float bz = (float)(size_z-1)/2.0;
-
-  for(int n=0; n<i; n++){
-    samples[n].kX = floor((samples[n].kX*ax)+bx);
-    samples[n].kY = floor((samples[n].kY*ay)+by);
-    samples[n].kZ = floor((samples[n].kZ*az)+bz);
-  }
-
-  return i;
-}
-
-
-int main (int argc, char* argv[]){
-  struct pb_Parameters* prms;
-  struct pb_TimerSet timers;
-
-  prms = pb_ReadParameters(&argc,argv);
-  pb_InitializeTimerSet(&timers);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  char uksdata[250];
-  parameters params;
-
-  FILE* uksfile_f = NULL;
-  FILE* uksdata_f = NULL;
-
-  strcpy(uksdata,prms->inpFiles[0]);
-  strcat(uksdata,".data");
-
-  uksfile_f = fopen(prms->inpFiles[0],"r");
-  if (uksfile_f == NULL){
-    printf("ERROR: Could not open %s\n",prms->inpFiles[0]);
-    exit(1);
-  }
-
-  printf("\nReading parameters\n");
-
-  if (argc >= 2){
-    params.binsize = atoi(argv[1]);
-  } else { //default binsize value;
-    params.binsize = 128;
-  }
-
-  setParameters(uksfile_f, &params);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  ReconstructionSample* samples; //Input Data
-  float* LUT; //use look-up table for faster execution on CPU (intermediate data)
-  unsigned int sizeLUT; //set in the function calculateLUT (intermediate data)
-
-  cmplx* gridData; //Output Data
-  float* sampleDensity; //Output Data
-
-  cmplx* gridData_gold; //Gold Output Data
-  float* sampleDensity_gold; //Gold Output Data
-
-  cudaMallocHost((void**)&samples, params.numSamples*sizeof(ReconstructionSample));
-  CUERR;
-  if (samples == NULL){
-    printf("ERROR: Unable to allocate memory for input data\n");
-    exit(1);
-  }
-
-  uksdata_f = fopen(uksdata,"rb");
-
-  if(uksdata_f == NULL){
-    printf("ERROR: Could not open data file\n");
-    exit(1);
-  }
-
-  printf("Reading input data from files\n");
-
-  unsigned int n = readSampleData(params, uksdata_f, samples);
-  fclose(uksdata_f);
-
-  if (params.useLUT){
-    printf("Generating Look-Up Table\n");
-    float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-    calculateLUT(beta, params.kernelWidth, &LUT, &sizeLUT);
-  }
-
-  int gridNumElems = params.gridSize[0] * params.gridSize[1] * params.gridSize[2];
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  gridData_gold = (cmplx*) calloc (gridNumElems, sizeof(cmplx));
-  sampleDensity_gold = (float*) calloc (gridNumElems, sizeof(float));
-  if (sampleDensity_gold == NULL || gridData_gold == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  printf("Running gold version\n");
-
-  gridding_Gold(n, params, samples, LUT, sizeLUT, gridData_gold, sampleDensity_gold);
-
-  cudaMallocHost((void**)&gridData, gridNumElems*sizeof(cmplx));
-  cudaMallocHost((void**)&sampleDensity, gridNumElems*sizeof(float));
-  CUERR;
-  if (sampleDensity == NULL || gridData == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  printf("Running CUDA version\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  //Interface function to GPU implementation of gridding
-  CUDA_interface(&timers, n, params, samples, LUT, sizeLUT, gridData, sampleDensity);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  int passed=1;
-  for (int i=0; i<gridNumElems; i++){
-    if(sampleDensity[i] != sampleDensity_gold[i]) {
-      passed=0;
-      break;
-    }
-  }
-  //(passed) ? printf("Comparing GPU and Gold results... PASSED\n"):printf("Comparing GPU and Gold results... FAILED\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  FILE* outfile;
-  if(!(outfile=fopen(prms->outFile,"w")))
-  {
-        printf("Cannot open output file!\n");
-  } else {
-        fwrite(&passed,sizeof(int),1,outfile);
-        fclose(outfile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  if (params.useLUT){
-    free(LUT);
-  }
-  cudaFreeHost(samples);
-  cudaFreeHost(gridData);
-  cudaFreeHost(sampleDensity);
-  free(gridData_gold);
-  free(sampleDensity_gold);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(prms);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/scanLargeArray.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/scanLargeArray.cu
deleted file mode 100644
index 1066d5982c..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/scanLargeArray.cu
+++ /dev/null
@@ -1,267 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-
-#define BLOCK_SIZE 1024
-#define GRID_SIZE 65535
-#define NUM_BANKS 16
-#define LOG_NUM_BANKS 4
-
-#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS + (index) >> (2*LOG_NUM_BANKS))
-#define EXPANDED_SIZE(__x) (__x+(__x>>LOG_NUM_BANKS)+(__x>>(2*LOG_NUM_BANKS)))
-
-////////////////////////////////////////////////////////////////////////////////
-// Kernels
-////////////////////////////////////////////////////////////////////////////////
-__global__ void scan_L1_kernel(unsigned int n, unsigned int* data, unsigned int* inter)
-{
-    __shared__ unsigned int s_data[EXPANDED_SIZE(BLOCK_SIZE)]; 
-
-    unsigned int thid = threadIdx.x;
-    unsigned int g_ai = blockIdx.x*2*blockDim.x + threadIdx.x;
-    unsigned int g_bi = g_ai + blockDim.x;
-
-    unsigned int s_ai = thid;
-    unsigned int s_bi = thid + blockDim.x;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = (g_ai < n) ? data[g_ai] : 0;
-    s_data[s_bi] = (g_bi < n) ? data[g_bi] : 0;
-
-    unsigned int stride = 1;
-    for (unsigned int d = blockDim.x; d > 0; d >>= 1)
-    {
-        __syncthreads();
-
-        if (thid < d)
-        {
-            unsigned int i  = 2*stride*thid;
-            unsigned int ai = i + stride - 1;
-            unsigned int bi = ai + stride;
-
-            ai += CONFLICT_FREE_OFFSET(ai);
-            bi += CONFLICT_FREE_OFFSET(bi);
-
-            s_data[bi] += s_data[ai];
-        }
-
-        stride *= 2;
-    }
-
-    if (thid == 0){
-        unsigned int last = blockDim.x*2 -1;
-        last += CONFLICT_FREE_OFFSET(last);
-        inter[blockIdx.x] = s_data[last];
-        s_data[last] = 0;
-    }
-
-    for (unsigned int d = 1; d <= blockDim.x; d *= 2)
-    {
-        stride >>= 1;
-
-        __syncthreads();
-
-        if (thid < d)
-        {
-            unsigned int i  = 2*stride*thid;
-            unsigned int ai = i + stride - 1;
-            unsigned int bi = ai + stride;
-
-            ai += CONFLICT_FREE_OFFSET(ai);
-            bi += CONFLICT_FREE_OFFSET(bi);
-
-            unsigned int t  = s_data[ai];
-            s_data[ai] = s_data[bi];
-            s_data[bi] += t;
-        }
-    }
-    __syncthreads();
-
-    if (g_ai < n) { data[g_ai] = s_data[s_ai]; }
-    if (g_bi < n) { data[g_bi] = s_data[s_bi]; }
-}
-
-__global__ void scan_inter1_kernel(unsigned int* data, unsigned int iter)
-{
-    extern __shared__ unsigned int s_data[];
-
-    unsigned int thid = threadIdx.x;
-    unsigned int gthid = (blockIdx.x*blockDim.x + threadIdx.x);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = 1;
-    for (unsigned int d = blockDim.x; d > 0; d >>= 1)
-    {
-        __syncthreads();
-
-        if (thid < d)
-        {
-            unsigned int i  = 2*stride*thid;
-            unsigned int ai = i + stride - 1;
-            unsigned int bi = ai + stride;
-
-            ai += CONFLICT_FREE_OFFSET(ai);
-            bi += CONFLICT_FREE_OFFSET(bi);
-            s_data[bi] += s_data[ai];
-        }
-
-        stride *= 2;
-    }
-
-    __syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-__global__ void scan_inter2_kernel(unsigned int* data, unsigned int iter)
-{
-    extern __shared__ unsigned int s_data[];
-
-    unsigned int thid = threadIdx.x;
-    unsigned int gthid = (blockIdx.x*blockDim.x + threadIdx.x);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = blockDim.x*2;
-
-    for (unsigned int d = 1; d <= blockDim.x; d *= 2)
-    {
-        stride >>= 1;
-
-        __syncthreads();
-
-        if (thid < d)
-        {
-            unsigned int i  = 2*stride*thid;
-            unsigned int ai = i + stride - 1;
-            unsigned int bi = ai + stride;
-
-            ai += CONFLICT_FREE_OFFSET(ai);
-            bi += CONFLICT_FREE_OFFSET(bi);
-
-            unsigned int t  = s_data[ai];
-            s_data[ai] = s_data[bi];
-            s_data[bi] += t;
-        }
-    }
-    __syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-__global__ void uniformAdd(unsigned int n, unsigned int *data, unsigned int *inter)
-{
-
-    __shared__ unsigned int uni;
-    if (threadIdx.x == 0) { uni = inter[blockIdx.x]; }
-    __syncthreads();
-
-    unsigned int g_ai = blockIdx.x*2*blockDim.x + threadIdx.x;
-    unsigned int g_bi = g_ai + blockDim.x;
-
-    if (g_ai < n) { data[g_ai] += uni; }
-    if (g_bi < n) { data[g_bi] += uni; }
-}
-
-void scanLargeArray( unsigned int gridNumElements, unsigned int* data_d) {
-    unsigned int gridNumElems = gridNumElements;    
-
-    // allocate device memory input and output arrays
-    unsigned int* inter_d = NULL;
-
-    // Run the prescan
-    unsigned int size = (gridNumElems+BLOCK_SIZE-1)/BLOCK_SIZE;
-
-    unsigned int dim_block;
-    unsigned int current_max = size*BLOCK_SIZE;
-    for (int block_size = 128; block_size <= BLOCK_SIZE; block_size *= 2){
-      unsigned int array_size = block_size;
-      while(array_size < size){
-        array_size *= block_size;
-      }
-      if (array_size <= current_max){
-        current_max = array_size;
-        dim_block = block_size;
-      }
-    }
-
-    cudaMalloc( (void**) &inter_d, current_max*sizeof(unsigned int));
-    cudaMemset (inter_d, 0, current_max*sizeof(unsigned int));
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++){
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*BLOCK_SIZE)) > (GRID_SIZE*BLOCK_SIZE)) ? (GRID_SIZE*BLOCK_SIZE) : (gridNumElems-(i*GRID_SIZE*BLOCK_SIZE));
-
-        dim3 block (BLOCK_SIZE/2);
-        dim3 grid (gridSize);
-        scan_L1_kernel<<<grid, block>>>(numElems, data_d+(i*GRID_SIZE*BLOCK_SIZE), inter_d+(i*GRID_SIZE));
-    }
-
-    unsigned int stride = 1;
-    for (unsigned int d = current_max; d > 1; d /= dim_block)
-    {
-        dim3 block (dim_block/2);
-        dim3 grid (d/dim_block);
-
-        scan_inter1_kernel<<<grid, block, EXPANDED_SIZE(dim_block)*sizeof(unsigned int)>>>(inter_d, stride);
-
-        stride *= dim_block;
-    }
-
-    cudaMemset(&(inter_d[current_max-1]), 0, sizeof(unsigned int));
-
-    for (unsigned int d = dim_block; d <= current_max; d *= dim_block)
-    {
-        stride /= dim_block;
-        dim3 block (dim_block/2);
-        dim3 grid (d/dim_block);
-
-        scan_inter2_kernel<<<grid, block, EXPANDED_SIZE(dim_block)*sizeof(unsigned int)>>>(inter_d, stride);
-    }
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++){
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*BLOCK_SIZE)) > (GRID_SIZE*BLOCK_SIZE)) ? (GRID_SIZE*BLOCK_SIZE) : (gridNumElems-(i*GRID_SIZE*BLOCK_SIZE));
-
-        dim3 block (BLOCK_SIZE/2);
-        dim3 grid (gridSize);
-
-        uniformAdd<<<grid, block>>>(numElems, data_d+(i*GRID_SIZE*BLOCK_SIZE), inter_d+(i*GRID_SIZE));
-    }
-
-    cudaFree(inter_d);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/scanLargeArray.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/scanLargeArray.h
deleted file mode 100644
index a413fd6b7c..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/scanLargeArray.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-void scanLargeArray(unsigned int gridNumElements, unsigned int* data_d);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/sort.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/sort.cu
deleted file mode 100644
index c071eb87fb..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/sort.cu
+++ /dev/null
@@ -1,254 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <cuda.h>
-#include <stdio.h>
-
-#include "scanLargeArray.h"
-
-#define UINT32_MAX 4294967295
-#define BITS 4
-#define LNB 4
-
-#define SORT_BS 256
-
-#define CONFLICT_FREE_OFFSET(index) ((index) >> LNB + (index) >> (2*LNB))
-#define BLOCK_P_OFFSET (4*SORT_BS+1+(4*SORT_BS+1)/16+(4*SORT_BS+1)/64)
-
-__device__ void scan (unsigned int s_data[BLOCK_P_OFFSET]){
-  unsigned int thid = threadIdx.x;
-
-  __syncthreads();
-
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*(blockDim.x+thid)+1+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid)+1)] += s_data[2*(blockDim.x+thid)+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid))];
-
-  unsigned int stride = 2;
-  for (unsigned int d = blockDim.x; d > 0; d >>= 1)
-  {
-    __syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      s_data[bi] += s_data[ai];
-    }
-
-    stride *= 2;
-  }
-
-  if (thid == 0){
-    unsigned int last = 4*blockDim.x-1;
-    last += CONFLICT_FREE_OFFSET(last);
-    s_data[4*blockDim.x+CONFLICT_FREE_OFFSET(4*blockDim.x)] = s_data[last];
-    s_data[last] = 0;
-  }
-
-  for (unsigned int d = 1; d <= blockDim.x; d *= 2)
-  {
-    stride >>= 1;
-
-    __syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      unsigned int t  = s_data[ai];
-      s_data[ai] = s_data[bi];
-      s_data[bi] += t;
-    }
-  }
-  __syncthreads();
-
-  unsigned int temp = s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)] = s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)];
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += temp;
-
-  unsigned int temp2 = s_data[2*(blockDim.x+thid)+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid))];
-  s_data[2*(blockDim.x+thid)+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid))] = s_data[2*(blockDim.x+thid)+1+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid)+1)];
-  s_data[2*(blockDim.x+thid)+1+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid)+1)] += temp2;
-
-  __syncthreads();
-}
-
-__global__ static void splitSort(int numElems, int iter, unsigned int* keys, unsigned int* values, unsigned int* histo)
-{
-    __shared__ unsigned int flags[BLOCK_P_OFFSET];
-    __shared__ unsigned int histo_s[1<<BITS];
-
-    const unsigned int tid = threadIdx.x;
-    const unsigned int gid = blockIdx.x*4*SORT_BS+4*threadIdx.x;
-
-    // Copy input to shared mem. Assumes input is always even numbered
-    uint4 lkey = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX};
-    uint4 lvalue;
-    if (gid < numElems){
-      lkey = *((uint4*)(keys+gid));
-      lvalue = *((uint4*)(values+gid));
-    }
-
-    if(tid < (1<<BITS)){
-      histo_s[tid] = 0;
-    }
-    __syncthreads();
-
-    atomicAdd(histo_s+((lkey.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atomicAdd(histo_s+((lkey.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atomicAdd(histo_s+((lkey.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atomicAdd(histo_s+((lkey.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-
-    uint4 index = {4*tid, 4*tid+1, 4*tid+2, 4*tid+3};
-
-    for (int i=BITS*iter; i<BITS*(iter+1);i++){
-      const uint4 flag = {(lkey.x>>i)&0x1,(lkey.y>>i)&0x1,(lkey.z>>i)&0x1,(lkey.w>>i)&0x1};
-
-      flags[index.x+CONFLICT_FREE_OFFSET(index.x)] = 1<<(16*flag.x);
-      flags[index.y+CONFLICT_FREE_OFFSET(index.y)] = 1<<(16*flag.y);
-      flags[index.z+CONFLICT_FREE_OFFSET(index.z)] = 1<<(16*flag.z);
-      flags[index.w+CONFLICT_FREE_OFFSET(index.w)] = 1<<(16*flag.w);
-
-      scan (flags);
-
-      index.x = (flags[index.x+CONFLICT_FREE_OFFSET(index.x)]>>(16*flag.x))&0xFFFF;
-      index.y = (flags[index.y+CONFLICT_FREE_OFFSET(index.y)]>>(16*flag.y))&0xFFFF;
-      index.z = (flags[index.z+CONFLICT_FREE_OFFSET(index.z)]>>(16*flag.z))&0xFFFF;
-      index.w = (flags[index.w+CONFLICT_FREE_OFFSET(index.w)]>>(16*flag.w))&0xFFFF;
-
-      unsigned short offset = flags[4*blockDim.x+CONFLICT_FREE_OFFSET(4*blockDim.x)]&0xFFFF;
-      index.x += (flag.x) ? offset : 0;
-      index.y += (flag.y) ? offset : 0;
-      index.z += (flag.z) ? offset : 0;
-      index.w += (flag.w) ? offset : 0;
-
-      __syncthreads();
-    }
-
-    // Write result.
-    if (gid < numElems){
-      keys[blockIdx.x*4*SORT_BS+index.x] = lkey.x;
-      keys[blockIdx.x*4*SORT_BS+index.y] = lkey.y;
-      keys[blockIdx.x*4*SORT_BS+index.z] = lkey.z;
-      keys[blockIdx.x*4*SORT_BS+index.w] = lkey.w;
-
-      values[blockIdx.x*4*SORT_BS+index.x] = lvalue.x;
-      values[blockIdx.x*4*SORT_BS+index.y] = lvalue.y;
-      values[blockIdx.x*4*SORT_BS+index.z] = lvalue.z;
-      values[blockIdx.x*4*SORT_BS+index.w] = lvalue.w;
-    }
-    if (tid < (1<<BITS)){
-      histo[gridDim.x*threadIdx.x+blockIdx.x] = histo_s[tid];
-    }
-}
-
-__global__ void splitRearrange (int numElems, int iter, unsigned int* keys_i, unsigned int* keys_o, unsigned int* values_i, unsigned int* values_o, unsigned int* histo){
-  __shared__ unsigned int histo_s[(1<<BITS)];
-  __shared__ unsigned int array_s[4*SORT_BS];
-  int index = blockIdx.x*4*SORT_BS + 4*threadIdx.x;
-
-  if (threadIdx.x < (1<<BITS)){
-    histo_s[threadIdx.x] = histo[gridDim.x*threadIdx.x+blockIdx.x];
-  }
-
-  uint4 mine, value;
-  if (index < numElems){
-    mine = *((uint4*)(keys_i+index));
-    value = *((uint4*)(values_i+index));
-  } else {
-    mine.x = UINT32_MAX;
-    mine.y = UINT32_MAX;
-    mine.z = UINT32_MAX;
-    mine.w = UINT32_MAX;
-  }
-  uint4 masks = {(mine.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)};
-
-  ((uint4*)array_s)[threadIdx.x] = masks;
-  __syncthreads();
-
-  uint4 new_index = {histo_s[masks.x],histo_s[masks.y],histo_s[masks.z],histo_s[masks.w]};
-
-  int i = 4*threadIdx.x-1;
-  while (i >= 0){
-    if (array_s[i] == masks.x){
-      new_index.x++;
-      i--;
-    } else {
-      break;
-    }
-  }
-
-  new_index.y = (masks.y == masks.x) ? new_index.x+1 : new_index.y;
-  new_index.z = (masks.z == masks.y) ? new_index.y+1 : new_index.z;
-  new_index.w = (masks.w == masks.z) ? new_index.z+1 : new_index.w;
-
-  if (index < numElems){
-    keys_o[new_index.x] = mine.x;
-    values_o[new_index.x] = value.x;
-
-    keys_o[new_index.y] = mine.y;
-    values_o[new_index.y] = value.y;
-
-    keys_o[new_index.z] = mine.z;
-    values_o[new_index.z] = value.z;
-
-    keys_o[new_index.w] = mine.w;
-    values_o[new_index.w] = value.w;
-  }
-}
-
-void sort (int numElems, unsigned int max_value, unsigned int* &dkeys, unsigned int* &dvalues){
-  dim3 grid ((numElems+4*SORT_BS-1)/(4*SORT_BS));
-  dim3 block (SORT_BS);
-
-  unsigned int iterations = 0;
-  while(max_value > 0){
-    max_value >>= BITS;
-    iterations++;
-  }
-
-  unsigned int *dhisto;
-  unsigned int *dkeys_o, *dvalues_o;
-
-  cudaMalloc((void**)&dhisto, (1<<BITS)*grid.x*sizeof(unsigned int));
-  cudaMalloc((void**)&dkeys_o, numElems*sizeof(unsigned int));
-  cudaMalloc((void**)&dvalues_o, numElems*sizeof(unsigned int));
-
-  for (int i=0; i<iterations; i++){
-    splitSort<<<grid,block>>>(numElems, i, dkeys, dvalues, dhisto);
-
-    scanLargeArray(grid.x*(1<<BITS), dhisto);
-
-    splitRearrange<<<grid,block>>>(numElems, i, dkeys, dkeys_o, dvalues, dvalues_o, dhisto);
-
-    unsigned int* temp = dkeys;
-    dkeys = dkeys_o;
-    dkeys_o = temp;
-
-    temp = dvalues;
-    dvalues = dvalues_o;
-    dvalues_o = temp;
-  }
-
-  cudaFree(dkeys_o);
-  cudaFree(dvalues_o);
-  cudaFree(dhisto);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/sort.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/sort.h
deleted file mode 100644
index 3d150bb2fb..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda-base/sort.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-void sort (int numElems, unsigned int maxValue, unsigned int* &dkeys, unsigned int* &dvalues);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CPU_kernels.c b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CPU_kernels.c
deleted file mode 100644
index 87f4c0cbec..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CPU_kernels.c
+++ /dev/null
@@ -1,353 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "UDTypes.h"
-
-#define max(x,y) ((x<y)?y:x)
-#define min(x,y) ((x>y)?y:x)
-
-#define PI 3.14159265359
-
-float kernel_value_CPU(float v){
-
-  float rValue = 0;
-
-  const float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-  (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-   0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-   0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-   0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-   0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-   0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-   0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-   0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = -num/den;
-
-  return rValue;
-}
-
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT){
-  float v;
-  float cutoff2 = (width*width)/4.0;
-
-  unsigned int size;
-
-  if(width > 0){
-    // compute size of LUT based on kernel width
-    size = (unsigned int)(10000*width);
-
-    // allocate memory
-    (*LUT) = (float*) malloc (size*sizeof(float));
-
-    unsigned int k;
-    for(k=0; k<size; ++k){
-      // compute value to evaluate kernel at
-      // v in the range 0:(_width/2)^2
-      v = (((float)k)/((float)size))*cutoff2;
-
-      // compute kernel value and store
-      (*LUT)[k] = kernel_value_CPU(beta*sqrt(1.0-(v/cutoff2)));
-    }
-    (*sizeLUT) = size;
-  }
-}
-
-float kernel_value_LUT(float v, float* LUT, int sizeLUT, float _1overCutoff2)
-{
-  unsigned int k0;
-  float v0;
-
-  v *= (float)sizeLUT;
-  k0=(unsigned int)(v*_1overCutoff2);
-  v0 = ((float)k0)/_1overCutoff2;
-  return  LUT[k0] + ((v-v0)*(LUT[k0+1]-LUT[k0])/_1overCutoff2);
-}
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity){
-
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  float cutoff = (float)(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int i;
-  for (i=0; i < n; i++){
-    ReconstructionSample pt = sample[i];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0.0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0.0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0.0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* kernel weighting value */
-                  if (params.useLUT){
-        		    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-		          } else {
-		            w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-		          }
-
-                  /* grid data */
-                  gridData[idx].real += (w*pt.real);
-                  gridData[idx].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-                  sampleDensity[idx] += 1.0;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-int gridding_CPU(unsigned int n, parameters params, ReconstructionSample* sample, int* CPUbin, int CPUbin_size, 
-		 float* LUT, int sizeLUT, cmplx* gridData[], float* sampleDensity[], int* indeces[]){
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  int gridNumElems = size_x*size_y*size_z;
-
-  float cutoff = (float)(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int pos = 0;
-  int* posArray = (int*) malloc (gridNumElems*sizeof(int));
-  memset(posArray, 0xFF, gridNumElems*sizeof(int));
-  (*indeces) = (int*) malloc (gridNumElems*sizeof(int));
-  (*gridData) = (cmplx*) calloc (gridNumElems,sizeof(cmplx));
-  (*sampleDensity) = (float*) calloc (gridNumElems,sizeof(float));
-
-  if (*gridData == NULL || *sampleDensity == NULL || *indeces == NULL){
-    printf("unable to allocate temporary CPU space\n");
-    exit(1);
-  }
-
-  int i;
-  for (i=0; i < CPUbin_size; i++){
-    ReconstructionSample pt = sample[CPUbin[i]];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* kernel weighting value */
-                  if (params.useLUT){
-                    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-                  } else {
-                    w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-                  }
-
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* grid data */
-                  if(posArray[idx] == -1){
-                    posArray[idx] = pos;
-                    (*indeces)[pos] = idx;
-                    pos++;
-                  }
-
-                  (*gridData)[posArray[idx]].real += (w*pt.real);
-                  (*gridData)[posArray[idx]].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-                  (*sampleDensity)[posArray[idx]] += 1;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  free(posArray);
-  return pos;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CPU_kernels.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CPU_kernels.h
deleted file mode 100644
index 1d883f00f7..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CPU_kernels.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "UDTypes.h"
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT);
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity);
-
-int gridding_CPU(unsigned int n, parameters params, ReconstructionSample* sample, int* CPUbin, int CPUbin_size,
-                 float* LUT, int sizeLUT, cmplx* gridData[], float* sampleDensity[], int* indeces[]);
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CUDA_interface.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CUDA_interface.cu
deleted file mode 100644
index 7142565dc8..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CUDA_interface.cu
+++ /dev/null
@@ -1,308 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <cuda_runtime.h>
-#include <cuda_runtime_api.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "scanLargeArray.h"
-#include "GPU_kernels.cu"
-#include "CPU_kernels.h"
-
-#define USE_CUDPP 0
-#if USE_CUDPP
-#include "cudpp.h"
-#else
-#include "sort.h"
-#include "scanLargeArray.h"
-#endif
-
-#define BLOCKSIZE 512
-#define PI 3.14159265359
-
-#define CUERR \
-  do { \
-    cudaError_t err; \
-    if ((err = cudaGetLastError()) != cudaSuccess) { \
-      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-      return; \
-    } \
-  } while (0)
-
-
-// Compare function used for Qsort for CPU computation
-int compare (const void * a, const void * b)
-{
-  return ( *(int*)a - *(int*)b );
-}
-
-/***********************************************************************
- * CUDA_interface is the main function for GPU execution. This
- * implementation uses compact binning to distribute input elements
- * into unit-cubed sized bins. The bins are then visited by GPU
- * threads, where every thread computes the value of one (or small set)
- * of output elements by computing the contributions of elements in 
- * neighboring bins to these output elements.
- *
- * The bins have a limited bin size and everything beyond that bin size
- * is offloaded to the CPU to be computed in parallel with the GPU
- * gridding.
- ***********************************************************************/
-void CUDA_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-){
-
-  /* Initializing all variables */
-  dim3 dims (8,4,2); //size of a gridding block on the GPU
-
-  /* x, y, z dimensions of the output grid (gridData) */
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-  int size_xy = size_y*size_x;
-
-  int gridNumElems = size_x * size_y * size_z;  // Total number of grid points
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  float cutoff = float(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  // Padding used to align the structure of arrays used for the sorted input elements
-  int npad = 0;
-  if (n % 64 != 0){
-    npad = 64 - (n%64);
-  }
-
-  /* Declarations of host data structures */
-  cmplx* gridData_CPU;
-  float* sampleDensity_CPU;
-  int* indices_CPU;
-
-  /* Declarations of device data structures */
-  ReconstructionSample* sample_d = NULL;    // Device array for original input array
-  float* sortedSample_d = NULL;             // Device array of the sorted (into bins) input elements.
-                                            // This array is accessed by sortedSampleSoA_d in a structure
-                                            //   of arrays manner.
-  float2* gridData_d = NULL;                // Device array for output grid
-  float* sampleDensity_d = NULL;            // Device array for output sample density
-  unsigned int* idxKey_d = NULL;            // Array of bin indeces generated in the binning kernel
-                                            //   and used to sort the input elements into their
-                                            //   corresponding bins
-  unsigned int* idxValue_d = NULL;          // This array holds the indices of input elements in the
-                                            //   the original array. This array is sorted using the
-                                            //   the idxKey_d array, and once sorted, it is used in
-                                            //   the reorder kernel to move the actual elements into
-                                            //   their corresponding bins.
-  sampleArrayStruct sortedSampleSoA_d;      // Structure of Arrays which holds the sorted input elements.
-                                            //   Uses sortedSample_d as the underlying physical data
-                                            //   structures
-  unsigned int* binCount_d = NULL;          // Zero-initialized array which counts the number of elements
-                                            //   put in each bin. Based on this array, we determine which
-                                            //   elements get offloaded to the CPU
-  unsigned int* binStartAddr_d = NULL;      // Array of start offset of each of the compact bins
-
-  /* Allocating device memory */
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  cudaMalloc((void**)&sortedSample_d, (n+npad)*sizeof(ReconstructionSample));
-  cudaMalloc((void**)&binStartAddr_d, (gridNumElems+1)*sizeof(unsigned int));
-  cudaMalloc((void**)&sample_d, n*sizeof(ReconstructionSample));
-  cudaMalloc((void**)&idxKey_d, (((n+3)/4)*4)*sizeof(unsigned int));   //Pad to nearest multiple of 4 to 
-  cudaMalloc((void**)&idxValue_d, (((n+3)/4)*4)*sizeof(unsigned int)); //satisfy a property of the sorting kernel.
-
-/*The CUDPP library features highly optimizes implementations for radix sort
-  and prefix sum. However for portability reasons, we implemented our own,
-  slightly less optimized versions of these operations. When performing
-  prefix sum using CUDPP, the output array has to be different from the input
-  array, which is why we would allocate an array for binCount_d. For our
-  implementation, we allow the input and output arrays to be the same,
-  therefore we reuse the binCount_d array to get the starting offset of each
-  bin. */
-#if USE_CUDPP
-  cudaMalloc((void**)&binCount_d, (gridNumElems+1)*sizeof(unsigned int));
-#else
-  binCount_d = binStartAddr_d;
-#endif
-  CUERR;
-
-  /* Transfering data from Host to Device */
-  cudaMemcpyToSymbol(cutoff2_c, &cutoff2, sizeof(float), 0);
-  cudaMemcpyToSymbol(cutoff_c, &cutoff, sizeof(float), 0);
-  cudaMemcpyToSymbol(gridSize_c, params.gridSize, 3*sizeof(int), 0);
-  cudaMemcpyToSymbol(size_xy_c, &size_xy, sizeof(int), 0);
-  cudaMemcpyToSymbol(_1overCutoff2_c, &_1overCutoff2, sizeof(float), 0);
-  cudaMemcpy(sample_d, sample, n*sizeof(ReconstructionSample), cudaMemcpyHostToDevice);
-  cudaMemset(binCount_d, 0, (gridNumElems+1)*sizeof(unsigned int));
-
-  // Initialize padding to max integer value, so that when sorted,
-  // these elements get pushed to the end of the array.
-  cudaMemset(idxKey_d+n, 0xFF, (((n+3)&~(3))-n)*sizeof(unsigned int));
-
-  sortedSampleSoA_d.data = (float2*)(sortedSample_d);
-  sortedSampleSoA_d.loc = (float4*)(sortedSample_d+2*(n+npad));
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 1: Perform binning. This kernel determines which output bin each input element
-   * goes into. Any excess (beyond binsize) is put in the CPU bin
-   */
-  dim3 block1 (BLOCKSIZE);
-  dim3 grid1 ((n+BLOCKSIZE-1)/BLOCKSIZE);
-
-  binning_kernel<<<grid1, block1>>>(n, sample_d, idxKey_d, idxValue_d, binCount_d, params.binsize, gridNumElems);
-
-  /* STEP 2: Sort the index-value pair generate in the binning kernel */
-#if USE_CUDPP
-  CUDPPConfiguration config;
-  config.datatype = CUDPP_UINT;
-  config.algorithm = CUDPP_SORT_RADIX;
-  config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
-
-  CUDPPHandle sortplan = 0;
-  CUDPPResult result = cudppPlan(&sortplan, config, n, 1, 0);
-
-  int precision = 0;
-  int numElems = gridNumElems;
-  while (numElems > 0){
-    numElems >>= 1;
-    precision++;
-  }
-
-  cudppSort(sortplan, idxKey_d, idxValue_d, int(precision), n);
-  result = cudppDestroyPlan(sortplan);
-#else
-  sort(n, gridNumElems+1, idxKey_d, idxValue_d);
-#endif
-
-  /* STEP 3: Reorder the input data, based on the sorted values from Step 2.
-   * this step also involves changing the data from array of structs to a struct
-   * of arrays. Also in this kernel, we populate an array with the starting index
-   * of every output bin features in the input array, based on the sorted indices 
-   * from Step 2.
-   * At the end of this step, we copy the start address and list of input elements
-   * that will be computed on the CPU.
-   */
-  reorder_kernel<<<grid1,block1>>>(n, idxValue_d, sample_d, sortedSampleSoA_d);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  cudaFree(idxKey_d);
-  cudaFree(sample_d);
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 4: In this step we generate the ADD scan of the array of starting indices
-   * of the output bins. The result is an array that contains the starting address of
-   * every output bin.
-   */
-#if USE_CUDPP
-  config.datatype = CUDPP_UINT;
-  config.algorithm = CUDPP_SCAN;
-  config.options = CUDPP_OPTION_EXCLUSIVE;
-  config.op=CUDPP_ADD;
-
-  CUDPPHandle scanplan = 0;
-  result = cudppPlan(&scanplan, config, gridNumElems+1, 1, 0);
-
-  cudppScan(scanplan, binCount_d, binStartAddr_d, gridNumElems+1);
-  result = cudppDestroyPlan(scanplan);
-#else
-  scanLargeArray(gridNumElems+1, binCount_d);
-#endif
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  // Copy back to the CPU the indices of the input elements that will be processed on the CPU
-  int cpuStart;
-  cudaMemcpy(&cpuStart, binCount_d+gridNumElems, sizeof(unsigned int), cudaMemcpyDeviceToHost);
-
-  int CPUbin_size = int(n)-int(cpuStart);
-
-  int* CPUbin;
-  cudaMallocHost((void**)&CPUbin,CPUbin_size*sizeof(unsigned int));
-  cudaMemcpy(CPUbin, idxValue_d+cpuStart, CPUbin_size*sizeof(unsigned int), cudaMemcpyDeviceToHost);
-
-  cudaFree(idxValue_d);
-#if USE_CUDPP
-  cudaFree(binCount_d);
-#endif
-
-  /* STEP 5: Perform the binning on the GPU. The results are computed in a gather fashion
-   * where each thread computes the value of one output element by reading the relevant
-   * bins.
-   */
-  cudaMalloc((void**)&gridData_d, gridNumElems*sizeof(float2));
-  cudaMalloc((void**)&sampleDensity_d, gridNumElems*sizeof(float));
-  CUERR;
-
-  cudaMemset(gridData_d, 0, gridNumElems*sizeof(float2));
-  cudaMemset(sampleDensity_d, 0, gridNumElems*sizeof(float));
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  dim3 block2 (dims.x,dims.y,dims.z);
-  dim3 grid2 (size_x/dims.x, (size_y*size_z)/(4*dims.y*dims.z));
-
-  gridding_GPU<<<grid2, block2>>>(sortedSampleSoA_d, binStartAddr_d, gridData_d, sampleDensity_d, beta);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-
-  qsort(CPUbin, CPUbin_size, sizeof(int), compare); //Sorting helps cache locality of input element array
-  int num = gridding_CPU(n, params, sample, CPUbin, CPUbin_size, LUT, sizeLUT, &gridData_CPU, &sampleDensity_CPU, &indices_CPU);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  /* Copying the results from the Device to the Host */
-  cudaMemcpy(sampleDensity, sampleDensity_d, gridNumElems*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(gridData, gridData_d, gridNumElems*sizeof(float2),cudaMemcpyDeviceToHost);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-
-  /* STEP 6: Computing the contributions of the sample points handled by the Host
-   * and adding those to the GPU results.
-   */
-  for (int i=0; i< num; i++){
-    gridData[indices_CPU[i]].real += gridData_CPU[i].real;
-    gridData[indices_CPU[i]].imag += gridData_CPU[i].imag;
-    sampleDensity[indices_CPU[i]] += sampleDensity_CPU[i];
-  }
-
-  if (gridData_CPU != NULL){
-    free(indices_CPU);
-    free(gridData_CPU);
-    free(sampleDensity_CPU);
-  }
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  cudaFreeHost(CPUbin);
-  cudaFree(gridData_d);
-  cudaFree(sampleDensity_d);
-  cudaFree(binCount_d);
-  cudaFree(sortedSample_d);
-
-  pb_SwitchToTimer(timers, pb_TimerID_NONE);
-
-  return;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CUDA_interface.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CUDA_interface.h
deleted file mode 100644
index 401759f186..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/CUDA_interface.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-void CUDA_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/GPU_kernels.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/GPU_kernels.cu
deleted file mode 100644
index 4285b3e26b..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/GPU_kernels.cu
+++ /dev/null
@@ -1,252 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "UDTypes.h"
-
-#define TILE 64
-#define LOG_TILE 6
-
-typedef struct{
-  float2* data;
-  float4* loc;
-} sampleArrayStruct;
-
-__constant__ float cutoff2_c;
-__constant__ float cutoff_c;
-__constant__ int gridSize_c[3];
-__constant__ int size_xy_c;
-__constant__ float _1overCutoff2_c;
-
-__global__ void binning_kernel (unsigned int n, ReconstructionSample* sample_g, unsigned int* idxKey_g,
-                                unsigned int* idxValue_g, unsigned int* binCount_g, unsigned int binsize, unsigned int gridNumElems){
-  unsigned int key;
-  unsigned int sampleIdx = blockIdx.x*blockDim.x+threadIdx.x;
-  ReconstructionSample pt;
-  unsigned int binIdx;
-  unsigned int count;
-
-  if (sampleIdx < n){
-    pt = sample_g[sampleIdx];
-
-    binIdx = (unsigned int)(pt.kZ)*size_xy_c + (unsigned int)(pt.kY)*gridSize_c[0] + (unsigned int)(pt.kX);
-    if (binCount_g[binIdx]<binsize){
-      count = atomicAdd(binCount_g+binIdx, 1);
-      if (count < binsize){
-        key = binIdx;
-      } else {
-        atomicSub(binCount_g+binIdx, 1);
-        key = gridNumElems;
-      }
-    } else {
-      key = gridNumElems;
-    }
-
-    idxKey_g[sampleIdx] = key;
-    idxValue_g[sampleIdx] = sampleIdx;
-  }
-}
-
-__global__ void reorder_kernel(int n, unsigned int* idxValue_g, ReconstructionSample* samples_g, sampleArrayStruct sortedSampleSoA_g){
-  unsigned int index = blockIdx.x*blockDim.x + threadIdx.x;
-  unsigned int old_index;
-  ReconstructionSample pt;
-
-  if (index < n){
-    old_index = idxValue_g[index];
-    pt = samples_g[old_index];
-
-    float2 data;
-    data.x = pt.real;
-    data.y = pt.imag;
-
-    float4 loc;
-    loc.x = pt.kX;
-    loc.y = pt.kY;
-    loc.z = pt.kZ;
-    loc.w = pt.sdc;
-
-    sortedSampleSoA_g.data[index] = data;
-    sortedSampleSoA_g.loc[index] = loc;
-  }
-}
-
-__device__ float kernel_value(float v){
-
-  float rValue = 0;
-
-  float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-                (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-                 0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-                 0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-                 0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-                 0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-                 0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-                 0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-                 0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = __fdividef(-num,den);
-
-  return rValue;
-}
-
-__global__ void gridding_GPU (sampleArrayStruct sortedSampleSoA_g, unsigned int* binStartAddr_g, float2* gridData_g, float* sampleDensity_g, float beta){
-  __shared__ float real_s[TILE];
-  __shared__ float imag_s[TILE];
-  __shared__ float kx_s[TILE];
-  __shared__ float ky_s[TILE];
-  __shared__ float kz_s[TILE];
-  __shared__ float sdc_s[TILE];
-
-  const int flatIdx = threadIdx.z*blockDim.y*blockDim.x+threadIdx.y*blockDim.x+threadIdx.x;
-
-  // figure out starting point of the tile
-  const int z0 = (4*blockDim.z)*(blockIdx.y/(gridSize_c[1]/blockDim.y));
-  const int y0 = blockDim.y*(blockIdx.y%(gridSize_c[1]/blockDim.y));
-  const int x0 = blockIdx.x*blockDim.x;
-
-  const int X  = x0+threadIdx.x;
-  const int Y  = y0+threadIdx.y;
-  const int Z  = z0+threadIdx.z;
-  const int Z1 = Z+blockDim.z;
-  const int Z2 = Z1+blockDim.z;
-  const int Z3 = Z2+blockDim.z;
-
-  const int xl = x0-ceil(cutoff_c);
-  const int xL = (xl < 0) ? 0 : xl;
-  const int xh = x0+blockDim.x+cutoff_c;
-  const int xH = (xh >= gridSize_c[0]) ? gridSize_c[0]-1 : xh;
-
-  const int yl = y0-ceil(cutoff_c);
-  const int yL = (yl < 0) ? 0 : yl;
-  const int yh = y0+blockDim.y+cutoff_c;
-  const int yH = (yh >= gridSize_c[1]) ? gridSize_c[1]-1 : yh;
-
-  const int zl = z0-ceil(cutoff_c);
-  const int zL = (zl < 0) ? 0 : zl;
-  const int zh = z0+(4*blockDim.z)+cutoff_c;
-  const int zH = (zh >= gridSize_c[2]) ? gridSize_c[2]-1 : zh;
-
-  const int idx = Z*size_xy_c + Y*gridSize_c[0] + X;
-  const int idx1 = idx+blockDim.z*size_xy_c;
-  const int idx2 = idx1+blockDim.z*size_xy_c;
-  const int idx3 = idx2+blockDim.z*size_xy_c;
-
-  float2 pt;
-  pt.x = 0.0;
-  pt.y = 0.0;
-  float density = 0.0;
-
-  float2 pt1;
-  pt1.x = 0.0;
-  pt1.y = 0.0;
-  float density1 = 0.0;  
-
-  float2 pt2;
-  pt2.x = 0.0;
-  pt2.y = 0.0;
-  float density2 = 0.0;
-
-  float2 pt3;
-  pt3.x = 0.0;
-  pt3.y = 0.0;
-  float density3 = 0.0;
-
-  for (int z = zL; z <= zH; z++){
-    for (int y = yL; y <= yH; y++){
-      const unsigned int *addr = binStartAddr_g+z*size_xy_c+ y*gridSize_c[0];
-      const unsigned int start = *(addr+xL);
-      const unsigned int end   = *(addr+xH+1);
-      const unsigned int delta = end-start;
-      for (int x = 0; x < ((delta+TILE-1)>>LOG_TILE); x++){
-        int tileSize = ((delta-(x<<LOG_TILE)) > TILE) ? TILE : (delta-(x<<LOG_TILE));
-        int globalIdx = flatIdx+(x<<LOG_TILE);
-        __syncthreads();
-        if(flatIdx < tileSize){
-          const float2 data = sortedSampleSoA_g.data[start+globalIdx];
-          const float4 loc  = sortedSampleSoA_g.loc [start+globalIdx];
-
-          real_s[flatIdx] = data.x;
-          imag_s[flatIdx] = data.y;
-          kx_s  [flatIdx] = loc.x;
-          ky_s  [flatIdx] = loc.y;
-          kz_s  [flatIdx] = loc.z;
-          sdc_s [flatIdx] = loc.w;
-        }
-        __syncthreads();
-
-        for (int j=0; j< tileSize; j++){
-          const float real = real_s[j];
-          const float imag = imag_s[j];
-          const float sdc = sdc_s[j];
-
-          if((real != 0.0 || imag != 0.0) && sdc != 0.0){
-            float v0 = (kx_s[j]-X)*(kx_s[j]-X);
-            v0 += (ky_s[j]-Y)*(ky_s[j]-Y);
-
-            const float v = v0 + (kz_s[j]-Z)*(kz_s[j]-Z);
-            if(v<cutoff2_c){
-              const float w = kernel_value(beta*sqrtf(1.0-(v*_1overCutoff2_c))) *sdc;
-              pt.x += w*real;
-              pt.y += w*imag;
-              density += 1.0;
-            }
-
-            const float v1 = v0 + (kz_s[j]-Z1)*(kz_s[j]-Z1);
-            if(v1<cutoff2_c){
-              const float w = kernel_value(beta*sqrtf(1.0-(v1*_1overCutoff2_c))) *sdc;
-              pt1.x += w*real;
-              pt1.y += w*imag;
-              density1 += 1.0;
-            }
-
-            const float v2 = v0 + (kz_s[j]-Z2)*(kz_s[j]-Z2);
-            if(v2<cutoff2_c){
-              const float w = kernel_value(beta*sqrtf(1.0-(v2*_1overCutoff2_c))) *sdc;
-              pt2.x += w*real;
-              pt2.y += w*imag;
-              density2 += 1.0;
-            }
-
-            const float v3 = v0 + (kz_s[j]-Z3)*(kz_s[j]-Z3);
-            if(v3<cutoff2_c){
-              const float w = kernel_value(beta*sqrtf(1.0-(v3*_1overCutoff2_c))) *sdc;
-              pt3.x += w*real;
-              pt3.y += w*imag;
-              density3 += 1.0;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  gridData_g[idx] = pt;
-  sampleDensity_g[idx] = density;
-
-  gridData_g[idx1] = pt1;
-  sampleDensity_g[idx1] = density1;
-
-  gridData_g[idx2] = pt2;
-  sampleDensity_g[idx2] = density2;
-
-  gridData_g[idx3] = pt3;
-  sampleDensity_g[idx3] = density3;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/Makefile b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/Makefile
deleted file mode 100644
index 5ffd076cba..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=cuda
-SRCDIR_OBJS=CPU_kernels.o CUDA_interface.o scanLargeArray.o sort.o main.o
-
-APP_CUDACFLAGS=-arch=compute_20
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/UDTypes.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/UDTypes.h
deleted file mode 100644
index 687fb50157..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/UDTypes.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef _UDTYPES_H__
-#define _UDTYPES_H__
-
-typedef struct{
-  int numSamples;
-  int aquisitionMatrixSize[3];
-  int reconstructionMatrixSize[3];
-  float kMax[3];
-  int gridSize[3];
-  float oversample;
-  float kernelWidth;
-  int binsize;
-  int useLUT;
-}parameters;
-
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-typedef struct{
-  float real;
-  float imag;
-} cmplx;
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/main.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/main.cu
deleted file mode 100644
index ebde43ebef..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/main.cu
+++ /dev/null
@@ -1,246 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <cuda.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "CUDA_interface.h"
-#include "CPU_kernels.h"
-
-#define PI 3.14159265
-
-#define CUERR \
-  do { \
-    cudaError_t err; \
-    if ((err = cudaGetLastError()) != cudaSuccess) { \
-      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-      return 0; \
-    } \
-  } while (0)
-/************************************************************ 
- * This function reads the parameters from the file provided
- * as a comman line argument.
- ************************************************************/
-void setParameters(FILE* file, parameters* p){
-  fscanf(file,"aquisition.numsamples=%d\n",&(p->numSamples));
-  fscanf(file,"aquisition.kmax=%f %f %f\n",&(p->kMax[0]), &(p->kMax[1]), &(p->kMax[2]));
-  fscanf(file,"aquisition.matrixSize=%d %d %d\n", &(p->aquisitionMatrixSize[0]), &(p->aquisitionMatrixSize[1]), &(p->aquisitionMatrixSize[2]));
-  fscanf(file,"reconstruction.matrixSize=%d %d %d\n", &(p->reconstructionMatrixSize[0]), &(p->reconstructionMatrixSize[1]), &(p->reconstructionMatrixSize[2]));
-  fscanf(file,"gridding.matrixSize=%d %d %d\n", &(p->gridSize[0]), &(p->gridSize[1]), &(p->gridSize[2]));
-  fscanf(file,"gridding.oversampling=%f\n", &(p->oversample));
-  fscanf(file,"kernel.width=%f\n", &(p->kernelWidth));
-  fscanf(file,"kernel.useLUT=%d\n", &(p->useLUT));
-
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  printf("  Total amount of GPU memory: %llu bytes\n", (unsigned long long) deviceProp.totalGlobalMem);
-  printf("  Number of samples = %d\n", p->numSamples);
-  if (p->numSamples > 10000000 && deviceProp.totalGlobalMem/1024/1024 < 3000) {
-    printf("  Need at least 3GB of GPU memory for large dataset\n");
-    exit(1);
-  }
-  printf("  Grid Size = %dx%dx%d\n", p->gridSize[0], p->gridSize[1], p->gridSize[2]);
-  printf("  Input Matrix Size = %dx%dx%d\n", p->aquisitionMatrixSize[0], p->aquisitionMatrixSize[1], p->aquisitionMatrixSize[2]);
-  printf("  Recon Matrix Size = %dx%dx%d\n", p->reconstructionMatrixSize[0], p->reconstructionMatrixSize[1], p->reconstructionMatrixSize[2]);
-  printf("  Kernel Width = %f\n", p->kernelWidth);
-  printf("  KMax = %.2f %.2f %.2f\n", p->kMax[0], p->kMax[1], p->kMax[2]);
-  printf("  Oversampling = %f\n", p->oversample);
-  printf("  GPU Binsize = %d\n", p->binsize);
-  printf("  Use LUT = %s\n", (p->useLUT)?"Yes":"No");
-  
-}
-
-/************************************************************ 
- * This function reads the sample point data from the kspace
- * and klocation files (and sdc file if provided) into the
- * sample array.
- * Returns the number of samples read successfully.
- ************************************************************/
-unsigned int readSampleData(parameters params, FILE* uksdata_f, ReconstructionSample* samples){
-  unsigned int i;
-
-  for(i=0; i<params.numSamples; i++){
-    if (feof(uksdata_f)){
-      break;
-    }
-    fread((void*) &(samples[i]), sizeof(ReconstructionSample), 1, uksdata_f);
-  }
-
-  float kScale[3];
-  kScale[0] = float(params.aquisitionMatrixSize[0])/(float(params.reconstructionMatrixSize[0])*float(params.kMax[0]));
-  kScale[1] = float(params.aquisitionMatrixSize[1])/(float(params.reconstructionMatrixSize[1])*float(params.kMax[1]));
-  kScale[2] = float(params.aquisitionMatrixSize[2])/(float(params.reconstructionMatrixSize[2])*float(params.kMax[2]));
-
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-
-  float ax = (kScale[0]*(size_x-1))/2.0;
-  float bx = (float)(size_x-1)/2.0;
-
-  float ay = (kScale[1]*(size_y-1))/2.0;
-  float by = (float)(size_y-1)/2.0;
-
-  float az = (kScale[2]*(size_z-1))/2.0;
-  float bz = (float)(size_z-1)/2.0;
-
-  for(int n=0; n<i; n++){
-    samples[n].kX = floor((samples[n].kX*ax)+bx);
-    samples[n].kY = floor((samples[n].kY*ay)+by);
-    samples[n].kZ = floor((samples[n].kZ*az)+bz);
-  }
-
-  return i;
-}
-
-
-int main (int argc, char* argv[]){
-  struct pb_Parameters* prms;
-  struct pb_TimerSet timers;
-
-  prms = pb_ReadParameters(&argc,argv);
-  pb_InitializeTimerSet(&timers);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  char uksdata[250];
-  parameters params;
-
-  FILE* uksfile_f = NULL;
-  FILE* uksdata_f = NULL;
-
-  strcpy(uksdata,prms->inpFiles[0]);
-  strcat(uksdata,".data");
-
-  uksfile_f = fopen(prms->inpFiles[0],"r");
-  if (uksfile_f == NULL){
-    printf("ERROR: Could not open %s\n",prms->inpFiles[0]);
-    exit(1);
-  }
-
-  printf("\nReading parameters\n");
-
-  if (argc >= 2){
-    params.binsize = atoi(argv[1]);
-  } else { //default binsize value;
-    params.binsize = 128;
-  }
-
-  setParameters(uksfile_f, &params);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  ReconstructionSample* samples; //Input Data
-  float* LUT; //use look-up table for faster execution on CPU (intermediate data)
-  unsigned int sizeLUT; //set in the function calculateLUT (intermediate data)
-
-  cmplx* gridData; //Output Data
-  float* sampleDensity; //Output Data
-
-  cmplx* gridData_gold; //Gold Output Data
-  float* sampleDensity_gold; //Gold Output Data
-
-  cudaMallocHost((void**)&samples, params.numSamples*sizeof(ReconstructionSample));
-  CUERR;
-  if (samples == NULL){
-    printf("ERROR: Unable to allocate memory for input data\n");
-    exit(1);
-  }
-
-  uksdata_f = fopen(uksdata,"rb");
-
-  if(uksdata_f == NULL){
-    printf("ERROR: Could not open data file\n");
-    exit(1);
-  }
-
-  printf("Reading input data from files\n");
-
-  unsigned int n = readSampleData(params, uksdata_f, samples);
-  fclose(uksdata_f);
-
-  if (params.useLUT){
-    printf("Generating Look-Up Table\n");
-    float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-    calculateLUT(beta, params.kernelWidth, &LUT, &sizeLUT);
-  }
-
-  int gridNumElems = params.gridSize[0] * params.gridSize[1] * params.gridSize[2];
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  gridData_gold = (cmplx*) calloc (gridNumElems, sizeof(cmplx));
-  sampleDensity_gold = (float*) calloc (gridNumElems, sizeof(float));
-  if (sampleDensity_gold == NULL || gridData_gold == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  printf("Running gold version\n");
-
-  gridding_Gold(n, params, samples, LUT, sizeLUT, gridData_gold, sampleDensity_gold);
-
-  cudaMallocHost((void**)&gridData, gridNumElems*sizeof(cmplx));
-  cudaMallocHost((void**)&sampleDensity, gridNumElems*sizeof(float));
-  CUERR;
-  if (sampleDensity == NULL || gridData == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  printf("Running CUDA version\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  //Interface function to GPU implementation of gridding
-  CUDA_interface(&timers, n, params, samples, LUT, sizeLUT, gridData, sampleDensity);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  int passed=1;
-  for (int i=0; i<gridNumElems; i++){
-    if(sampleDensity[i] != sampleDensity_gold[i]) {
-      passed=0;
-      break;
-    }
-  }
-  //(passed) ? printf("Comparing GPU and Gold results... PASSED\n"):printf("Comparing GPU and Gold results... FAILED\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  FILE* outfile;
-  if(!(outfile=fopen(prms->outFile,"w")))
-  {
-        printf("Cannot open output file!\n");
-  } else {
-        fwrite(&passed,sizeof(int),1,outfile);
-        fclose(outfile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  if (params.useLUT){
-    free(LUT);
-  }
-  cudaFreeHost(samples);
-  cudaFreeHost(gridData);
-  cudaFreeHost(sampleDensity);
-  free(gridData_gold);
-  free(sampleDensity_gold);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(prms);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/scanLargeArray.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/scanLargeArray.cu
deleted file mode 100644
index 1066d5982c..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/scanLargeArray.cu
+++ /dev/null
@@ -1,267 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-
-#define BLOCK_SIZE 1024
-#define GRID_SIZE 65535
-#define NUM_BANKS 16
-#define LOG_NUM_BANKS 4
-
-#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS + (index) >> (2*LOG_NUM_BANKS))
-#define EXPANDED_SIZE(__x) (__x+(__x>>LOG_NUM_BANKS)+(__x>>(2*LOG_NUM_BANKS)))
-
-////////////////////////////////////////////////////////////////////////////////
-// Kernels
-////////////////////////////////////////////////////////////////////////////////
-__global__ void scan_L1_kernel(unsigned int n, unsigned int* data, unsigned int* inter)
-{
-    __shared__ unsigned int s_data[EXPANDED_SIZE(BLOCK_SIZE)]; 
-
-    unsigned int thid = threadIdx.x;
-    unsigned int g_ai = blockIdx.x*2*blockDim.x + threadIdx.x;
-    unsigned int g_bi = g_ai + blockDim.x;
-
-    unsigned int s_ai = thid;
-    unsigned int s_bi = thid + blockDim.x;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = (g_ai < n) ? data[g_ai] : 0;
-    s_data[s_bi] = (g_bi < n) ? data[g_bi] : 0;
-
-    unsigned int stride = 1;
-    for (unsigned int d = blockDim.x; d > 0; d >>= 1)
-    {
-        __syncthreads();
-
-        if (thid < d)
-        {
-            unsigned int i  = 2*stride*thid;
-            unsigned int ai = i + stride - 1;
-            unsigned int bi = ai + stride;
-
-            ai += CONFLICT_FREE_OFFSET(ai);
-            bi += CONFLICT_FREE_OFFSET(bi);
-
-            s_data[bi] += s_data[ai];
-        }
-
-        stride *= 2;
-    }
-
-    if (thid == 0){
-        unsigned int last = blockDim.x*2 -1;
-        last += CONFLICT_FREE_OFFSET(last);
-        inter[blockIdx.x] = s_data[last];
-        s_data[last] = 0;
-    }
-
-    for (unsigned int d = 1; d <= blockDim.x; d *= 2)
-    {
-        stride >>= 1;
-
-        __syncthreads();
-
-        if (thid < d)
-        {
-            unsigned int i  = 2*stride*thid;
-            unsigned int ai = i + stride - 1;
-            unsigned int bi = ai + stride;
-
-            ai += CONFLICT_FREE_OFFSET(ai);
-            bi += CONFLICT_FREE_OFFSET(bi);
-
-            unsigned int t  = s_data[ai];
-            s_data[ai] = s_data[bi];
-            s_data[bi] += t;
-        }
-    }
-    __syncthreads();
-
-    if (g_ai < n) { data[g_ai] = s_data[s_ai]; }
-    if (g_bi < n) { data[g_bi] = s_data[s_bi]; }
-}
-
-__global__ void scan_inter1_kernel(unsigned int* data, unsigned int iter)
-{
-    extern __shared__ unsigned int s_data[];
-
-    unsigned int thid = threadIdx.x;
-    unsigned int gthid = (blockIdx.x*blockDim.x + threadIdx.x);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = 1;
-    for (unsigned int d = blockDim.x; d > 0; d >>= 1)
-    {
-        __syncthreads();
-
-        if (thid < d)
-        {
-            unsigned int i  = 2*stride*thid;
-            unsigned int ai = i + stride - 1;
-            unsigned int bi = ai + stride;
-
-            ai += CONFLICT_FREE_OFFSET(ai);
-            bi += CONFLICT_FREE_OFFSET(bi);
-            s_data[bi] += s_data[ai];
-        }
-
-        stride *= 2;
-    }
-
-    __syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-__global__ void scan_inter2_kernel(unsigned int* data, unsigned int iter)
-{
-    extern __shared__ unsigned int s_data[];
-
-    unsigned int thid = threadIdx.x;
-    unsigned int gthid = (blockIdx.x*blockDim.x + threadIdx.x);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = blockDim.x*2;
-
-    for (unsigned int d = 1; d <= blockDim.x; d *= 2)
-    {
-        stride >>= 1;
-
-        __syncthreads();
-
-        if (thid < d)
-        {
-            unsigned int i  = 2*stride*thid;
-            unsigned int ai = i + stride - 1;
-            unsigned int bi = ai + stride;
-
-            ai += CONFLICT_FREE_OFFSET(ai);
-            bi += CONFLICT_FREE_OFFSET(bi);
-
-            unsigned int t  = s_data[ai];
-            s_data[ai] = s_data[bi];
-            s_data[bi] += t;
-        }
-    }
-    __syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-__global__ void uniformAdd(unsigned int n, unsigned int *data, unsigned int *inter)
-{
-
-    __shared__ unsigned int uni;
-    if (threadIdx.x == 0) { uni = inter[blockIdx.x]; }
-    __syncthreads();
-
-    unsigned int g_ai = blockIdx.x*2*blockDim.x + threadIdx.x;
-    unsigned int g_bi = g_ai + blockDim.x;
-
-    if (g_ai < n) { data[g_ai] += uni; }
-    if (g_bi < n) { data[g_bi] += uni; }
-}
-
-void scanLargeArray( unsigned int gridNumElements, unsigned int* data_d) {
-    unsigned int gridNumElems = gridNumElements;    
-
-    // allocate device memory input and output arrays
-    unsigned int* inter_d = NULL;
-
-    // Run the prescan
-    unsigned int size = (gridNumElems+BLOCK_SIZE-1)/BLOCK_SIZE;
-
-    unsigned int dim_block;
-    unsigned int current_max = size*BLOCK_SIZE;
-    for (int block_size = 128; block_size <= BLOCK_SIZE; block_size *= 2){
-      unsigned int array_size = block_size;
-      while(array_size < size){
-        array_size *= block_size;
-      }
-      if (array_size <= current_max){
-        current_max = array_size;
-        dim_block = block_size;
-      }
-    }
-
-    cudaMalloc( (void**) &inter_d, current_max*sizeof(unsigned int));
-    cudaMemset (inter_d, 0, current_max*sizeof(unsigned int));
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++){
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*BLOCK_SIZE)) > (GRID_SIZE*BLOCK_SIZE)) ? (GRID_SIZE*BLOCK_SIZE) : (gridNumElems-(i*GRID_SIZE*BLOCK_SIZE));
-
-        dim3 block (BLOCK_SIZE/2);
-        dim3 grid (gridSize);
-        scan_L1_kernel<<<grid, block>>>(numElems, data_d+(i*GRID_SIZE*BLOCK_SIZE), inter_d+(i*GRID_SIZE));
-    }
-
-    unsigned int stride = 1;
-    for (unsigned int d = current_max; d > 1; d /= dim_block)
-    {
-        dim3 block (dim_block/2);
-        dim3 grid (d/dim_block);
-
-        scan_inter1_kernel<<<grid, block, EXPANDED_SIZE(dim_block)*sizeof(unsigned int)>>>(inter_d, stride);
-
-        stride *= dim_block;
-    }
-
-    cudaMemset(&(inter_d[current_max-1]), 0, sizeof(unsigned int));
-
-    for (unsigned int d = dim_block; d <= current_max; d *= dim_block)
-    {
-        stride /= dim_block;
-        dim3 block (dim_block/2);
-        dim3 grid (d/dim_block);
-
-        scan_inter2_kernel<<<grid, block, EXPANDED_SIZE(dim_block)*sizeof(unsigned int)>>>(inter_d, stride);
-    }
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++){
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*BLOCK_SIZE)) > (GRID_SIZE*BLOCK_SIZE)) ? (GRID_SIZE*BLOCK_SIZE) : (gridNumElems-(i*GRID_SIZE*BLOCK_SIZE));
-
-        dim3 block (BLOCK_SIZE/2);
-        dim3 grid (gridSize);
-
-        uniformAdd<<<grid, block>>>(numElems, data_d+(i*GRID_SIZE*BLOCK_SIZE), inter_d+(i*GRID_SIZE));
-    }
-
-    cudaFree(inter_d);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/scanLargeArray.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/scanLargeArray.h
deleted file mode 100644
index a413fd6b7c..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/scanLargeArray.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-void scanLargeArray(unsigned int gridNumElements, unsigned int* data_d);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/sort.cu b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/sort.cu
deleted file mode 100644
index c071eb87fb..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/sort.cu
+++ /dev/null
@@ -1,254 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <cuda.h>
-#include <stdio.h>
-
-#include "scanLargeArray.h"
-
-#define UINT32_MAX 4294967295
-#define BITS 4
-#define LNB 4
-
-#define SORT_BS 256
-
-#define CONFLICT_FREE_OFFSET(index) ((index) >> LNB + (index) >> (2*LNB))
-#define BLOCK_P_OFFSET (4*SORT_BS+1+(4*SORT_BS+1)/16+(4*SORT_BS+1)/64)
-
-__device__ void scan (unsigned int s_data[BLOCK_P_OFFSET]){
-  unsigned int thid = threadIdx.x;
-
-  __syncthreads();
-
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*(blockDim.x+thid)+1+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid)+1)] += s_data[2*(blockDim.x+thid)+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid))];
-
-  unsigned int stride = 2;
-  for (unsigned int d = blockDim.x; d > 0; d >>= 1)
-  {
-    __syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      s_data[bi] += s_data[ai];
-    }
-
-    stride *= 2;
-  }
-
-  if (thid == 0){
-    unsigned int last = 4*blockDim.x-1;
-    last += CONFLICT_FREE_OFFSET(last);
-    s_data[4*blockDim.x+CONFLICT_FREE_OFFSET(4*blockDim.x)] = s_data[last];
-    s_data[last] = 0;
-  }
-
-  for (unsigned int d = 1; d <= blockDim.x; d *= 2)
-  {
-    stride >>= 1;
-
-    __syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      unsigned int t  = s_data[ai];
-      s_data[ai] = s_data[bi];
-      s_data[bi] += t;
-    }
-  }
-  __syncthreads();
-
-  unsigned int temp = s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)] = s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)];
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += temp;
-
-  unsigned int temp2 = s_data[2*(blockDim.x+thid)+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid))];
-  s_data[2*(blockDim.x+thid)+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid))] = s_data[2*(blockDim.x+thid)+1+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid)+1)];
-  s_data[2*(blockDim.x+thid)+1+CONFLICT_FREE_OFFSET(2*(blockDim.x+thid)+1)] += temp2;
-
-  __syncthreads();
-}
-
-__global__ static void splitSort(int numElems, int iter, unsigned int* keys, unsigned int* values, unsigned int* histo)
-{
-    __shared__ unsigned int flags[BLOCK_P_OFFSET];
-    __shared__ unsigned int histo_s[1<<BITS];
-
-    const unsigned int tid = threadIdx.x;
-    const unsigned int gid = blockIdx.x*4*SORT_BS+4*threadIdx.x;
-
-    // Copy input to shared mem. Assumes input is always even numbered
-    uint4 lkey = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX};
-    uint4 lvalue;
-    if (gid < numElems){
-      lkey = *((uint4*)(keys+gid));
-      lvalue = *((uint4*)(values+gid));
-    }
-
-    if(tid < (1<<BITS)){
-      histo_s[tid] = 0;
-    }
-    __syncthreads();
-
-    atomicAdd(histo_s+((lkey.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atomicAdd(histo_s+((lkey.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atomicAdd(histo_s+((lkey.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atomicAdd(histo_s+((lkey.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-
-    uint4 index = {4*tid, 4*tid+1, 4*tid+2, 4*tid+3};
-
-    for (int i=BITS*iter; i<BITS*(iter+1);i++){
-      const uint4 flag = {(lkey.x>>i)&0x1,(lkey.y>>i)&0x1,(lkey.z>>i)&0x1,(lkey.w>>i)&0x1};
-
-      flags[index.x+CONFLICT_FREE_OFFSET(index.x)] = 1<<(16*flag.x);
-      flags[index.y+CONFLICT_FREE_OFFSET(index.y)] = 1<<(16*flag.y);
-      flags[index.z+CONFLICT_FREE_OFFSET(index.z)] = 1<<(16*flag.z);
-      flags[index.w+CONFLICT_FREE_OFFSET(index.w)] = 1<<(16*flag.w);
-
-      scan (flags);
-
-      index.x = (flags[index.x+CONFLICT_FREE_OFFSET(index.x)]>>(16*flag.x))&0xFFFF;
-      index.y = (flags[index.y+CONFLICT_FREE_OFFSET(index.y)]>>(16*flag.y))&0xFFFF;
-      index.z = (flags[index.z+CONFLICT_FREE_OFFSET(index.z)]>>(16*flag.z))&0xFFFF;
-      index.w = (flags[index.w+CONFLICT_FREE_OFFSET(index.w)]>>(16*flag.w))&0xFFFF;
-
-      unsigned short offset = flags[4*blockDim.x+CONFLICT_FREE_OFFSET(4*blockDim.x)]&0xFFFF;
-      index.x += (flag.x) ? offset : 0;
-      index.y += (flag.y) ? offset : 0;
-      index.z += (flag.z) ? offset : 0;
-      index.w += (flag.w) ? offset : 0;
-
-      __syncthreads();
-    }
-
-    // Write result.
-    if (gid < numElems){
-      keys[blockIdx.x*4*SORT_BS+index.x] = lkey.x;
-      keys[blockIdx.x*4*SORT_BS+index.y] = lkey.y;
-      keys[blockIdx.x*4*SORT_BS+index.z] = lkey.z;
-      keys[blockIdx.x*4*SORT_BS+index.w] = lkey.w;
-
-      values[blockIdx.x*4*SORT_BS+index.x] = lvalue.x;
-      values[blockIdx.x*4*SORT_BS+index.y] = lvalue.y;
-      values[blockIdx.x*4*SORT_BS+index.z] = lvalue.z;
-      values[blockIdx.x*4*SORT_BS+index.w] = lvalue.w;
-    }
-    if (tid < (1<<BITS)){
-      histo[gridDim.x*threadIdx.x+blockIdx.x] = histo_s[tid];
-    }
-}
-
-__global__ void splitRearrange (int numElems, int iter, unsigned int* keys_i, unsigned int* keys_o, unsigned int* values_i, unsigned int* values_o, unsigned int* histo){
-  __shared__ unsigned int histo_s[(1<<BITS)];
-  __shared__ unsigned int array_s[4*SORT_BS];
-  int index = blockIdx.x*4*SORT_BS + 4*threadIdx.x;
-
-  if (threadIdx.x < (1<<BITS)){
-    histo_s[threadIdx.x] = histo[gridDim.x*threadIdx.x+blockIdx.x];
-  }
-
-  uint4 mine, value;
-  if (index < numElems){
-    mine = *((uint4*)(keys_i+index));
-    value = *((uint4*)(values_i+index));
-  } else {
-    mine.x = UINT32_MAX;
-    mine.y = UINT32_MAX;
-    mine.z = UINT32_MAX;
-    mine.w = UINT32_MAX;
-  }
-  uint4 masks = {(mine.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)};
-
-  ((uint4*)array_s)[threadIdx.x] = masks;
-  __syncthreads();
-
-  uint4 new_index = {histo_s[masks.x],histo_s[masks.y],histo_s[masks.z],histo_s[masks.w]};
-
-  int i = 4*threadIdx.x-1;
-  while (i >= 0){
-    if (array_s[i] == masks.x){
-      new_index.x++;
-      i--;
-    } else {
-      break;
-    }
-  }
-
-  new_index.y = (masks.y == masks.x) ? new_index.x+1 : new_index.y;
-  new_index.z = (masks.z == masks.y) ? new_index.y+1 : new_index.z;
-  new_index.w = (masks.w == masks.z) ? new_index.z+1 : new_index.w;
-
-  if (index < numElems){
-    keys_o[new_index.x] = mine.x;
-    values_o[new_index.x] = value.x;
-
-    keys_o[new_index.y] = mine.y;
-    values_o[new_index.y] = value.y;
-
-    keys_o[new_index.z] = mine.z;
-    values_o[new_index.z] = value.z;
-
-    keys_o[new_index.w] = mine.w;
-    values_o[new_index.w] = value.w;
-  }
-}
-
-void sort (int numElems, unsigned int max_value, unsigned int* &dkeys, unsigned int* &dvalues){
-  dim3 grid ((numElems+4*SORT_BS-1)/(4*SORT_BS));
-  dim3 block (SORT_BS);
-
-  unsigned int iterations = 0;
-  while(max_value > 0){
-    max_value >>= BITS;
-    iterations++;
-  }
-
-  unsigned int *dhisto;
-  unsigned int *dkeys_o, *dvalues_o;
-
-  cudaMalloc((void**)&dhisto, (1<<BITS)*grid.x*sizeof(unsigned int));
-  cudaMalloc((void**)&dkeys_o, numElems*sizeof(unsigned int));
-  cudaMalloc((void**)&dvalues_o, numElems*sizeof(unsigned int));
-
-  for (int i=0; i<iterations; i++){
-    splitSort<<<grid,block>>>(numElems, i, dkeys, dvalues, dhisto);
-
-    scanLargeArray(grid.x*(1<<BITS), dhisto);
-
-    splitRearrange<<<grid,block>>>(numElems, i, dkeys, dkeys_o, dvalues, dvalues_o, dhisto);
-
-    unsigned int* temp = dkeys;
-    dkeys = dkeys_o;
-    dkeys_o = temp;
-
-    temp = dvalues;
-    dvalues = dvalues_o;
-    dvalues_o = temp;
-  }
-
-  cudaFree(dkeys_o);
-  cudaFree(dvalues_o);
-  cudaFree(dhisto);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/sort.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/sort.h
deleted file mode 100644
index 3d150bb2fb..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/cuda/sort.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-void sort (int numElems, unsigned int maxValue, unsigned int* &dkeys, unsigned int* &dvalues);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/CPU_kernels.c b/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/CPU_kernels.c
deleted file mode 100644
index 7e8862e688..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/CPU_kernels.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "UDTypes.h"
-
-#define max(x,y) ((x<y)?y:x)
-#define min(x,y) ((x>y)?y:x)
-
-#define PI 3.14159265359
-
-float kernel_value_CPU(float v){
-
-  float rValue = 0;
-
-  const float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-  (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-   0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-   0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-   0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-   0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-   0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-   0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-   0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = -num/den;
-
-  return rValue;
-}
-
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT){
-  float v;
-  float cutoff2 = (width*width)/4.0;
-
-  unsigned int size;
-
-  if(width > 0){
-    // compute size of LUT based on kernel width
-    size = (unsigned int)(10000*width);
-
-    // allocate memory
-    (*LUT) = (float*) malloc (size*sizeof(float));
-
-    unsigned int k;
-
-#pragma omp parallel for private(v)
-    for(k=0; k<size; ++k){
-      // compute value to evaluate kernel at
-      // v in the range 0:(_width/2)^2
-      v = (((float)k)/((float)size))*cutoff2;
-
-      // compute kernel value and store
-      (*LUT)[k] = kernel_value_CPU(beta*sqrt(1.0-(v/cutoff2)));
-    }
-    (*sizeLUT) = size;
-  }
-}
-
-float kernel_value_LUT(float v, float* LUT, int sizeLUT, float _1overCutoff2)
-{
-  unsigned int k0;
-  float v0;
-
-  v *= (float)sizeLUT;
-  k0=(unsigned int)(v*_1overCutoff2);
-  v0 = ((float)k0)/_1overCutoff2;
-  return  LUT[k0] + ((v-v0)*(LUT[k0+1]-LUT[k0])/_1overCutoff2);
-}
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity){
-
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  float cutoff = ((float)(params.kernelWidth))/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                    // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;                  // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int i;
-
-#pragma omp parallel for private(NxL, NxH, NyL, NyH, NzL, NzH, dz2, nz, dx2,         \
-				 nx, dy2, ny, idxZ, idxY, dy2dz2, idx0, v, idx, w)
-
-  for (i=0; i < n; i++){
-    ReconstructionSample pt = sample[i];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0.0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0.0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0.0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* kernel weighting value */
-                  if (params.useLUT){
-        		    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-		          } else {
-		            w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-		          }
-
-                  /* grid data */
-#pragma omp critical (c1)
-                  gridData[idx].real += (w*pt.real);
-#pragma omp critical (c2)
-                  gridData[idx].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-#pragma omp critical (c3)
-                  sampleDensity[idx] += 1.0;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/CPU_kernels.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/CPU_kernels.h
deleted file mode 100644
index 42d40a0373..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/CPU_kernels.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "UDTypes.h"
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT);
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity);
-
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/Makefile b/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/Makefile
deleted file mode 100644
index 4b1e7501f4..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=c
-SRCDIR_OBJS=CPU_kernels.o main.o
-APP_CFLAGS = -fopenmp
-APP_LDFLAGS = -lgomp
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/UDTypes.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/UDTypes.h
deleted file mode 100644
index 687fb50157..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/UDTypes.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef _UDTYPES_H__
-#define _UDTYPES_H__
-
-typedef struct{
-  int numSamples;
-  int aquisitionMatrixSize[3];
-  int reconstructionMatrixSize[3];
-  float kMax[3];
-  int gridSize[3];
-  float oversample;
-  float kernelWidth;
-  int binsize;
-  int useLUT;
-}parameters;
-
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-typedef struct{
-  float real;
-  float imag;
-} cmplx;
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/main.c
deleted file mode 100644
index 12447e56fe..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/omp_base/main.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <parboil.h>
-
-#include "UDTypes.h"
-#include "CPU_kernels.h"
-
-#define PI 3.14159265
-
-/************************************************************ 
- * This function reads the parameters from the file provided
- * as a comman line argument.
- ************************************************************/
-void setParameters(FILE* file, parameters* p){
-  fscanf(file,"aquisition.numsamples=%d\n",&(p->numSamples));
-  fscanf(file,"aquisition.kmax=%f %f %f\n",&(p->kMax[0]), &(p->kMax[1]), &(p->kMax[2]));
-  fscanf(file,"aquisition.matrixSize=%d %d %d\n", &(p->aquisitionMatrixSize[0]), &(p->aquisitionMatrixSize[1]), &(p->aquisitionMatrixSize[2]));
-  fscanf(file,"reconstruction.matrixSize=%d %d %d\n", &(p->reconstructionMatrixSize[0]), &(p->reconstructionMatrixSize[1]), &(p->reconstructionMatrixSize[2]));
-  fscanf(file,"gridding.matrixSize=%d %d %d\n", &(p->gridSize[0]), &(p->gridSize[1]), &(p->gridSize[2]));
-  fscanf(file,"gridding.oversampling=%f\n", &(p->oversample));
-  fscanf(file,"kernel.width=%f\n", &(p->kernelWidth));
-  fscanf(file,"kernel.useLUT=%d\n", &(p->useLUT));
-
-  printf("  Number of samples = %d\n", p->numSamples);
-  printf("  Grid Size = %dx%dx%d\n", p->gridSize[0], p->gridSize[1], p->gridSize[2]);
-  printf("  Input Matrix Size = %dx%dx%d\n", p->aquisitionMatrixSize[0], p->aquisitionMatrixSize[1], p->aquisitionMatrixSize[2]);
-  printf("  Recon Matrix Size = %dx%dx%d\n", p->reconstructionMatrixSize[0], p->reconstructionMatrixSize[1], p->reconstructionMatrixSize[2]);
-  printf("  Kernel Width = %f\n", p->kernelWidth);
-  printf("  KMax = %.2f %.2f %.2f\n", p->kMax[0], p->kMax[1], p->kMax[2]);
-  printf("  Oversampling = %f\n", p->oversample);
-  printf("  GPU Binsize = %d\n", p->binsize);
-  printf("  Use LUT = %s\n", (p->useLUT)?"Yes":"No");
-}
-
-/************************************************************ 
- * This function reads the sample point data from the kspace
- * and klocation files (and sdc file if provided) into the
- * sample array.
- * Returns the number of samples read successfully.
- ************************************************************/
-unsigned int readSampleData(parameters params, FILE* uksdata_f, ReconstructionSample* samples){
-  unsigned int i;
-
-  for(i=0; i<params.numSamples; i++){
-    if (feof(uksdata_f)){
-      break;
-    }
-    fread((void*) &(samples[i]), sizeof(ReconstructionSample), 1, uksdata_f);
-  }
-
-  float kScale[3];
-  kScale[0] = (float)(params.aquisitionMatrixSize[0])/((float)(params.reconstructionMatrixSize[0])*(float)(params.kMax[0]));
-  kScale[1] = (float)(params.aquisitionMatrixSize[1])/((float)(params.reconstructionMatrixSize[1])*(float)(params.kMax[1]));
-  kScale[2] = (float)(params.aquisitionMatrixSize[2])/((float)(params.reconstructionMatrixSize[2])*(float)(params.kMax[2]));
-
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-
-  float ax = (kScale[0]*(size_x-1))/2.0;
-  float bx = (float)(size_x-1)/2.0;
-
-  float ay = (kScale[1]*(size_y-1))/2.0;
-  float by = (float)(size_y-1)/2.0;
-
-  float az = (kScale[2]*(size_z-1))/2.0;
-  float bz = (float)(size_z-1)/2.0;
-
-  int n;
-  for(n=0; n<i; n++){
-    samples[n].kX = floor((samples[n].kX*ax)+bx);
-    samples[n].kY = floor((samples[n].kY*ay)+by);
-    samples[n].kZ = floor((samples[n].kZ*az)+bz);
-  }
-
-  return i;
-}
-
-
-int main (int argc, char* argv[]){
-  struct pb_Parameters* prms;
-  struct pb_TimerSet timers;
-
-  prms = pb_ReadParameters(&argc,argv);
-  pb_InitializeTimerSet(&timers);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  char uksdata[250];
-  parameters params;
-
-  FILE* uksfile_f = NULL;
-  FILE* uksdata_f = NULL;
-
-  strcpy(uksdata,prms->inpFiles[0]);
-  strcat(uksdata,".data");
-
-  uksfile_f = fopen(prms->inpFiles[0],"r");
-  if (uksfile_f == NULL){
-    printf("ERROR: Could not open %s\n",prms->inpFiles[0]);
-    exit(1);
-  }
-
-  printf("\nReading parameters\n");
-
-  if (argc >= 2){
-    params.binsize = atoi(argv[1]);
-  } else { //default binsize value;
-    params.binsize = 128;
-  }
-
-  setParameters(uksfile_f, &params);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  ReconstructionSample* samples = (ReconstructionSample*) malloc (params.numSamples*sizeof(ReconstructionSample)); //Input Data
-  float* LUT; //use look-up table for faster execution on CPU (intermediate data)
-  unsigned int sizeLUT; //set in the function calculateLUT (intermediate data)
-
-  int gridNumElems = params.gridSize[0]*params.gridSize[1]*params.gridSize[2];
-
-  cmplx* gridData = (cmplx*) calloc (gridNumElems, sizeof(cmplx)); //Output Data
-  float* sampleDensity = (float*) calloc (gridNumElems, sizeof(float)); //Output Data
-
-  if (samples == NULL){
-    printf("ERROR: Unable to allocate memory for input data\n");
-    exit(1);
-  }
-
-  if (sampleDensity == NULL || gridData == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  uksdata_f = fopen(uksdata,"rb");
-
-  if(uksdata_f == NULL){
-    printf("ERROR: Could not open data file\n");
-    exit(1);
-  }
-
-  printf("Reading input data from files\n");
-
-  unsigned int n = readSampleData(params, uksdata_f, samples);
-  fclose(uksdata_f);
-
-  if (params.useLUT){
-    printf("Generating Look-Up Table\n");
-    float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-    calculateLUT(beta, params.kernelWidth, &LUT, &sizeLUT);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  gridding_Gold(n, params, samples, LUT, sizeLUT, gridData, sampleDensity);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  int passed=1;
-
-  FILE* outfile;
-  if(!(outfile=fopen(prms->outFile,"w")))
-  {
-        printf("Cannot open output file!\n");
-  } else {
-        fwrite(&passed,sizeof(int),1,outfile);
-        fclose(outfile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  if (params.useLUT){
-    free(LUT);
-  }
-  free(samples);
-  free(gridData);
-  free(sampleDensity);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(prms);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/CPU_kernels.c b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/CPU_kernels.c
deleted file mode 100644
index 43614d9a5f..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/CPU_kernels.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "UDTypes.h"
-
-#define max(x,y) ((x<y)?y:x)
-#define min(x,y) ((x>y)?y:x)
-
-#define PI 3.14159265359
-
-float kernel_value_CPU(float v){
-
-  float rValue = 0;
-
-  const float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-  (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-   0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-   0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-   0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-   0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-   0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-   0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-   0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = -num/den;
-
-  return rValue;
-}
-
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT){
-  float v;
-  float cutoff2 = (width*width)/4.0;
-
-  unsigned int size;
-
-  if(width > 0){
-    // compute size of LUT based on kernel width
-    size = (unsigned int)(10000*width);
-
-    // allocate memory
-    (*LUT) = (float*) malloc (size*sizeof(float));
-
-    unsigned int k;
-    for(k=0; k<size; ++k){
-      // compute value to evaluate kernel at
-      // v in the range 0:(_width/2)^2
-      v = (((float)k)/((float)size))*cutoff2;
-
-      // compute kernel value and store
-      (*LUT)[k] = kernel_value_CPU(beta*sqrt(1.0-(v/cutoff2)));
-    }
-    (*sizeLUT) = size;
-  }
-}
-
-float kernel_value_LUT(float v, float* LUT, int sizeLUT, float _1overCutoff2)
-{
-  unsigned int k0;
-  float v0;
-
-  v *= (float)sizeLUT;
-  k0=(unsigned int)(v*_1overCutoff2);
-  v0 = ((float)k0)/_1overCutoff2;
-  return  LUT[k0] + ((v-v0)*(LUT[k0+1]-LUT[k0])/_1overCutoff2);
-}
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity){
-
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  float cutoff = (float)(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int i;
-  for (i=0; i < n; i++){
-    ReconstructionSample pt = sample[i];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0.0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0.0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0.0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* kernel weighting value */
-                  if (params.useLUT){
-        		    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-		          } else {
-		            w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-		          }
-
-                  /* grid data */
-                  gridData[idx].real += (w*pt.real);
-                  gridData[idx].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-                  sampleDensity[idx] += 1.0;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/CPU_kernels.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/CPU_kernels.h
deleted file mode 100644
index 1d883f00f7..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/CPU_kernels.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "UDTypes.h"
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT);
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity);
-
-int gridding_CPU(unsigned int n, parameters params, ReconstructionSample* sample, int* CPUbin, int CPUbin_size,
-                 float* LUT, int sizeLUT, cmplx* gridData[], float* sampleDensity[], int* indeces[]);
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/GPU_kernels.cl b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/GPU_kernels.cl
deleted file mode 100644
index 1c2328a7a1..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/GPU_kernels.cl
+++ /dev/null
@@ -1,176 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
- 
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable 
- 
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-#define TILE 64
-#define LOG_TILE 6
-
-__kernel void binning_kernel (unsigned int n, 
-                              __global ReconstructionSample* sample_g, 
-                              __global unsigned int* idxKey_g,
-                              __global unsigned int* idxValue_g, 
-                              __global unsigned int* binCount_g, 
-                              unsigned int binsize, unsigned int gridNumElems){
-  unsigned int key;
-  unsigned int sampleIdx = get_global_id(0); //blockIdx.x*blockDim.x+threadIdx.x;
-  ReconstructionSample pt;
-  unsigned int binIdx;
-  unsigned int count;
-
-  if (sampleIdx < n){
-    pt = sample_g[sampleIdx];
-    
-    binIdx = (unsigned int)(pt.kZ)*((int) ( SIZE_XY_VAL )) + (unsigned int)(pt.kY)*((int)( GRIDSIZE_VAL1 )) + (unsigned int)(pt.kX);
-
-    count = atom_add(binCount_g+binIdx, 1);
-    if (count < binsize){
-      key = binIdx;
-    } else {
-      atom_sub(binCount_g+binIdx, 1);
-      key = gridNumElems;
-    }
-
-    idxKey_g[sampleIdx] = key;
-    idxValue_g[sampleIdx] = sampleIdx;
-  }
-}
-
-__kernel void reorder_kernel(int n, 
-                               __global unsigned int* idxValue_g, 
-                               __global ReconstructionSample* samples_g, 
-                               __global ReconstructionSample* sortedSample_g){
-  unsigned int index = get_global_id(0); //blockIdx.x*blockDim.x + threadIdx.x;
-  unsigned int old_index;
-  ReconstructionSample pt;
-
-  if (index < n){
-    old_index = idxValue_g[index];
-    pt = samples_g[old_index];
-    sortedSample_g[index] = pt;
-  }
-}
-
-float kernel_value(float v){
-
-  float rValue = 0;
-
-  float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-                (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-                 0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-                 0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-                 0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-                 0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-                 0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-                 0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-                 0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = native_divide(-num,den);
-//rValue = (-1*num) / den;
-
-  return rValue;
-}
-
-__kernel void gridding_GPU (__global ReconstructionSample* sample_g, 
-                              __global unsigned int* binStartAddr_g, 
-                              __global float2* gridData_g, 
-                              __global float* sampleDensity_g, 
-                              float beta
-                              ){
-  __local ReconstructionSample sharedBin[TILE];
-
-  const int flatIdx = //get_global_id(0); // This does not work, at least as is
-  get_local_id(2)*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0);
-  //threadIdx.z*blockDim.y*blockDim.x+threadIdx.y*blockDim.x+threadIdx.x;
-
-  // figure out starting point of the tile
-  const int z0 = get_local_size(2)*(get_group_id(1)/(GRIDSIZE_VAL2/get_local_size(1)));
-  const int y0 = get_local_size(1)*(get_group_id(1)%(GRIDSIZE_VAL2/get_local_size(1)));
-  const int x0 = get_group_id(0)*get_local_size(0);
-
-  const int X  = x0+get_local_id(0);
-  const int Y  = y0+get_local_id(1);
-  const int Z  = z0+get_local_id(2);
-
-  const int xl = x0-CEIL_CUTOFF_VAL;
-  const int xL = (xl < 0) ? 0 : xl;
-  const int xh = x0+get_local_size(0)+CUTOFF_VAL;
-  const int xH = (xh >= GRIDSIZE_VAL1) ? GRIDSIZE_VAL1-1 : xh;
-
-  const int yl = y0-CEIL_CUTOFF_VAL;
-  const int yL = (yl < 0) ? 0 : yl;
-  const int yh = y0+get_local_size(1)+CUTOFF_VAL;
-  const int yH = (yh >= GRIDSIZE_VAL2) ? GRIDSIZE_VAL2-1 : yh;
-
-  const int zl = z0-CEIL_CUTOFF_VAL;
-  const int zL = (zl < 0) ? 0 : zl;
-  const int zh = z0+get_local_size(2)+CUTOFF_VAL;
-  const int zH = (zh >= GRIDSIZE_VAL3) ? GRIDSIZE_VAL3-1 : zh;
-
-  const int idx = Z*SIZE_XY_VAL + Y*GRIDSIZE_VAL1 + X;
-
-  float2 pt = (float2) (0.0f, 0.0f);
-  float density = 0.0f;
-  
-
-  for (int z = zL; z <= zH; z++){
-    for (int y = yL; y <= yH; y++){
-      __global const unsigned int *addr = binStartAddr_g+z*SIZE_XY_VAL+ y*GRIDSIZE_VAL1;
-      const unsigned int start = *(addr+xL);
-      const unsigned int end   = *(addr+xH+1);
-      const unsigned int delta = end-start;
-      for (int x = 0; x < ((delta+TILE-1)>>LOG_TILE); x++){
-        int tileSize = ((delta-(x<<LOG_TILE)) > TILE) ? TILE : (delta-(x<<LOG_TILE));
-        int globalIdx = flatIdx+(x<<LOG_TILE);
-        barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-        if(flatIdx < tileSize){
-          sharedBin[flatIdx] = sample_g[start+globalIdx];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-
-        for (int j=0; j< tileSize; j++){
-          const float real = sharedBin[j].real;
-          const float imag = sharedBin[j].imag;
-          const float sdc = sharedBin[j].sdc;
-
-          if((real != 0.0f || imag != 0.0f) && sdc != 0.0f){
-            float v = (sharedBin[j].kX-X)*(sharedBin[j].kX-X);
-            v += (sharedBin[j].kY-Y)*(sharedBin[j].kY-Y);
-            v += (sharedBin[j].kZ-Z)*(sharedBin[j].kZ-Z);
-            if(v<CUTOFF2_VAL){
-               const float w = kernel_value(beta*sqrt(1.0f-(v*ONE_OVER_CUTOFF2_VAL))) *sdc;
-              pt.x += w*real;
-              pt.y += w*imag;
-              density += 1.0f;
-              
-            }
-          }
-        }                
-      }
-    }
-  }
-
-  gridData_g[idx] = pt;
-  sampleDensity_g[idx] = density;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/Makefile b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/Makefile
deleted file mode 100644
index 46bafdc413..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=CPU_kernels.o main.o OpenCL_interface.o scanLargeArray.o sort.o OpenCL_common.o
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_common.cpp
deleted file mode 100644
index 57368eda9a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_common.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-
-
-#include "OpenCL_common.h"
-#include <string.h>
-
-// -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
-  cl_uint numEntries = 16;
-  cl_platform_id clPlatforms[numEntries];
-  cl_uint numPlatforms;
-  
-  cl_device_id clDevices[numEntries];
-  cl_uint numDevices;
-
-  OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
-  fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
-  bool needDevice = true;
-  
-  for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
-
-    cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
-      cl_device_id clDevice = clDevices[id];
-      cl_device_type clDeviceType;
-
-      bool canSatisfy = true;
-      
-      if (reqDeviceType != NULL) {
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
-        if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
-          if (*reqDeviceType != clDeviceType) {
-            canSatisfy = false;
-          }
-        }
-      }
-
-      va_list paramList;
-      va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
-        cl_bool clInfoBool;
-        size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
-        if (clInfoBool != true) {
-          canSatisfy = false;
-        }
-      }
-      
-      va_end(paramList);
-      if (canSatisfy) {
-        *device = clDevice;
-        *platform = clPlatform;
-        needDevice = false;
-        fprintf(stderr, "Chose Device Type: %s\n",
-          (clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other"
-          );
-        if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
-          *reqDeviceType = clDeviceType;
-        }
-      }
-    } // End checking all devices for a platform
-  } // End checking all platforms
-
-  int retVal = -1;
-  if (needDevice) {
-    retVal = -1;
-  } else {
-    retVal = 0;
-  }
-  
-  return retVal;
-}
-
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	  cl_uint maxMemAlloc = 0;
-	  OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_common.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_common.h
deleted file mode 100644
index b063d9c696..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_common.h
+++ /dev/null
@@ -1,26 +0,0 @@
-
-#ifndef __OPENCL_COMMON_H_
-#define __OPENCL_COMMON_H_
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <CL/cl.h>
-
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); }
-    
-#define OCL_SIMPLE_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_interface.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_interface.cpp
deleted file mode 100644
index b68d07b467..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_interface.cpp
+++ /dev/null
@@ -1,345 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <CL/cl.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "scanLargeArray.h"
-#include "CPU_kernels.h"
-
-#include "sort.h"
-#include "scanLargeArray.h"
-#include "OpenCL_common.h"
-
-
-#define BLOCKSIZE 512
-#define PI 3.14159265359
-
-/***********************************************************************
- * CUDA_interface is the main function for GPU execution. This
- * implementation uses compact binning to distribute input elements
- * into unit-cubed sized bins. The bins are then visited by GPU
- * threads, where every thread computes the value of one (or small set)
- * of output elements by computing the contributions of elements in 
- * neighboring bins to these output elements.
- *
- * The bins have a limited bin size and everything beyond that bin size
- * is offloaded to the CPU to be computed in parallel with the GPU
- * gridding.
- ***********************************************************************/
-void OpenCL_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity,  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-  cl_context clContext,
-  cl_command_queue clCommandQueue, //const cl_device clDevice
-  const cl_device_id clDevice,
-  size_t *workItemSizes // maximum size of work-items for each dimension
-){
-
-  /* Initializing all variables */
-  size_t blockSize = workItemSizes[0];
-  int dims[3] = {8,4,2}; //size of a gridding block on the GPU
-
-  /* x, y, z dimensions of the output grid (gridData) */
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-  int size_xy = size_y*size_x;
-
-  int gridNumElems = size_x * size_y * size_z;  // Total number of grid points
-  
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  float cutoff = float(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  /* Declarations of device data structures */
-  cl_int ciErrNum;
-  cl_mem sample_d;    // Device array for original input array
-  cl_mem sortedSample_d;             // Device array of the sorted (into bins) input elements.
-  
-                                            // This array is accessed by sortedSampleSoA_d in a structure
-                                            //   of arrays manner.
-  cl_mem gridData_d;                // Device array for output grid
-  cl_mem sampleDensity_d;            // Device array for output sample density
-  cl_mem idxKey_d;            // Array of bin indeces generated in the binning kernel
-                                            //   and used to sort the input elements into their
-                                            //   corresponding bins
-  cl_mem idxValue_d;          // This array holds the indices of input elements in the
-                                            //   the original array. This array is sorted using the
-                                            //   the idxKey_d array, and once sorted, it is used in
-                                            //   the reorder kernel to move the actual elements into
-                                            //   their corresponding bins.
-  //cl_mem binCount_d;          // Zero-initialized array which counts the number of elements
-                                            //   put in each bin. Based on this array, we determine which
-                                            //   elements get offloaded to the CPU
-  cl_mem binStartAddr_d;      // Array of start offset of each of the compact bins
-  
-  cl_mem *idxValue_dPtr;
-  cl_mem *idxKey_dPtr;
-  
-  cl_program gpu_kernels;
-  cl_kernel binning_kernel;
-  cl_kernel reorder_kernel;
-  cl_kernel gridding_GPU;
-
-  /* Allocating device memory */
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-  
-  unsigned int *zeroData = NULL, *maxIntData = NULL;
-  
-  size_t sizeZeroData = sizeof(float)* 2 * gridNumElems;
-  if ( n*sizeof(ReconstructionSample) > sizeZeroData) {
-    sizeZeroData = n*sizeof(ReconstructionSample);
-  }    
-  if ( (sizeof(unsigned int) * (gridNumElems+1)) > sizeZeroData) {
-    // Not going to be taken, but included just in case since this is used for multiple variables
-    sizeZeroData = sizeof(unsigned int) * (gridNumElems+1);
-  }
-  if ( (((n+3)/4)*4)*sizeof(unsigned int) > sizeZeroData) {
-    sizeZeroData = (((n+3)/4)*4)*sizeof(unsigned int);
-  }
-  
-  zeroData = (unsigned int *) malloc(sizeZeroData);
-  if (zeroData == NULL) { fprintf(stderr, "Could not allocate dummy memset memory\n"); exit(1); }
-  maxIntData = (unsigned int *) malloc((((n+3)/4)*4)*sizeof(unsigned int));
-  if (maxIntData == NULL) { fprintf(stderr, "Could not allocate dummy memset memory\n"); exit(1); }
-  
-  memset(zeroData, 0, sizeZeroData);
-  // Initialize padding to max integer value, so that when sorted,
-  // these elements get pushed to the end of the array.
-  memset(maxIntData+n, 0xFF, (((n+3)&~(3))-n)*sizeof(unsigned int));
-
-  sortedSample_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n*sizeof(ReconstructionSample), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  binStartAddr_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (gridNumElems+1)*sizeof(unsigned int), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  sample_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n*sizeof(ReconstructionSample), sample, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  idxKey_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (((n+3)/4)*4)*sizeof(unsigned int), maxIntData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum); //Pad to nearest multiple of 4 to 
-  idxValue_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (((n+3)/4)*4)*sizeof(unsigned int), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum); //satisfy a property of the sorting kernel.
-  
-  idxKey_dPtr = &idxKey_d;
-  idxValue_dPtr = &idxValue_d;
-  
-  pb_SwitchToTimer(timers, pb_TimerID_DRIVER);
-
-  char compileOptions[1024];
-  //                -cl-nv-verbose // Provides register info for NVIDIA devices
-  // Set all Macros referenced by kernels
-  sprintf(compileOptions, "\
-                -D CUTOFF2_VAL=%f -D CUTOFF_VAL=%f -D CEIL_CUTOFF_VAL=%f\
-                -D GRIDSIZE_VAL1=%d -D GRIDSIZE_VAL2=%d -D GRIDSIZE_VAL3=%d\
-                -D SIZE_XY_VAL=%d -D ONE_OVER_CUTOFF2_VAL=%f",
-                cutoff2, cutoff, ceil(cutoff),
-                params.gridSize[0], params.gridSize[1], params.gridSize[2],
-                size_xy, _1overCutoff2
-            );
-  
-  size_t program_length;
-  const char *source_path = "src/opencl_base/GPU_kernels.cl";
-  char *source;
-
-  // Dynamically allocate buffer for source
-  source = oclLoadProgSource(source_path, "", &program_length);
-  if(!source) {
-    fprintf(stderr, "Could not load program source (%s) \n", __FILE__); exit(1);
-  }
-  	
-  gpu_kernels = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  	  	
-  free(source);
-  
-  OCL_ERRCK_RETVAL ( clBuildProgram(gpu_kernels, 1, &clDevice, compileOptions, NULL, NULL) );
-  
-  /*
-  // Uncomment to view build log from compiler for debugging
-  char *build_log;
-  size_t ret_val_size;
-  ciErrNum = clGetProgramBuildInfo(gpu_kernels, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-  build_log = (char *)malloc(ret_val_size+1);
-  ciErrNum = clGetProgramBuildInfo(gpu_kernels, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-  OCL_ERRCK_VAR(ciErrNum);
-       	
-  // to be carefully, terminate with \0
-  // there's no information in the reference whether the string is 0 terminated or not
-  build_log[ret_val_size] = '\0';
-
-  fprintf(stderr, "%s\n", build_log );
-  */
-  
-  
-  binning_kernel = clCreateKernel(gpu_kernels, "binning_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  reorder_kernel = clCreateKernel(gpu_kernels, "reorder_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  gridding_GPU = clCreateKernel(gpu_kernels, "gridding_GPU", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-                     
-  free(maxIntData);  
-  
-  pb_SwitchToTimer(timers, pb_TimerID_DRIVER);
-  
-  size_t block1[1] = { blockSize };
-  size_t grid1[1] = { ((n+blockSize-1)/blockSize)*block1[0] };
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 0, sizeof(unsigned int), &n) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 1, sizeof(cl_mem), (void *)&sample_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 2, sizeof(cl_mem), (void *)idxKey_dPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 3, sizeof(cl_mem), (void *)idxValue_dPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 4, sizeof(cl_mem), (void *)&binStartAddr_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 5, sizeof(int), &(params.binsize)) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 6, sizeof(unsigned int), &gridNumElems) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 0, sizeof(unsigned int), &n) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 2, sizeof(cl_mem), (void *)&sample_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 3, sizeof(cl_mem), (void *)&sortedSample_d) );
-  
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 1: Perform binning. This kernel determines which output bin each input element
-   * goes into. Any excess (beyond binsize) is put in the CPU bin
-   */
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, binning_kernel, 1, 0,
-                            grid1, block1, 0, 0, 0) );
-
-  /* STEP 2: Sort the index-value pair generate in the binning kernel */
-  cl_mem dkeys_o = clCreateBuffer(clContext, CL_MEM_READ_WRITE, n*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  cl_mem dvalues_o = clCreateBuffer(clContext, CL_MEM_READ_WRITE, n*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  
-  cl_mem *dkeys_oPtr = &dkeys_o;
-  cl_mem *dvalues_oPtr = &dvalues_o;
-  
-  cl_mem *beforePointer = idxKey_dPtr;
-
-  sort(n, gridNumElems+1, idxKey_dPtr, idxValue_dPtr, dkeys_oPtr, dvalues_oPtr, &clContext, clCommandQueue, clDevice, workItemSizes);
-
-  /* STEP 3: Reorder the input data, based on the sorted values from Step 2.
-   * this step also involves changing the data from array of structs to a struct
-   * of arrays. Also in this kernel, we populate an array with the starting index
-   * of every output bin features in the input array, based on the sorted indices 
-   * from Step 2.
-   * At the end of this step, we copy the start address and list of input elements
-   * that will be computed on the CPU.
-   */
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 1, sizeof(cl_mem), (void *)idxValue_dPtr) );
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, reorder_kernel, 1, 0,
-                            grid1, block1, 0, 0, 0) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*idxValue_dPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*idxKey_dPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sample_d) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 4: In this step we generate the ADD scan of the array of starting indices
-   * of the output bins. The result is an array that contains the starting address of
-   * every output bin.
-   */
-  scanLargeArray(gridNumElems+1, binStartAddr_d, clContext, clCommandQueue, clDevice, workItemSizes);
-  
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  // Copy back to the CPU the indices of the input elements that will be processed on the CPU
-  unsigned int cpuStart;
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, binStartAddr_d, CL_TRUE, 
-                          gridNumElems*sizeof(unsigned int), // Offset in bytes
-                          sizeof(unsigned int), // Size of data to read
-                          &cpuStart, // Host Source
-                          0, NULL, NULL) );
-
-  int CPUbin_size = int(n)-int(cpuStart);
-  
-  ReconstructionSample* CPUbin;
-
-  CPUbin = (ReconstructionSample *) malloc ( CPUbin_size*sizeof(ReconstructionSample) );
-  if (CPUbin == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-  
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, sortedSample_d, CL_TRUE, 
-                          cpuStart*sizeof(ReconstructionSample), // Offset in bytes
-                          CPUbin_size*sizeof(ReconstructionSample), // Size of data to read
-                          CPUbin, // Host Source
-                          0, NULL, NULL) );
-
-  /* STEP 5: Perform the binning on the GPU. The results are computed in a gather fashion
-   * where each thread computes the value of one output element by reading the relevant
-   * bins.
-   */
-  gridData_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, gridNumElems*sizeof(cmplx), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  sampleDensity_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, gridNumElems*sizeof(float), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  
-  free(zeroData);
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-  
-  size_t block2[3] = {dims[0], dims[1], dims[2]};
-  size_t grid2[3] = { (size_x/dims[0]) * block2[0], ((size_y*size_z)/(dims[1]*dims[2])) * block2[1], 1 * block2[2] };
-
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 0, sizeof(cl_mem), (void *)&sortedSample_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 1, sizeof(cl_mem), (void *)&binStartAddr_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 2, sizeof(cl_mem), (void *)&gridData_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 3, sizeof(cl_mem), (void *)&sampleDensity_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 4, sizeof(float), &beta) );
-  
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, gridding_GPU, 3, 0,
-                            grid2, block2, 0, 0, 0) );
-                                
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(binStartAddr_d) );
-  
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-                       
-  /* Copying the results from the Device to the Host */
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, sampleDensity_d, CL_FALSE, 
-                          0, // Offset in bytes
-                          gridNumElems*sizeof(float), // Size of data to write
-                          sampleDensity, // Host Source
-                          0, NULL, NULL) );
-                          
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, gridData_d, CL_TRUE, 
-                          0, // Offset in bytes
-                          gridNumElems*sizeof(cmplx), // Size of data to write
-                          gridData, // Host Source
-                          0, NULL, NULL) );                          
-
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-
-  /* STEP 6: Computing the contributions of the sample points handled by the Host
-   * and adding those to the GPU results.
-   */
-  gridding_Gold(CPUbin_size, params, CPUbin, LUT, sizeLUT, gridData, sampleDensity);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  free(CPUbin);
-
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(gridData_d) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sampleDensity_d) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sortedSample_d) );
-  
-  pb_SwitchToTimer(timers, pb_TimerID_NONE);
-
-  return;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_interface.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_interface.h
deleted file mode 100644
index 0d39e9bb1a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/OpenCL_interface.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-void OpenCL_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity,  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-  const cl_context clContext,  // Pointer to OpenCL Context created by Host
-  const cl_command_queue clCommandQueue,
-  const cl_device_id clDevice,
-  size_t *workItemSizes
-);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/UDTypes.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/UDTypes.h
deleted file mode 100644
index 687fb50157..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/UDTypes.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef _UDTYPES_H__
-#define _UDTYPES_H__
-
-typedef struct{
-  int numSamples;
-  int aquisitionMatrixSize[3];
-  int reconstructionMatrixSize[3];
-  float kMax[3];
-  int gridSize[3];
-  float oversample;
-  float kernelWidth;
-  int binsize;
-  int useLUT;
-}parameters;
-
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-typedef struct{
-  float real;
-  float imag;
-} cmplx;
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/main.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/main.cpp
deleted file mode 100644
index f8e07e0fe8..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/main.cpp
+++ /dev/null
@@ -1,352 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <CL/cl.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "OpenCL_interface.h"
-#include "OpenCL_common.h"
-#include "CPU_kernels.h"
-
-#define PI 3.14159265
-
-/************************************************************ 
- * This function reads the parameters from the file provided
- * as a comman line argument.
- ************************************************************/
-void setParameters(FILE* file, parameters* p){
-  fscanf(file,"aquisition.numsamples=%d\n",&(p->numSamples));
-  fscanf(file,"aquisition.kmax=%f %f %f\n",&(p->kMax[0]), &(p->kMax[1]), &(p->kMax[2]));
-  fscanf(file,"aquisition.matrixSize=%d %d %d\n", &(p->aquisitionMatrixSize[0]), &(p->aquisitionMatrixSize[1]), &(p->aquisitionMatrixSize[2]));
-  fscanf(file,"reconstruction.matrixSize=%d %d %d\n", &(p->reconstructionMatrixSize[0]), &(p->reconstructionMatrixSize[1]), &(p->reconstructionMatrixSize[2]));
-  fscanf(file,"gridding.matrixSize=%d %d %d\n", &(p->gridSize[0]), &(p->gridSize[1]), &(p->gridSize[2]));
-  fscanf(file,"gridding.oversampling=%f\n", &(p->oversample));
-  fscanf(file,"kernel.width=%f\n", &(p->kernelWidth));
-  fscanf(file,"kernel.useLUT=%d\n", &(p->useLUT));
-
-
-  cl_int ciErrNum;
-  cl_platform_id clPlatform;
-  cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
-  cl_device_id clDevice;
-
-  int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0);
-  if (deviceFound < 0) {
-    fprintf(stderr, "No suitable device was found\n");
-    exit(1);
-  }
-  cl_ulong mem_size;
-  clGetDeviceInfo(clDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
-
-  printf("  Number of samples = %d\n", p->numSamples);
-  printf("  Total amount of GPU memory: %llu bytes\n", (unsigned long long) mem_size);
-  if (p->numSamples > 10000000 && mem_size/1024/1024 < 3000) {
-    printf("  Need at least 3GB of GPU memory for large dataset\n");
-    exit(1);
-  }
-  printf("  Grid Size = %dx%dx%d\n", p->gridSize[0], p->gridSize[1], p->gridSize[2]);
-  printf("  Input Matrix Size = %dx%dx%d\n", p->aquisitionMatrixSize[0], p->aquisitionMatrixSize[1], p->aquisitionMatrixSize[2]);
-  printf("  Recon Matrix Size = %dx%dx%d\n", p->reconstructionMatrixSize[0], p->reconstructionMatrixSize[1], p->reconstructionMatrixSize[2]);
-  printf("  Kernel Width = %f\n", p->kernelWidth);
-  printf("  KMax = %.2f %.2f %.2f\n", p->kMax[0], p->kMax[1], p->kMax[2]);
-  printf("  Oversampling = %f\n", p->oversample);
-  printf("  GPU Binsize = %d\n", p->binsize);
-  printf("  Use LUT = %s\n", (p->useLUT)?"Yes":"No");
-}
-
-/************************************************************ 
- * This function reads the sample point data from the kspace
- * and klocation files (and sdc file if provided) into the
- * sample array.
- * Returns the number of samples read successfully.
- ************************************************************/
-unsigned int readSampleData(parameters params, FILE* uksdata_f, ReconstructionSample* samples){
-  unsigned int i;
-
-  for(i=0; i<params.numSamples; i++){
-    if (feof(uksdata_f)){
-      break;
-    }
-    fread((void*) &(samples[i]), sizeof(ReconstructionSample), 1, uksdata_f);
-  }
-
-  float kScale[3];
-  kScale[0] = float(params.aquisitionMatrixSize[0])/(float(params.reconstructionMatrixSize[0])*float(params.kMax[0]));
-  kScale[1] = float(params.aquisitionMatrixSize[1])/(float(params.reconstructionMatrixSize[1])*float(params.kMax[1]));
-  kScale[2] = float(params.aquisitionMatrixSize[2])/(float(params.reconstructionMatrixSize[2])*float(params.kMax[2]));
-
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-
-  float ax = (kScale[0]*(size_x-1))/2.0;
-  float bx = (float)(size_x-1)/2.0;
-
-  float ay = (kScale[1]*(size_y-1))/2.0;
-  float by = (float)(size_y-1)/2.0;
-
-  float az = (kScale[2]*(size_z-1))/2.0;
-  float bz = (float)(size_z-1)/2.0;
-
-  for(int n=0; n<i; n++){
-    samples[n].kX = floor((samples[n].kX*ax)+bx);
-    samples[n].kY = floor((samples[n].kY*ay)+by);
-    samples[n].kZ = floor((samples[n].kZ*az)+bz);
-  }
-
-  return i;
-}
-
-
-int main (int argc, char* argv[]){
-  struct pb_Parameters* prms;
-  struct pb_TimerSet timers;
-
-  prms = pb_ReadParameters(&argc,argv);
-  pb_InitializeTimerSet(&timers);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  char uksdata[250];
-  parameters params;
-
-  FILE* uksfile_f = NULL;
-  FILE* uksdata_f = NULL;
-
-  strcpy(uksdata,prms->inpFiles[0]);
-  strcat(uksdata,".data");
-
-  uksfile_f = fopen(prms->inpFiles[0],"r");
-  if (uksfile_f == NULL){
-    printf("ERROR: Could not open %s\n",prms->inpFiles[0]);
-    exit(1);
-  }
-
-  printf("\nReading parameters\n");
-
-  if (argc >= 2){
-    params.binsize = atoi(argv[1]);
-  } else { //default binsize value;
-    params.binsize = 128;
-  }
-
-  setParameters(uksfile_f, &params);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  ReconstructionSample* samples; //Input Data
-//  cl_mem samplesPin; 
-  float* LUT; //use look-up table for faster execution on CPU (intermediate data)
-  unsigned int sizeLUT; //set in the function calculateLUT (intermediate data)
-
-  cmplx* gridData; //Output Data
-  float* sampleDensity; //Output Data
-//  cl_mem gridDataPin;
-//  cl_mem sampleDensityPin;
-
-  cmplx* gridData_gold; //Gold Output Data
-  float* sampleDensity_gold; //Gold Output Data
-  
-  cl_int ciErrNum;
-  cl_platform_id clPlatform;
-  cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
-  cl_device_id clDevice;
-  cl_context clContext;
-
-  int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0);
-
-  size_t max_alloc_size = 0;
-  (void) clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, 0);
-  size_t global_mem_size = 0;
-  (void) clGetDeviceInfo(clDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &global_mem_size, 0);
-
-  size_t samples_size = params.numSamples*sizeof(ReconstructionSample);
-  int gridNumElems = params.gridSize[0] * params.gridSize[1] * params.gridSize[2];
-  size_t output_size = gridNumElems*sizeof(cmplx);
-
-  if ( (deviceFound < 0 ) ||
-       ((samples_size+output_size) > global_mem_size) ||
-       (samples_size > max_alloc_size) || 
-       (output_size > max_alloc_size ) ) {
-    fprintf(stderr, "No suitable device was found\n");
-    if(deviceFound >= 0) {
-      fprintf(stderr, "Memory requirements for this dataset exceed device capabilities\n");
-    }
-    exit(1);
-  }
-  
-  cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0};
-  clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  cl_uint workItemDimensions;
-  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
-  
-  size_t workItemSizes[workItemDimensions];
-  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
-
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-    
-    /*
-  samplesPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
-      params.numSamples*sizeof(ReconstructionSample),
-      NULL, &ciErrNum);
-*/
-  samples = (ReconstructionSample *) malloc ( params.numSamples*sizeof(ReconstructionSample) );
-  
-  /*(ReconstructionSample *) clEnqueueMapBuffer(clCommandQueue, samplesPin, CL_TRUE, CL_MAP_WRITE, 0, params.numSamples*sizeof(ReconstructionSample), 0, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-*/
-  if (samples == NULL){
-    printf("ERROR: Unable to allocate and map memory for input data\n");
-    exit(1);
-  }
-
-
-  uksdata_f = fopen(uksdata,"rb");
-
-  if(uksdata_f == NULL){
-    printf("ERROR: Could not open data file\n");
-    exit(1);
-  }
-
-  printf("Reading input data from files\n");
-
-  unsigned int n = readSampleData(params, uksdata_f, samples);
-  fclose(uksdata_f);
-
-  if (params.useLUT){
-    printf("Generating Look-Up Table\n");
-    float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-    calculateLUT(beta, params.kernelWidth, &LUT, &sizeLUT);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  gridData_gold = (cmplx*) calloc (gridNumElems, sizeof(cmplx));
-  sampleDensity_gold = (float*) calloc (gridNumElems, sizeof(float));
-  if (sampleDensity_gold == NULL || gridData_gold == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  printf("Running gold version\n");
-
-  gridding_Gold(n, params, samples, LUT, sizeLUT, gridData_gold, sampleDensity_gold);
-
-  printf("Running OpenCL version\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-/*
-  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, samplesPin, CL_TRUE, 
-                          0, // Offset in bytes
-                          n*sizeof(ReconstructionSample), // Size of data to write
-                          samples, // Host Source
-  
-                          0, NULL, NULL) );*/
- // OCL_ERRCK_RETVAL ( clFinish(clCommandQueue) );
- 
- /*
-  gridDataPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
-      gridNumElems*sizeof(cmplx), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  */
-  gridData = (cmplx *) malloc ( gridNumElems*sizeof(cmplx) );
-  if (gridData == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-  
-  /*(cmplx *) clEnqueueMapBuffer(clCommandQueue, gridDataPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(cmplx), 0, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  */
-  
-  /*
-  sampleDensityPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
-      gridNumElems*sizeof(float), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  */
-  
-  sampleDensity = (float *) malloc ( gridNumElems*sizeof(float) );
-  if (sampleDensity == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-  
-  /*(float *) clEnqueueMapBuffer(clCommandQueue, sampleDensityPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(float), 0, NULL, NULL, &ciErrNum);
-  */
-  
-  OCL_ERRCK_VAR(ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  if (sampleDensity == NULL || gridData == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  
-  //Interface function to GPU implementation of gridding
-  OpenCL_interface(&timers, n, params, samples, LUT, sizeLUT, gridData, sampleDensity, clContext, clCommandQueue, clDevice, workItemSizes);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  int passed=1;
-  for (int i=0; i<gridNumElems; i++){
-    if(sampleDensity[i] != sampleDensity_gold[i]) {
-      passed=0;
-      break;
-    }
-  }
-  //(passed) ? printf("Comparing GPU and Gold results... PASSED\n"):printf("Comparing GPU and Gold results... FAILED\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  FILE* outfile;
-  if(!(outfile=fopen(prms->outFile,"w")))
-  {
-        printf("Cannot open output file!\n");
-  } else {
-        fwrite(&passed,sizeof(int),1,outfile);
-        fclose(outfile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  if (params.useLUT){
-    free(LUT);
-  }
-  
-  /*
-  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, samplesPin, samples, 0, NULL, NULL) );
-  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, gridDataPin, gridData, 0, NULL, NULL) );
-  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, sampleDensityPin, sampleDensity, 0, NULL, NULL) );
-  
-  clReleaseMemObject(samplesPin);
-  clReleaseMemObject(gridDataPin);
-  clReleaseMemObject(sampleDensityPin);
-  */
-  
-  free(samples);
-  free(gridData);
-  free(sampleDensity);
-  
-  
-  free(gridData_gold);
-  free(sampleDensity_gold);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(prms);
-
-  return 0;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.cl b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.cl
deleted file mode 100644
index c45978a38b..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.cl
+++ /dev/null
@@ -1,198 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
- 
-#define BLOCK_SIZE 1024
-#define GRID_SIZE 65535
-#define NUM_BANKS 16
-#define LOG_NUM_BANKS 4
-
-//#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS + (index) >> (2*LOG_NUM_BANKS))
-#define LNB LOG_NUM_BANKS
-#define CONFLICT_FREE_OFFSET(index) (((unsigned int)(index) >> min((unsigned int)(LNB)+(index), (unsigned int)(32-(2*LNB))))>>(2*LNB))
-#define EXPANDED_SIZE(__x) (__x+(__x>>LOG_NUM_BANKS)+(__x>>(2*LOG_NUM_BANKS)))
-
-////////////////////////////////////////////////////////////////////////////////
-// Kernels
-////////////////////////////////////////////////////////////////////////////////
-__kernel void scan_L1_kernel(unsigned int n, __global unsigned int* dataBase, unsigned int data_offset, __global unsigned int* interBase, unsigned int inter_offset)
-{
-    __local unsigned int s_data[EXPANDED_SIZE(BLOCK_SIZE)]; 
-    
-    __global unsigned int *data = dataBase + data_offset;
-    __global unsigned int *inter = interBase + inter_offset;
-
-    unsigned int thid = get_local_id(0);
-    unsigned int g_ai = get_group_id(0)*2*get_local_size(0) + get_local_id(0);
-    unsigned int g_bi = g_ai + get_local_size(0);
-
-    unsigned int s_ai = thid;
-    unsigned int s_bi = thid + get_local_size(0);
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = (g_ai < n) ? data[g_ai] : 0;
-    s_data[s_bi] = (g_bi < n) ? data[g_bi] : 0;
-
-    unsigned int stride = 1;
-    for (unsigned int d = get_local_size(0); d > 0; d >>= 1) {
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-
-        s_data[bi] += s_data[ai];
-      }
-
-        stride *= 2;
-    }
-
-    if (thid == 0) {
-      unsigned int last = get_local_size(0)*2 -1;
-      last += CONFLICT_FREE_OFFSET(last);
-      inter[get_group_id(0)] = s_data[last];
-      s_data[last] = 0;
-    }
-
-    for (unsigned int d = 1; d <= get_local_size(0); d *= 2) {
-      stride >>= 1;
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-
-        unsigned int t  = s_data[ai];
-        s_data[ai] = s_data[bi];
-        s_data[bi] += t;
-      }
-    }
-    
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    if (g_ai < n) { data[g_ai] = s_data[s_ai]; }
-    if (g_bi < n) { data[g_bi] = s_data[s_bi]; }
-}
-
-
-
-__kernel void scan_inter1_kernel(__global unsigned int* data, unsigned int iter)
-{
-    __local unsigned int s_data[DYN_LOCAL_MEM_SIZE];
-
-    unsigned int thid = get_local_id(0);
-    unsigned int gthid = get_global_id(0);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = 1;
-    for (unsigned int d = get_local_size(0); d > 0; d >>= 1) {
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-        s_data[bi] += s_data[ai];
-      }
-
-      stride *= 2;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-__kernel void scan_inter2_kernel(__global unsigned int* data, unsigned int iter)
-{
-    __local unsigned int s_data[DYN_LOCAL_MEM_SIZE];
-
-    unsigned int thid = get_local_id(0);
-    unsigned int gthid = get_global_id(0);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = get_local_size(0)*2;
-
-    for (unsigned int d = 1; d <= get_local_size(0); d *= 2) {
-      stride >>= 1;
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-
-        unsigned int t  = s_data[ai];
-        s_data[ai] = s_data[bi];
-        s_data[bi] += t;
-      }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-
-__kernel void uniformAdd(unsigned int n, __global unsigned int *dataBase, unsigned int data_offset, __global unsigned int *interBase, unsigned int inter_offset)
-{
-    __local unsigned int uni;
-    
-    __global unsigned int *data = dataBase + data_offset;
-    __global unsigned int *inter = interBase + inter_offset;
-       
-    if (get_local_id(0) == 0) { uni = inter[get_group_id(0)]; }
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    unsigned int g_ai = get_group_id(0)*2*get_local_size(0) + get_local_id(0);
-    unsigned int g_bi = g_ai + get_local_size(0);
-
-    if (g_ai < n) { data[g_ai] += uni; }
-    if (g_bi < n) { data[g_bi] += uni; }
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.cpp
deleted file mode 100644
index 235bbd718d..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <CL/cl.h>
-#include "OpenCL_common.h"
-
-#define GRID_SIZE 65535
-#define NUM_BANKS 16
-#define LOG_NUM_BANKS 4
-
-#define EXPANDED_SIZE(__x) (__x+(__x>>LOG_NUM_BANKS)+(__x>>(2*LOG_NUM_BANKS)))
-
-void scanLargeArray( unsigned int gridNumElems, cl_mem data_d, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes) {
-
-    size_t blockSize = (workItemSizes[0]*2 < 1024) ? workItemSizes[0]*2 : 1024;
-
-    // Run the prescan
-    unsigned int size = (gridNumElems+blockSize-1)/blockSize;
-    
-    unsigned int dim_block;
-    unsigned int current_max = size*blockSize;
-    for (int block_size_lcv = 128; block_size_lcv <= blockSize; block_size_lcv *= 2){
-      unsigned int array_size = block_size_lcv;
-      while(array_size < size){
-        array_size *= block_size_lcv;
-      }
-      if (array_size <= current_max){
-        current_max = array_size;
-        dim_block = block_size_lcv;
-      }
-    }    
-
-    cl_mem inter_d;
-    cl_int ciErrNum;
-    cl_program scanLargeArray_program;
-
-    cl_kernel scan_L1_kernel;
-    cl_kernel scan_inter1_kernel;
-    cl_kernel scan_inter2_kernel;
-    cl_kernel uniformAdd;
-    
-    // allocate device memory input and output arrays
-    unsigned int *zeroData;
-    zeroData = (unsigned int *)calloc( current_max, sizeof(unsigned int) );
-    if (zeroData == NULL) { fprintf(stderr, "Could not allocate host memory! (%s)\n", __FILE__); exit(1); }
-
-    inter_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, current_max*sizeof(unsigned int), zeroData, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-    
-    free(zeroData);
-    
-    char compileOptions[128];
-    //                -cl-nv-verbose // Provides register info for NVIDIA devices
-    // Set all Macros referenced by kernels
-    sprintf(compileOptions, "\
-                -D DYN_LOCAL_MEM_SIZE=%lu",
-                EXPANDED_SIZE(dim_block)*sizeof(unsigned int)
-            );
-  
-    size_t program_length;
-    const char *source_path = "src/opencl_base/scanLargeArray.cl";
-    char *source;
-
-    // Dynamically allocate buffer for source
-    source = oclLoadProgSource(source_path, "", &program_length);
-    if(!source) {
-      fprintf(stderr, "Could not load program source! (%s)\n", __FILE__); exit(1);
-    }
-  	
-    scanLargeArray_program = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-
-    free(source);
-    OCL_ERRCK_RETVAL ( clBuildProgram(scanLargeArray_program, 1, &clDevice, compileOptions, NULL, NULL) ); 
-      
-  /*
-    // Uncomment for build log from compiler for debugging
-    char *build_log;
-    size_t ret_val_size;
-    ciErrNum = clGetProgramBuildInfo(scanLargeArray_program, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-    build_log = (char *)malloc(ret_val_size+1);
-    ciErrNum = clGetProgramBuildInfo(scanLargeArray_program, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-    OCL_ERRCK_VAR(ciErrNum);
-    
-    // to be carefully, terminate with \0
-    // there's no information in the reference whether the string is 0 terminated or not
-    build_log[ret_val_size] = '\0';
-
-    fprintf(stderr, "%s\n", build_log );
-    */   
-        
-    scan_L1_kernel = clCreateKernel(scanLargeArray_program, "scan_L1_kernel", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-      
-    scan_inter1_kernel = clCreateKernel(scanLargeArray_program, "scan_inter1_kernel", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-    scan_inter2_kernel = clCreateKernel(scanLargeArray_program, "scan_inter2_kernel", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);  
-      
-    uniformAdd = clCreateKernel(scanLargeArray_program, "uniformAdd", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-    
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 1, sizeof(cl_mem), (void *)&data_d) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 3, sizeof(cl_mem), (void *)&inter_d) );
-    
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter1_kernel, 0, sizeof(cl_mem), (void *)&inter_d) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter2_kernel, 0, sizeof(cl_mem), (void *)&inter_d) );
-    
-    OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 1, sizeof(cl_mem), (void *)&data_d) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 3, sizeof(cl_mem), (void *)&inter_d) );
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++) {
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*blockSize)) > (GRID_SIZE*blockSize)) ? (GRID_SIZE*blockSize) : (gridNumElems-(i*GRID_SIZE*blockSize));
-        
-        unsigned int data_offset = i*GRID_SIZE*blockSize;
-        unsigned int inter_offset = i*GRID_SIZE;
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 0, sizeof(unsigned int), &numElems) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 2, sizeof(unsigned int), &data_offset) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 4, sizeof(unsigned int), &inter_offset) );
-               
-        size_t block[1] = { blockSize/2 };
-        size_t grid[1] = { gridSize * block[0] };
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, scan_L1_kernel, 1, 0,
-                            grid, block, 0, 0, 0) );
-    }
-
-    unsigned int stride = 1;
-    for (unsigned int d = current_max; d > 1; d /= dim_block) {        
-        size_t block[1] = { dim_block/2 };
-        size_t grid[1] = { (d/dim_block) * block[0] };
-        
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter1_kernel, 1, sizeof(unsigned int), &stride) );
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, scan_inter1_kernel, 1, 0,
-                            grid, block, 0, 0, 0) );
-        
-        stride *= dim_block;
-    }
-    
-    unsigned int singleZero = 0;
-    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, inter_d, CL_TRUE, 
-                          (current_max-1)*sizeof(unsigned int), // Offset in bytes
-                          sizeof(unsigned int), // Size of data to write
-                          &singleZero, // Host Source
-                          0, NULL, NULL) );
-
-    for (unsigned int d = dim_block; d <= current_max; d *= dim_block) {
-        stride /= dim_block;
-        
-        size_t block[1] = { dim_block/2 };
-        size_t grid[1] = { (d/dim_block) * block[0] };
-        
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter2_kernel, 1, sizeof(unsigned int), &stride) );
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, scan_inter2_kernel, 1, 0,
-                            grid, block, 0, 0, 0) );                       
-    }
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++) {
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*blockSize)) > (GRID_SIZE*blockSize)) ? (GRID_SIZE*blockSize) : (gridNumElems-(i*GRID_SIZE*blockSize));
-        
-        unsigned int data_offset = i*GRID_SIZE*blockSize;
-        unsigned int inter_offset = i*GRID_SIZE;
-        OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 0, sizeof(unsigned int), &numElems) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 2, sizeof(unsigned int), &data_offset) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 4, sizeof(unsigned int), &inter_offset) );
-        
-        size_t block[1] = { blockSize/2 };
-        size_t grid[1] = { gridSize * block[0] };
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, uniformAdd, 1, 0,
-                            grid, block, 0, 0, 0) ); 
-    }
-
-    OCL_ERRCK_RETVAL ( clReleaseMemObject(inter_d) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(scan_L1_kernel) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(scan_inter1_kernel) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(scan_inter2_kernel) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(uniformAdd) );
-
-    OCL_ERRCK_RETVAL ( clReleaseProgram(scanLargeArray_program) );
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.h
deleted file mode 100644
index dc4ff0a04a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/scanLargeArray.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-void scanLargeArray( unsigned int gridNumElements, cl_mem data_d, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.cl b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.cl
deleted file mode 100644
index 9ee9a43178..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.cl
+++ /dev/null
@@ -1,227 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
- 
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable 
-
-#define UINT32_MAX 4294967295
-#define BITS 4
-#define LNB 4
-
-#define SORT_BS 256
-
-//#define CONFLICT_FREE_OFFSET(index) ((index) >> LNB + (index) >> (2*LNB))
-#define CONFLICT_FREE_OFFSET(index) (((unsigned int)(index) >> min((unsigned int)(LNB)+(index), (unsigned int)(32-(2*LNB))))>>(2*LNB))
-#define BLOCK_P_OFFSET (4*SORT_BS+1+(4*SORT_BS+1)/16+(4*SORT_BS+1)/64)
-
-void scan (__local unsigned int s_data[BLOCK_P_OFFSET]){
-  unsigned int thid = get_local_id(0);
-
-  barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*(get_local_size(0)+thid)+1+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid)+1)] += s_data[2*(get_local_size(0)+thid)+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid))];
-
-  unsigned int stride = 2;
-  for (unsigned int d = get_local_size(0); d > 0; d >>= 1)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      s_data[bi] += s_data[ai];
-    }
-
-    stride *= 2;
-  }
-
-  if (thid == 0){
-    unsigned int last = 4*get_local_size(0)-1;
-    last += CONFLICT_FREE_OFFSET(last);
-    s_data[4*get_local_size(0)+CONFLICT_FREE_OFFSET(4*get_local_size(0))] = s_data[last];
-    s_data[last] = 0;
-  }
-
-  for (unsigned int d = 1; d <= get_local_size(0); d *= 2)
-  {
-    stride >>= 1;
-
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      unsigned int t  = s_data[ai];
-      s_data[ai] = s_data[bi];
-      s_data[bi] += t;
-    }
-  }
-  barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-  unsigned int temp = s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)] = s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)];
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += temp;
-
-  unsigned int temp2 = s_data[2*(get_local_size(0)+thid)+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid))];
-  s_data[2*(get_local_size(0)+thid)+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid))] = s_data[2*(get_local_size(0)+thid)+1+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid)+1)];
-  s_data[2*(get_local_size(0)+thid)+1+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid)+1)] += temp2;
-
-  barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-}
-
-__kernel void splitSort(int numElems, int iter, 
-                                 __global unsigned int* keys, 
-                                 __global unsigned int* values, 
-                                 __global unsigned int* histo)
-{
-    __local unsigned int flags[BLOCK_P_OFFSET];
-    __local unsigned int histo_s[1<<BITS];
-
-    const unsigned int tid = get_local_id(0);
-    const unsigned int gid = get_group_id(0)*4*SORT_BS+4*get_local_id(0);
-
-    // Copy input to shared mem. Assumes input is always even numbered
-    uint4 lkey = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX};
-    uint4 lvalue;
-    if (gid < numElems){
-      lkey = *((__global uint4*)(keys+gid));
-      lvalue = *((__global uint4*)(values+gid));
-    }
-
-    if(tid < (1<<BITS)){
-      histo_s[tid] = 0;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    atom_add(histo_s+((lkey.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atom_add(histo_s+((lkey.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atom_add(histo_s+((lkey.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atom_add(histo_s+((lkey.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-
-    uint4 index = (uint4) (4*tid, 4*tid+1, 4*tid+2, 4*tid+3);
-
-    for (int i=BITS*iter; i<BITS*(iter+1);i++){
-      const uint4 flag = (uint4) ( (lkey.x>>i)&0x1,(lkey.y>>i)&0x1,(lkey.z>>i)&0x1,(lkey.w>>i)&0x1 );
-
-      flags[index.x+CONFLICT_FREE_OFFSET(index.x)] = 1<<(16*flag.x);
-      flags[index.y+CONFLICT_FREE_OFFSET(index.y)] = 1<<(16*flag.y);
-      flags[index.z+CONFLICT_FREE_OFFSET(index.z)] = 1<<(16*flag.z);
-      flags[index.w+CONFLICT_FREE_OFFSET(index.w)] = 1<<(16*flag.w);
-
-      scan (flags);
-
-      index.x = (flags[index.x+CONFLICT_FREE_OFFSET(index.x)]>>(16*flag.x))&0xFFFF;
-      index.y = (flags[index.y+CONFLICT_FREE_OFFSET(index.y)]>>(16*flag.y))&0xFFFF;
-      index.z = (flags[index.z+CONFLICT_FREE_OFFSET(index.z)]>>(16*flag.z))&0xFFFF;
-      index.w = (flags[index.w+CONFLICT_FREE_OFFSET(index.w)]>>(16*flag.w))&0xFFFF;
-
-      unsigned short offset = flags[4*get_local_size(0)+CONFLICT_FREE_OFFSET(4*get_local_size(0))]&0xFFFF;
-      index.x += (flag.x) ? offset : 0;
-      index.y += (flag.y) ? offset : 0;
-      index.z += (flag.z) ? offset : 0;
-      index.w += (flag.w) ? offset : 0;
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-    }
-
-    // Write result.
-    if (gid < numElems){
-      keys[get_group_id(0)*4*SORT_BS+index.x] = lkey.x;
-      keys[get_group_id(0)*4*SORT_BS+index.y] = lkey.y;
-      keys[get_group_id(0)*4*SORT_BS+index.z] = lkey.z;
-      keys[get_group_id(0)*4*SORT_BS+index.w] = lkey.w;
-
-      values[get_group_id(0)*4*SORT_BS+index.x] = lvalue.x;
-      values[get_group_id(0)*4*SORT_BS+index.y] = lvalue.y;
-      values[get_group_id(0)*4*SORT_BS+index.z] = lvalue.z;
-      values[get_group_id(0)*4*SORT_BS+index.w] = lvalue.w;
-    }
-    if (tid < (1<<BITS)){
-      histo[get_num_groups(0)*get_local_id(0)+get_group_id(0)] = histo_s[tid];
-    }
-}
-
-__kernel void splitRearrange (int numElems, int iter, 
-                                __global unsigned int* keys_i, 
-                                __global unsigned int* keys_o, 
-                                __global unsigned int* values_i, 
-                                __global unsigned int* values_o, 
-                                __global unsigned int* histo){
-  __local unsigned int histo_s[(1<<BITS)];
-  __local uint array_s[4*SORT_BS];
-  int index = get_group_id(0)*4*SORT_BS + 4*get_local_id(0);
-
-  if (get_local_id(0) < (1<<BITS)){
-    histo_s[get_local_id(0)] = histo[get_num_groups(0)*get_local_id(0)+get_group_id(0)];
-  }
-
-  uint4 mine, value;
-  if (index < numElems){
-    mine = *((__global uint4*)(keys_i+index));
-    value = *((__global uint4*)(values_i+index));
-  } else {
-    mine.x = UINT32_MAX;
-    mine.y = UINT32_MAX;
-    mine.z = UINT32_MAX;
-    mine.w = UINT32_MAX;
-  }
-  
-  uint4 masks = (uint4) ( (mine.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter) );
-
-//  ((__local uint4*)array_s)[get_local_id(0)] = masks;
-  vstore4(masks, get_local_id(0), (__local uint *)array_s);
-  
-  barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-  uint4 new_index = (uint4) ( histo_s[masks.x],histo_s[masks.y],histo_s[masks.z],histo_s[masks.w] );
-
-  int i = 4*get_local_id(0)-1;
-  
-  while (i >= 0){
-    if (array_s[i] == masks.x){
-      new_index.x++;
-      i--;
-    } else {
-      break;
-    }
-  }
-
-  new_index.y = (masks.y == masks.x) ? new_index.x+1 : new_index.y;
-  new_index.z = (masks.z == masks.y) ? new_index.y+1 : new_index.z;
-  new_index.w = (masks.w == masks.z) ? new_index.z+1 : new_index.w;
-
-  if (index < numElems){
-    keys_o[new_index.x] = mine.x;
-    values_o[new_index.x] = value.x;
-
-    keys_o[new_index.y] = mine.y;
-    values_o[new_index.y] = value.y;
-
-    keys_o[new_index.z] = mine.z;
-    values_o[new_index.z] = value.z;
-
-    keys_o[new_index.w] = mine.w;
-    values_o[new_index.w] = value.w; 
-  }  
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.cpp
deleted file mode 100644
index 5154c545d1..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include "scanLargeArray.h"
-#include "OpenCL_common.h"
-
-#define UINT32_MAX 4294967295
-#define BITS 4
-#define LNB 4
-
-#define SORT_BS 256
-
-void sort (int numElems, unsigned int max_value, cl_mem* &dkeysPtr, cl_mem* &dvaluesPtr, cl_mem* &dkeys_oPtr, cl_mem* &dvalues_oPtr, cl_context *clContextPtr, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes){
-  
-  size_t block[1] = { SORT_BS };
-  size_t grid[1] = { ((numElems+4*SORT_BS-1)/(4*SORT_BS)) * block[0] };
-
-  unsigned int iterations = 0;
-  while(max_value > 0){
-    max_value >>= BITS;
-    iterations++;
-  }
-
-  cl_int ciErrNum;
-  
-  cl_context clContext = *clContextPtr;
-  cl_program sort_program;
-  cl_kernel splitSort;
-  cl_kernel splitRearrange;
-  
-  cl_mem dhisto;
-  cl_mem* original = dkeysPtr;
-
-  unsigned int *zeroData;
-  zeroData = (unsigned int *) calloc( (1<<BITS)*grid[0], sizeof(unsigned int) );
-  if (zeroData == NULL) { fprintf(stderr, "Could not allocate host memory! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-
-  dhisto = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (1<<BITS)*((numElems+4*SORT_BS-1)/(4*SORT_BS))*sizeof(unsigned int), zeroData, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  
-  free(zeroData);
-  
-  //char compileOptions[256];
-  //                -cl-nv-verbose // Provides register info for NVIDIA devices
-  // Set all Macros referenced by kernels
-  /*  sprintf(compileOptions, "\
-                -D CUTOFF2_VAL=%f -D CUTOFF_VAL=%f\
-                -D GRIDSIZE_VAL1=%d -D GRIDSIZE_VAL2=%d -D GRIDSIZE_VAL3=%d\
-                -D SIZE_XY_VAL=%d -D ONE_OVER_CUTOFF2_VAL=%f",
-                cutoff2, cutoff,
-                params.gridSize[0], params.gridSize[1], params.gridSize[2],
-                size_xy, _1overCutoff2
-            );*/ 
-  
-  size_t program_length;
-  const char *source_path = "src/opencl_base/sort.cl";
-  char *source;
-
-  // Dynamically allocate buffer for source
-  source = oclLoadProgSource(source_path, "", &program_length);
-  if(!source) {
-    fprintf(stderr, "Could not load program source (%s)\n", __FILE__); exit(1);
-  }
-  	
-  sort_program = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  	  	
-  free(source);
-  
-  OCL_ERRCK_RETVAL ( clBuildProgram(sort_program, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) );  
-  
-  
-  // Uncomment to get build log from compiler for debugging
-  char *build_log;
-       size_t ret_val_size;
-       ciErrNum = clGetProgramBuildInfo(sort_program, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-       build_log = (char *)malloc(ret_val_size+1);
-       ciErrNum = clGetProgramBuildInfo(sort_program, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-       	OCL_ERRCK_VAR(ciErrNum);
-       	
-
-       // to be carefully, terminate with \0
-       // there's no information in the reference whether the string is 0 terminated or not
-       build_log[ret_val_size] = '\0';
-
-       fprintf(stderr, "%s\n", build_log );
-  
-  
-  splitSort = clCreateKernel(sort_program, "splitSort", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  splitRearrange = clCreateKernel(sort_program, "splitRearrange", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);      
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 0, sizeof(int), &numElems) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 3, sizeof(cl_mem), (void *)dvaluesPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 4, sizeof(cl_mem), (void *)&dhisto) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 0, sizeof(int), &numElems) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 3, sizeof(cl_mem), (void *)dkeys_oPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 4, sizeof(cl_mem), (void *)dvaluesPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 5, sizeof(cl_mem), (void *)dvalues_oPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 6, sizeof(cl_mem), (void *)&dhisto) );
-
-  for (int i=0; i<iterations; i++){
-  
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 1, sizeof(int), &i) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 3, sizeof(cl_mem), (void *)dvaluesPtr) );    
-    OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, splitSort, 1, 0,
-                            grid, block, 0, 0, 0) );
-    
-    scanLargeArray(((numElems+4*SORT_BS-1)/(4*SORT_BS))*(1<<BITS), dhisto, clContext, clCommandQueue, clDevice, workItemSizes);
-
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 1, sizeof(int), &i ) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 3, sizeof(cl_mem), (void *)dkeys_oPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 4, sizeof(cl_mem), (void *)dvaluesPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 5, sizeof(cl_mem), (void *)dvalues_oPtr) );
-
-    OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, splitRearrange, 1, 0,
-                            grid, block, 0, 0, 0) );
-
-    cl_mem* temp = dkeysPtr;
-    dkeysPtr = dkeys_oPtr;
-    dkeys_oPtr = temp;
-
-    temp = dvaluesPtr;
-    dvaluesPtr = dvalues_oPtr;
-    dvalues_oPtr = temp;
-  }
-  
-  OCL_ERRCK_RETVAL ( clReleaseKernel(splitSort) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(splitRearrange) );
-  
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*dkeys_oPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*dvalues_oPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(dhisto) );
-  
-  OCL_ERRCK_RETVAL ( clReleaseProgram(sort_program) );
-
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.h
deleted file mode 100644
index ceea7c28ca..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_base/sort.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-void sort (int numElems, unsigned int max_value, cl_mem* &dkeys, cl_mem* &dvalues, cl_mem* &dkeys_o, cl_mem* &dvalues_o, cl_context *clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/CPU_kernels.c b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/CPU_kernels.c
deleted file mode 100644
index 87f4c0cbec..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/CPU_kernels.c
+++ /dev/null
@@ -1,353 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "UDTypes.h"
-
-#define max(x,y) ((x<y)?y:x)
-#define min(x,y) ((x>y)?y:x)
-
-#define PI 3.14159265359
-
-float kernel_value_CPU(float v){
-
-  float rValue = 0;
-
-  const float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-  (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-   0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-   0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-   0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-   0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-   0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-   0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-   0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = -num/den;
-
-  return rValue;
-}
-
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT){
-  float v;
-  float cutoff2 = (width*width)/4.0;
-
-  unsigned int size;
-
-  if(width > 0){
-    // compute size of LUT based on kernel width
-    size = (unsigned int)(10000*width);
-
-    // allocate memory
-    (*LUT) = (float*) malloc (size*sizeof(float));
-
-    unsigned int k;
-    for(k=0; k<size; ++k){
-      // compute value to evaluate kernel at
-      // v in the range 0:(_width/2)^2
-      v = (((float)k)/((float)size))*cutoff2;
-
-      // compute kernel value and store
-      (*LUT)[k] = kernel_value_CPU(beta*sqrt(1.0-(v/cutoff2)));
-    }
-    (*sizeLUT) = size;
-  }
-}
-
-float kernel_value_LUT(float v, float* LUT, int sizeLUT, float _1overCutoff2)
-{
-  unsigned int k0;
-  float v0;
-
-  v *= (float)sizeLUT;
-  k0=(unsigned int)(v*_1overCutoff2);
-  v0 = ((float)k0)/_1overCutoff2;
-  return  LUT[k0] + ((v-v0)*(LUT[k0+1]-LUT[k0])/_1overCutoff2);
-}
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity){
-
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  float cutoff = (float)(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int i;
-  for (i=0; i < n; i++){
-    ReconstructionSample pt = sample[i];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0.0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0.0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0.0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* kernel weighting value */
-                  if (params.useLUT){
-        		    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-		          } else {
-		            w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-		          }
-
-                  /* grid data */
-                  gridData[idx].real += (w*pt.real);
-                  gridData[idx].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-                  sampleDensity[idx] += 1.0;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-int gridding_CPU(unsigned int n, parameters params, ReconstructionSample* sample, int* CPUbin, int CPUbin_size, 
-		 float* LUT, int sizeLUT, cmplx* gridData[], float* sampleDensity[], int* indeces[]){
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  int gridNumElems = size_x*size_y*size_z;
-
-  float cutoff = (float)(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int pos = 0;
-  int* posArray = (int*) malloc (gridNumElems*sizeof(int));
-  memset(posArray, 0xFF, gridNumElems*sizeof(int));
-  (*indeces) = (int*) malloc (gridNumElems*sizeof(int));
-  (*gridData) = (cmplx*) calloc (gridNumElems,sizeof(cmplx));
-  (*sampleDensity) = (float*) calloc (gridNumElems,sizeof(float));
-
-  if (*gridData == NULL || *sampleDensity == NULL || *indeces == NULL){
-    printf("unable to allocate temporary CPU space\n");
-    exit(1);
-  }
-
-  int i;
-  for (i=0; i < CPUbin_size; i++){
-    ReconstructionSample pt = sample[CPUbin[i]];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* kernel weighting value */
-                  if (params.useLUT){
-                    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-                  } else {
-                    w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-                  }
-
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* grid data */
-                  if(posArray[idx] == -1){
-                    posArray[idx] = pos;
-                    (*indeces)[pos] = idx;
-                    pos++;
-                  }
-
-                  (*gridData)[posArray[idx]].real += (w*pt.real);
-                  (*gridData)[posArray[idx]].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-                  (*sampleDensity)[posArray[idx]] += 1;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  free(posArray);
-  return pos;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/CPU_kernels.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/CPU_kernels.h
deleted file mode 100644
index 1d883f00f7..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/CPU_kernels.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "UDTypes.h"
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT);
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity);
-
-int gridding_CPU(unsigned int n, parameters params, ReconstructionSample* sample, int* CPUbin, int CPUbin_size,
-                 float* LUT, int sizeLUT, cmplx* gridData[], float* sampleDensity[], int* indeces[]);
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/GPU_kernels.cl b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/GPU_kernels.cl
deleted file mode 100644
index 91985ec8e5..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/GPU_kernels.cl
+++ /dev/null
@@ -1,264 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-
-#define TILE 64
-#define LOG_TILE 6
-
-typedef struct{
-  __global float2* data;
-  __global float4* loc;
-} sampleArrayStruct;
-
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-__kernel void binning_kernel (unsigned int n, 
-                              __global ReconstructionSample* sample_g, 
-                              __global unsigned int* idxKey_g,
-                              __global unsigned int* idxValue_g, 
-                              __global unsigned int* binCount_g, 
-                              unsigned int binsize, unsigned int gridNumElems){
-  unsigned int key;
-  unsigned int sampleIdx = get_group_id(0)*get_local_size(0) + get_local_id(0); //blockIdx.x*blockDim.x+threadIdx.x;
-  ReconstructionSample pt;
-  unsigned int binIdx;
-  unsigned int count;
-
-  if (sampleIdx < n){
-    pt = sample_g[sampleIdx];
-
-    binIdx = (unsigned int)(pt.kZ)*SIZE_XY_VAL + (unsigned int)(pt.kY)*GRIDSIZE_VAL1 + (unsigned int)(pt.kX);
-    if (binCount_g[binIdx]<binsize){
-      count = atom_add(binCount_g+binIdx, 1);
-      if (count < binsize){
-        key = binIdx;
-      } else {
-        atom_sub(binCount_g+binIdx, 1);
-        key = gridNumElems;
-      }
-    } else {
-      key = gridNumElems;
-    }
-
-    idxKey_g[sampleIdx] = key;
-    idxValue_g[sampleIdx] = sampleIdx;
-  }
-}
-
-__kernel void reorder_kernel(int n, 
-                               __global unsigned int* idxValue_g, 
-                               __global ReconstructionSample* samples_g, 
- //                              sampleArrayStruct sortedSampleSoA_g
-                               
-                               __global float2* dataptr_g,
-                               unsigned int f2_offset
-//                               __global float4* locptr_g
-                               
-                               ){
-  unsigned int index = get_group_id(0)*get_local_size(0) + get_local_id(0);
-  unsigned int old_index;
-  ReconstructionSample pt;
-
-  if (index < n){
-    old_index = idxValue_g[index];
-    pt = samples_g[old_index];
-
-    float2 data = (float2) (pt.real, pt.imag);
-    float4 loc = (float4) (pt.kX, pt.kY, pt.kZ, pt.sdc);
-
- //   sortedSampleSoA_g.data[index] = data;
- //   sortedSampleSoA_g.loc[index] = loc;
-    
-    dataptr_g[index] = data;
-    ((__global float4*)(dataptr_g+f2_offset))[index] = loc;
-    
-  }
-}
-
-float kernel_value(float v){
-
-  float rValue = 0;
-
-  float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-                (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-                 0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-                 0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-                 0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-                 0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-                 0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-                 0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-                 0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = native_divide(-num,den);
-  //rValue = __fdividef(-num,den);
-
-  return rValue;
-}
-
-__kernel void gridding_GPU (//sampleArrayStruct sortedSampleSoA_g, 
-                               __global float2* dataptr_g,
-                               unsigned int f2_offset,
-//                               __global float4* locptr_g
-
-                              __global unsigned int* binStartAddr_g, 
-                              __global float2* gridData_g, 
-                              __global float* sampleDensity_g, 
-                              float beta){
-  __local float real_s[TILE];
-  __local float imag_s[TILE];
-  __local float kx_s[TILE];
-  __local float ky_s[TILE];
-  __local float kz_s[TILE];
-  __local float sdc_s[TILE];
-
-  const int flatIdx = 
-  get_local_id(2)*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0);
-  //threadIdx.z*blockDim.y*blockDim.x+threadIdx.y*blockDim.x+threadIdx.x;
-
-  // figure out starting point of the tile
-  const int z0 = (4*get_local_size(2))*(get_group_id(1)/(GRIDSIZE_VAL2/get_local_size(1)));
-  const int y0 = get_local_size(1)*(get_group_id(1)%(GRIDSIZE_VAL2/get_local_size(1)));
-  const int x0 = get_group_id(0)*get_local_size(0);
-
-  const int X  = x0+get_local_id(0);
-  const int Y  = y0+get_local_id(1);
-  const int Z  = z0+get_local_id(2);
-  const int Z1 = Z+get_local_size(2);
-  const int Z2 = Z1+get_local_size(2);
-  const int Z3 = Z2+get_local_size(2);
-
-  const int xl = x0-CEIL_CUTOFF_VAL;
-  const int xL = (xl < 0) ? 0 : xl;
-  const int xh = x0+get_local_size(0)+CUTOFF_VAL;
-  const int xH = (xh >= GRIDSIZE_VAL1) ? GRIDSIZE_VAL1-1 : xh;
-
-  const int yl = y0-CEIL_CUTOFF_VAL;
-  const int yL = (yl < 0) ? 0 : yl;
-  const int yh = y0+get_local_size(1)+CUTOFF_VAL;
-  const int yH = (yh >= GRIDSIZE_VAL2) ? GRIDSIZE_VAL2-1 : yh;
-
-  const int zl = z0-CEIL_CUTOFF_VAL;
-  const int zL = (zl < 0) ? 0 : zl;
-  const int zh = z0+(4*get_local_size(2))+CUTOFF_VAL;
-  const int zH = (zh >= GRIDSIZE_VAL3) ? GRIDSIZE_VAL3-1 : zh;
-
-  const int idx = Z*SIZE_XY_VAL + Y*GRIDSIZE_VAL1 + X;
-  const int idx1 = idx+get_local_size(2)*SIZE_XY_VAL;
-  const int idx2 = idx1+get_local_size(2)*SIZE_XY_VAL;
-  const int idx3 = idx2+get_local_size(2)*SIZE_XY_VAL;
-
-  float2 pt = (float2) (0.0f, 0.0f);
-  float density = 0.0f;
-
-  float2 pt1 = (float2) (0.0f, 0.0f);
-  float density1 = 0.0f;  
-
-  float2 pt2 = (float2) (0.0f, 0.0f);
-  float density2 = 0.0f;
-
-  float2 pt3 = (float2) (0.0f, 0.0f);
-  float density3 = 0.0f;
-
-  for (int z = zL; z <= zH; z++){
-    for (int y = yL; y <= yH; y++){
-      __global const unsigned int *addr = binStartAddr_g+z*SIZE_XY_VAL+ y*GRIDSIZE_VAL1;
-      const unsigned int start = *(addr+xL);
-      const unsigned int end   = *(addr+xH+1);
-      const unsigned int delta = end-start;
-      for (int x = 0; x < ((delta+TILE-1)>>LOG_TILE); x++){
-        int tileSize = ((delta-(x<<LOG_TILE)) > TILE) ? TILE : (delta-(x<<LOG_TILE));
-        int globalIdx = flatIdx+(x<<LOG_TILE);
-        barrier(CLK_LOCAL_MEM_FENCE );
-        if(flatIdx < tileSize){
-          //const float2 data = sortedSampleSoA_g.data[start+globalIdx];
-          //const float4 loc  = sortedSampleSoA_g.loc [start+globalIdx];
-          
-          const float2 data = dataptr_g[start+globalIdx];          
-          const float4 loc  = ((__global float4*)(dataptr_g+f2_offset))[start+globalIdx];                       
-
-          real_s[flatIdx] = data.x;
-          imag_s[flatIdx] = data.y;
-          kx_s  [flatIdx] = loc.x;
-          ky_s  [flatIdx] = loc.y;
-          kz_s  [flatIdx] = loc.z;
-          sdc_s [flatIdx] = loc.w;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE );
-
-        for (int j=0; j< tileSize; j++){
-          const float real = real_s[j];
-          const float imag = imag_s[j];
-          const float sdc = sdc_s[j];
-
-          if((real != 0.0f || imag != 0.0f) && sdc != 0.0f){
-            float v0 = (kx_s[j]-X)*(kx_s[j]-X);
-            v0 += (ky_s[j]-Y)*(ky_s[j]-Y);
-
-            const float v = v0 + (kz_s[j]-Z)*(kz_s[j]-Z);
-            if(v<CUTOFF2_VAL){
-              const float w = kernel_value(beta*sqrt(1.0f-(v*ONE_OVER_CUTOFF2_VAL))) *sdc;
-              pt.x += w*real;
-              pt.y += w*imag;
-              density += 1.0f;
-            }
-
-            const float v1 = v0 + (kz_s[j]-Z1)*(kz_s[j]-Z1);
-            if(v1<CUTOFF2_VAL){
-              const float w = kernel_value(beta*sqrt(1.0f-(v1*ONE_OVER_CUTOFF2_VAL))) *sdc;
-              pt1.x += w*real;
-              pt1.y += w*imag;
-              density1 += 1.0f;
-            }
-
-            const float v2 = v0 + (kz_s[j]-Z2)*(kz_s[j]-Z2);
-            if(v2<CUTOFF2_VAL){
-              const float w = kernel_value(beta*sqrt(1.0f-(v2*ONE_OVER_CUTOFF2_VAL))) *sdc;
-              pt2.x += w*real;
-              pt2.y += w*imag;
-              density2 += 1.0f;
-            }
-
-            const float v3 = v0 + (kz_s[j]-Z3)*(kz_s[j]-Z3);
-            if(v3<CUTOFF2_VAL){
-              const float w = kernel_value(beta*sqrt(1.0f-(v3*ONE_OVER_CUTOFF2_VAL))) *sdc;
-              pt3.x += w*real;
-              pt3.y += w*imag;
-              density3 += 1.0f;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  gridData_g[idx] = pt;
-  sampleDensity_g[idx] = density;
-
-  gridData_g[idx1] = pt1;
-  sampleDensity_g[idx1] = density1;
-
-  gridData_g[idx2] = pt2;
-  sampleDensity_g[idx2] = density2;
-
-  gridData_g[idx3] = pt3;
-  sampleDensity_g[idx3] = density3;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/Makefile b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/Makefile
deleted file mode 100644
index 46bafdc413..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=CPU_kernels.o main.o OpenCL_interface.o scanLargeArray.o sort.o OpenCL_common.o
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_common.cpp
deleted file mode 100644
index 57368eda9a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_common.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-
-
-#include "OpenCL_common.h"
-#include <string.h>
-
-// -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
-  cl_uint numEntries = 16;
-  cl_platform_id clPlatforms[numEntries];
-  cl_uint numPlatforms;
-  
-  cl_device_id clDevices[numEntries];
-  cl_uint numDevices;
-
-  OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
-  fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
-  bool needDevice = true;
-  
-  for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
-
-    cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
-      cl_device_id clDevice = clDevices[id];
-      cl_device_type clDeviceType;
-
-      bool canSatisfy = true;
-      
-      if (reqDeviceType != NULL) {
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
-        if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
-          if (*reqDeviceType != clDeviceType) {
-            canSatisfy = false;
-          }
-        }
-      }
-
-      va_list paramList;
-      va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
-        cl_bool clInfoBool;
-        size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
-        if (clInfoBool != true) {
-          canSatisfy = false;
-        }
-      }
-      
-      va_end(paramList);
-      if (canSatisfy) {
-        *device = clDevice;
-        *platform = clPlatform;
-        needDevice = false;
-        fprintf(stderr, "Chose Device Type: %s\n",
-          (clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other"
-          );
-        if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
-          *reqDeviceType = clDeviceType;
-        }
-      }
-    } // End checking all devices for a platform
-  } // End checking all platforms
-
-  int retVal = -1;
-  if (needDevice) {
-    retVal = -1;
-  } else {
-    retVal = 0;
-  }
-  
-  return retVal;
-}
-
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	  cl_uint maxMemAlloc = 0;
-	  OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_common.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_common.h
deleted file mode 100644
index b063d9c696..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_common.h
+++ /dev/null
@@ -1,26 +0,0 @@
-
-#ifndef __OPENCL_COMMON_H_
-#define __OPENCL_COMMON_H_
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <CL/cl.h>
-
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); }
-    
-#define OCL_SIMPLE_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_interface.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_interface.cpp
deleted file mode 100644
index 5eb7d2a420..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_interface.cpp
+++ /dev/null
@@ -1,382 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <CL/cl.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "scanLargeArray.h"
-#include "CPU_kernels.h"
-
-#include "sort.h"
-#include "scanLargeArray.h"
-#include "OpenCL_common.h"
-
-extern char *oclOverhead;
-
-#define PI 3.14159265359
-
-typedef struct{
-  cl_float2* data;
-  cl_float4* loc;
-} sampleArrayStruct;
-
-// Compare function used for Qsort for CPU computation
-int compare (const void * a, const void * b)
-{
-  return ( *(int*)a - *(int*)b );
-}
-
-/***********************************************************************
- * CUDA_interface is the main function for GPU execution. This
- * implementation uses compact binning to distribute input elements
- * into unit-cubed sized bins. The bins are then visited by GPU
- * threads, where every thread computes the value of one (or small set)
- * of output elements by computing the contributions of elements in 
- * neighboring bins to these output elements.
- *
- * The bins have a limited bin size and everything beyond that bin size
- * is offloaded to the CPU to be computed in parallel with the GPU
- * gridding.
- ***********************************************************************/
-void OpenCL_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity,  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-  cl_context clContext,
-  cl_command_queue clCommandQueue, //const cl_device clDevice
-  const cl_device_id clDevice,
-  size_t *workItemSizes
-){
-
-  /* Initializing all variables */
-  int dims[3] = {8,4,2}; //size of a gridding block on the GPU
-  size_t blockSize = workItemSizes[0];
-
-  /* x, y, z dimensions of the output grid (gridData) */
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-  int size_xy = size_y*size_x;
-
-  int gridNumElems = size_x * size_y * size_z;  // Total number of grid points
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  float cutoff = float(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  // Padding used to align the structure of arrays used for the sorted input elements
-  int npad = 0;
-  if (n % 64 != 0){
-    npad = 64 - (n%64);
-  }
-
-  /* Declarations of host data structures */
-  cmplx* gridData_CPU;
-  float* sampleDensity_CPU;
-  int* indices_CPU;
-
-  /* Declarations of device data structures */
-  cl_int ciErrNum;
-  cl_mem sample_d;    // Device array for original input array
-  cl_mem sortedSample_d;             // Device array of the sorted (into bins) input elements.
-                                            // This array is accessed by sortedSampleSoA_d in a structure
-                                            //   of arrays manner.
-  cl_mem gridData_d;                // Device array for output grid
-  cl_mem sampleDensity_d;            // Device array for output sample density
-  cl_mem idxKey_d;            // Array of bin indeces generated in the binning kernel
-                                            //   and used to sort the input elements into their
-                                            //   corresponding bins
-  cl_mem idxValue_d;          // This array holds the indices of input elements in the
-                                            //   the original array. This array is sorted using the
-                                            //   the idxKey_d array, and once sorted, it is used in
-                                            //   the reorder kernel to move the actual elements into
-                                            //   their corresponding bins.
-  sampleArrayStruct sortedSampleSoA_d;      // Structure of Arrays which holds the sorted input elements.
-                                            //   Uses sortedSample_d as the underlying physical data
-                                            //   structures
-  //cl_mem binCount_d;          // Zero-initialized array which counts the number of elements
-                                            //   put in each bin. Based on this array, we determine which
-                                            //   elements get offloaded to the CPU
-  cl_mem binStartAddr_d;      // Array of start offset of each of the compact bins
-
-  cl_mem *idxValue_dPtr;
-  cl_mem *idxKey_dPtr;
-  
-  cl_program gpu_kernels;
-  cl_kernel binning_kernel;
-  cl_kernel reorder_kernel;
-  cl_kernel gridding_GPU;
-
-  /* Allocating device memory */
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  unsigned int *zeroData = NULL, *maxIntData = NULL;
-  
-  size_t sizeZeroData = sizeof(float)* 2 * gridNumElems;
-  if ( n*sizeof(ReconstructionSample) > sizeZeroData) {
-    sizeZeroData = n*sizeof(ReconstructionSample);
-  }    
-  if ( (sizeof(unsigned int) * (gridNumElems+1)) > sizeZeroData) {
-    // Not going to be taken, but included just in case since this is used for multiple variables
-    sizeZeroData = sizeof(unsigned int) * (gridNumElems+1);
-  }
-  if ( (((n+3)/4)*4)*sizeof(unsigned int) > sizeZeroData) {
-    sizeZeroData = (((n+3)/4)*4)*sizeof(unsigned int);
-  }
-  
-  zeroData = (unsigned int *) malloc(sizeZeroData);
-  if (zeroData == NULL) { fprintf(stderr, "Could not allocate dummy memset memory\n"); exit(1); }
-  maxIntData = (unsigned int *) malloc((((n+3)/4)*4)*sizeof(unsigned int));
-  if (maxIntData == NULL) { fprintf(stderr, "Could not allocate dummy memset memory\n"); exit(1); }
-  
-  memset(zeroData, 0, sizeZeroData);
-  memset(maxIntData+n, 0xFF, (((n+3)&~(3))-n)*sizeof(unsigned int));
-
-  sortedSample_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (n+npad)*sizeof(ReconstructionSample), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  binStartAddr_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (gridNumElems+1)*sizeof(unsigned int), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  sample_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n*sizeof(ReconstructionSample), sample, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  idxKey_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (((n+3)/4)*4)*sizeof(unsigned int), maxIntData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum); //Pad to nearest multiple of 4 to 
-  idxValue_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (((n+3)/4)*4)*sizeof(unsigned int), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum); //satisfy a property of the sorting kernel.
-
-  idxKey_dPtr = &idxKey_d;
-  idxValue_dPtr = &idxValue_d;
-  
-  free(maxIntData);
-  
-  pb_SwitchToSubTimer(timers, oclOverhead, pb_TimerID_KERNEL);
-
-  char compileOptions[1024];
-  //                -cl-nv-verbose // Provides register info for NVIDIA devices
-  // Set all Macros referenced by kernels
-  sprintf(compileOptions, "\
-                -D CUTOFF2_VAL=%f -D CUTOFF_VAL=%f -D CEIL_CUTOFF_VAL=%f\
-                -D GRIDSIZE_VAL1=%d -D GRIDSIZE_VAL2=%d -D GRIDSIZE_VAL3=%d\
-                -D SIZE_XY_VAL=%d -D ONE_OVER_CUTOFF2_VAL=%f",
-                cutoff2, cutoff, ceil(cutoff),
-                params.gridSize[0], params.gridSize[1], params.gridSize[2],
-                size_xy, _1overCutoff2
-            );
-  
-  size_t program_length;
-  const char *source_path = "src/opencl_nvidia/GPU_kernels.cl";
-  char *source;
-
-  // Dynamically allocate buffer for source
-  source = oclLoadProgSource(source_path, "", &program_length);
-  if(!source) {
-    fprintf(stderr, "Could not load program source (%s) \n", __FILE__); exit(1);
-  }  
-  	
-  gpu_kernels = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  	  	
-  free(source);
-  
-  OCL_ERRCK_RETVAL ( clBuildProgram(gpu_kernels, 1, &clDevice, compileOptions, NULL, NULL) );
-  
-  /*
-  char *build_log;
-  size_t ret_val_size;
-  ciErrNum = clGetProgramBuildInfo(gpu_kernels, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-  build_log = (char *)malloc(ret_val_size+1);
-  ciErrNum = clGetProgramBuildInfo(gpu_kernels, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-  OCL_ERRCK_VAR(ciErrNum);
-       	
-
-  // to be careful, terminate with \0
-  // there's no information in the reference whether the string is 0 terminated or not
-  build_log[ret_val_size] = '\0';
-
-  fprintf(stderr, "%s\n", build_log );
-  */
-  
-  binning_kernel = clCreateKernel(gpu_kernels, "binning_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  reorder_kernel = clCreateKernel(gpu_kernels, "reorder_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  gridding_GPU = clCreateKernel(gpu_kernels, "gridding_GPU", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-  //sortedSampleSoA_d.data = (cl_float2*)(sortedSample_d);
-  //sortedSampleSoA_d.loc = (cl_float4*)(((float*)sortedSample_d)+2*(n+npad));
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 0, sizeof(unsigned int), &n) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 1, sizeof(cl_mem), (void *)&sample_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 2, sizeof(cl_mem), (void *)idxKey_dPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 3, sizeof(cl_mem), (void *)idxValue_dPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 4, sizeof(cl_mem), (void *)&binStartAddr_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 5, sizeof(int), &(params.binsize)) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 6, sizeof(unsigned int), &gridNumElems) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 0, sizeof(unsigned int), &n) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 2, sizeof(cl_mem), (void *)&sample_d) );      
-
-  unsigned int num_float2_offset = (n+npad);
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 3, sizeof(cl_mem), (void *)&sortedSample_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 4, sizeof(unsigned int), &num_float2_offset) );
-  
-  size_t block1[1] = { blockSize };
-  size_t grid1[1] = { ((n+blockSize-1)/blockSize)*block1[0] };
-    
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 1: Perform binning. This kernel determines which output bin each input element
-   * goes into. Any excess (beyond binsize) is put in the CPU bin
-   */
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, binning_kernel, 1, 0,
-                            grid1, block1, 0, 0, 0) );
-
-  /* STEP 2: Sort the index-value pair generate in the binning kernel */
-  cl_mem dkeys_o = clCreateBuffer(clContext, CL_MEM_READ_WRITE, n*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  cl_mem dvalues_o = clCreateBuffer(clContext, CL_MEM_READ_WRITE, n*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-    
-  cl_mem *dkeys_oPtr = &dkeys_o;
-  cl_mem *dvalues_oPtr = &dvalues_o;
-  
-  cl_mem *beforePointer = idxKey_dPtr;
-
-  sort(n, gridNumElems+1, idxKey_dPtr, idxValue_dPtr, dkeys_oPtr, dvalues_oPtr, clContext, clCommandQueue, clDevice, workItemSizes);
-
-  /* STEP 3: Reorder the input data, based on the sorted values from Step 2.
-   * this step also involves changing the data from array of structs to a struct
-   * of arrays. Also in this kernel, we populate an array with the starting index
-   * of every output bin features in the input array, based on the sorted indices 
-   * from Step 2.
-   * At the end of this step, we copy the start address and list of input elements
-   * that will be computed on the CPU.
-   */
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 1, sizeof(cl_mem), (void *)idxValue_dPtr) );
-  
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, reorder_kernel, 1, 0,
-                            grid1, block1, 0, 0, 0) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-  
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*idxKey_dPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sample_d) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 4: In this step we generate the ADD scan of the array of starting indices
-   * of the output bins. The result is an array that contains the starting address of
-   * every output bin.
-   */
-  scanLargeArray(gridNumElems+1, binStartAddr_d, clContext, clCommandQueue, clDevice, workItemSizes);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  // Copy back to the CPU the indices of the input elements that will be processed on the CPU
-  unsigned int cpuStart;    
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, binStartAddr_d, CL_TRUE, 
-                          gridNumElems*sizeof(unsigned int), // Offset in bytes
-                          sizeof(unsigned int), // Size of data to read
-                          &cpuStart, // Host Source
-                          0, NULL, NULL) );
-
-  int CPUbin_size = int(n)-int(cpuStart);
-
-  int* CPUbin;
-  CPUbin = (int *) malloc(CPUbin_size*sizeof(unsigned int));
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, *idxValue_dPtr, CL_TRUE, 
-                          cpuStart*sizeof(unsigned int), // Offset in bytes
-                          CPUbin_size*sizeof(unsigned int), // Size of data to read
-                          CPUbin, // Host Source
-                          0, NULL, NULL) );
-
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*idxValue_dPtr) );
-
-  /* STEP 5: Perform the binning on the GPU. The results are computed in a gather fashion
-   * where each thread computes the value of one output element by reading the relevant
-   * bins.
-   */
-  gridData_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, gridNumElems*sizeof(cmplx), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  sampleDensity_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, gridNumElems*sizeof(float), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-
-  free(zeroData);
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  size_t block2[3] = {dims[0], dims[1], dims[2]};
-  size_t grid2[3] = { (size_x/dims[0]) * block2[0], ((size_y*size_z)/(4*dims[1]*dims[2])) * block2[1], 1 * block2[2] };
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 0, sizeof(cl_mem), (void *)&sortedSample_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 1, sizeof(unsigned int), &num_float2_offset) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 2, sizeof(cl_mem), (void *)&binStartAddr_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 3, sizeof(cl_mem), (void *)&gridData_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 4, sizeof(cl_mem), (void *)&sampleDensity_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 5, sizeof(float), &beta) );
-
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, gridding_GPU, 3, 0,
-                            grid2, block2, 0, 0, 0) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-
-  qsort(CPUbin, CPUbin_size, sizeof(int), compare); //Sorting helps cache locality of input element array
-  int num = gridding_CPU(n, params, sample, CPUbin, CPUbin_size, LUT, sizeLUT, &gridData_CPU, &sampleDensity_CPU, &indices_CPU);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  /* Copying the results from the Device to the Host */
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, sampleDensity_d, CL_TRUE, 
-                          0, // Offset in bytes
-                          gridNumElems*sizeof(float), // Size of data to write
-                          sampleDensity, // Host Source
-                          0, NULL, NULL) );                          
-                          
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, gridData_d, CL_TRUE, 
-                          0, // Offset in bytes
-                          gridNumElems*sizeof(cmplx), // Size of data to write
-                          gridData, // Host Source
-                          0, NULL, NULL) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-
-  /* STEP 6: Computing the contributions of the sample points handled by the Host
-   * and adding those to the GPU results.
-   */
-  for (int i=0; i< num; i++){
-    gridData[indices_CPU[i]].real += gridData_CPU[i].real;
-    gridData[indices_CPU[i]].imag += gridData_CPU[i].imag;
-    sampleDensity[indices_CPU[i]] += sampleDensity_CPU[i];
-  }
-
-  if (gridData_CPU != NULL){
-    free(indices_CPU);
-    free(gridData_CPU);
-    free(sampleDensity_CPU);
-  }
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  free(CPUbin);
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(gridData_d) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sampleDensity_d) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(binStartAddr_d) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sortedSample_d) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_NONE);
-
-  return;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_interface.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_interface.h
deleted file mode 100644
index 0d39e9bb1a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/OpenCL_interface.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-void OpenCL_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity,  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-  const cl_context clContext,  // Pointer to OpenCL Context created by Host
-  const cl_command_queue clCommandQueue,
-  const cl_device_id clDevice,
-  size_t *workItemSizes
-);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/UDTypes.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/UDTypes.h
deleted file mode 100644
index 687fb50157..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/UDTypes.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef _UDTYPES_H__
-#define _UDTYPES_H__
-
-typedef struct{
-  int numSamples;
-  int aquisitionMatrixSize[3];
-  int reconstructionMatrixSize[3];
-  float kMax[3];
-  int gridSize[3];
-  float oversample;
-  float kernelWidth;
-  int binsize;
-  int useLUT;
-}parameters;
-
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-typedef struct{
-  float real;
-  float imag;
-} cmplx;
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/main.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/main.cpp
deleted file mode 100644
index a30172fd61..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/main.cpp
+++ /dev/null
@@ -1,355 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <CL/cl.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "OpenCL_interface.h"
-#include "OpenCL_common.h"
-#include "CPU_kernels.h"
-
-#define PI 3.14159265
-
-char *oclOverhead = "OpenCL Overhead";
-
-/************************************************************ 
- * This function reads the parameters from the file provided
- * as a comman line argument.
- ************************************************************/
-void setParameters(FILE* file, parameters* p){
-  fscanf(file,"aquisition.numsamples=%d\n",&(p->numSamples));
-  fscanf(file,"aquisition.kmax=%f %f %f\n",&(p->kMax[0]), &(p->kMax[1]), &(p->kMax[2]));
-  fscanf(file,"aquisition.matrixSize=%d %d %d\n", &(p->aquisitionMatrixSize[0]), &(p->aquisitionMatrixSize[1]), &(p->aquisitionMatrixSize[2]));
-  fscanf(file,"reconstruction.matrixSize=%d %d %d\n", &(p->reconstructionMatrixSize[0]), &(p->reconstructionMatrixSize[1]), &(p->reconstructionMatrixSize[2]));
-  fscanf(file,"gridding.matrixSize=%d %d %d\n", &(p->gridSize[0]), &(p->gridSize[1]), &(p->gridSize[2]));
-  fscanf(file,"gridding.oversampling=%f\n", &(p->oversample));
-  fscanf(file,"kernel.width=%f\n", &(p->kernelWidth));
-  fscanf(file,"kernel.useLUT=%d\n", &(p->useLUT));
-
-  cl_int ciErrNum;
-  cl_platform_id clPlatform;
-  cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
-  cl_device_id clDevice;
-
-  int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0);
-  if (deviceFound < 0) {
-    fprintf(stderr, "No suitable device was found\n");
-    exit(1);
-  }
-  cl_ulong mem_size;
-  clGetDeviceInfo(clDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
-
-  printf("  Number of samples = %d\n", p->numSamples);
-  printf("  Total amount of GPU memory: %llu bytes\n", (unsigned long long) mem_size);
-  if (p->numSamples > 10000000 && mem_size/1024/1024 < 3000) {
-    printf("  Need at least 3GB of GPU memory for large dataset\n");
-    exit(1);
-  }
-  printf("  Grid Size = %dx%dx%d\n", p->gridSize[0], p->gridSize[1], p->gridSize[2]);
-  printf("  Input Matrix Size = %dx%dx%d\n", p->aquisitionMatrixSize[0], p->aquisitionMatrixSize[1], p->aquisitionMatrixSize[2]);
-  printf("  Recon Matrix Size = %dx%dx%d\n", p->reconstructionMatrixSize[0], p->reconstructionMatrixSize[1], p->reconstructionMatrixSize[2]);
-  printf("  Kernel Width = %f\n", p->kernelWidth);
-  printf("  KMax = %.2f %.2f %.2f\n", p->kMax[0], p->kMax[1], p->kMax[2]);
-  printf("  Oversampling = %f\n", p->oversample);
-  printf("  GPU Binsize = %d\n", p->binsize);
-  printf("  Use LUT = %s\n", (p->useLUT)?"Yes":"No");
-}
-
-/************************************************************ 
- * This function reads the sample point data from the kspace
- * and klocation files (and sdc file if provided) into the
- * sample array.
- * Returns the number of samples read successfully.
- ************************************************************/
-unsigned int readSampleData(parameters params, FILE* uksdata_f, ReconstructionSample* samples){
-  unsigned int i;
-
-  for(i=0; i<params.numSamples; i++){
-    if (feof(uksdata_f)){
-      break;
-    }
-    fread((void*) &(samples[i]), sizeof(ReconstructionSample), 1, uksdata_f);
-  }
-
-  float kScale[3];
-  kScale[0] = float(params.aquisitionMatrixSize[0])/(float(params.reconstructionMatrixSize[0])*float(params.kMax[0]));
-  kScale[1] = float(params.aquisitionMatrixSize[1])/(float(params.reconstructionMatrixSize[1])*float(params.kMax[1]));
-  kScale[2] = float(params.aquisitionMatrixSize[2])/(float(params.reconstructionMatrixSize[2])*float(params.kMax[2]));
-
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-
-  float ax = (kScale[0]*(size_x-1))/2.0;
-  float bx = (float)(size_x-1)/2.0;
-
-  float ay = (kScale[1]*(size_y-1))/2.0;
-  float by = (float)(size_y-1)/2.0;
-
-  float az = (kScale[2]*(size_z-1))/2.0;
-  float bz = (float)(size_z-1)/2.0;
-
-  for(int n=0; n<i; n++){
-    samples[n].kX = floor((samples[n].kX*ax)+bx);
-    samples[n].kY = floor((samples[n].kY*ay)+by);
-    samples[n].kZ = floor((samples[n].kZ*az)+bz);
-  }
-
-  return i;
-}
-
-
-int main (int argc, char* argv[]){
-  struct pb_Parameters* prms;
-  struct pb_TimerSet timers;
-
-  prms = pb_ReadParameters(&argc,argv);
-  pb_InitializeTimerSet(&timers);
-  
-  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  char uksdata[250];
-  parameters params;
-
-  FILE* uksfile_f = NULL;
-  FILE* uksdata_f = NULL;
-
-  strcpy(uksdata,prms->inpFiles[0]);
-  strcat(uksdata,".data");
-
-  uksfile_f = fopen(prms->inpFiles[0],"r");
-  if (uksfile_f == NULL){
-    printf("ERROR: Could not open %s\n",prms->inpFiles[0]);
-    exit(1);
-  }
-
-  printf("\nReading parameters\n");
-
-  if (argc >= 2){
-    params.binsize = atoi(argv[1]);
-  } else { //default binsize value;
-    params.binsize = 128;
-  }
-
-  setParameters(uksfile_f, &params);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  ReconstructionSample* samples; //Input Data
-//  cl_mem samplesPin; 
-  float* LUT; //use look-up table for faster execution on CPU (intermediate data)
-  unsigned int sizeLUT; //set in the function calculateLUT (intermediate data)
-
-  cmplx* gridData; //Output Data
-  float* sampleDensity; //Output Data
-//  cl_mem gridDataPin;
-//  cl_mem sampleDensityPin;
-
-  cmplx* gridData_gold; //Gold Output Data
-  float* sampleDensity_gold; //Gold Output Data
-  
-  cl_int ciErrNum;
-  cl_platform_id clPlatform;
-  cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-  cl_device_id clDevice;
-  cl_context clContext;
-
-  int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0);
-
-  size_t max_alloc_size = 0;
-  (void) clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, 0);
-  size_t global_mem_size = 0;
-  (void) clGetDeviceInfo(clDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &global_mem_size, 0);
-
-  size_t samples_size = params.numSamples*sizeof(ReconstructionSample);
-  int gridNumElems = params.gridSize[0] * params.gridSize[1] * params.gridSize[2];
-  size_t output_size = gridNumElems*sizeof(cmplx);
-
-  if ( (deviceFound < 0) ||
-       ((samples_size+output_size) > global_mem_size) ||
-       (samples_size > max_alloc_size) || 
-       (output_size > max_alloc_size ) ) {
-    fprintf(stderr, "No suitable device was found\n");
-    if(deviceFound >= 0) {
-      fprintf(stderr, "Memory requirements for this dataset exceed device capabilities\n");
-    }
-    exit(1);
-  }
-  
-  cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0};
-  clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  cl_uint workItemDimensions;
-  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
-  
-  size_t workItemSizes[workItemDimensions];
-  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
-  
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-    
-    /*
-  samplesPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
-      params.numSamples*sizeof(ReconstructionSample),
-      NULL, &ciErrNum);
-*/
-  samples = (ReconstructionSample *) malloc ( params.numSamples*sizeof(ReconstructionSample) );
-  
-  /*(ReconstructionSample *) clEnqueueMapBuffer(clCommandQueue, samplesPin, CL_TRUE, CL_MAP_WRITE, 0, params.numSamples*sizeof(ReconstructionSample), 0, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-*/
-  if (samples == NULL){
-    printf("ERROR: Unable to allocate and map memory for input data\n");
-    exit(1);
-  }
-
-
-  uksdata_f = fopen(uksdata,"rb");
-
-  if(uksdata_f == NULL){
-    printf("ERROR: Could not open data file\n");
-    exit(1);
-  }
-
-  printf("Reading input data from files\n");
-
-  unsigned int n = readSampleData(params, uksdata_f, samples);
-  fclose(uksdata_f);
-
-  if (params.useLUT){
-    printf("Generating Look-Up Table\n");
-    float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-    calculateLUT(beta, params.kernelWidth, &LUT, &sizeLUT);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  gridData_gold = (cmplx*) calloc (gridNumElems, sizeof(cmplx));
-  sampleDensity_gold = (float*) calloc (gridNumElems, sizeof(float));
-  if (sampleDensity_gold == NULL || gridData_gold == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  printf("Running gold version\n");
-
-  gridding_Gold(n, params, samples, LUT, sizeLUT, gridData_gold, sampleDensity_gold);
-
-  printf("Running OpenCL version\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-/*
-  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, samplesPin, CL_TRUE, 
-                          0, // Offset in bytes
-                          n*sizeof(ReconstructionSample), // Size of data to write
-                          samples, // Host Source
-  
-                          0, NULL, NULL) );*/
- // OCL_ERRCK_RETVAL ( clFinish(clCommandQueue) );
- 
- /*
-  gridDataPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
-      gridNumElems*sizeof(cmplx), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  */
-  gridData = (cmplx *) malloc ( gridNumElems*sizeof(cmplx) );
-  if (gridData == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-  
-  /*(cmplx *) clEnqueueMapBuffer(clCommandQueue, gridDataPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(cmplx), 0, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  */
-  
-  /*
-  sampleDensityPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
-      gridNumElems*sizeof(float), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  */
-  
-  sampleDensity = (float *) malloc ( gridNumElems*sizeof(float) );
-  if (sampleDensity == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-  
-  /*(float *) clEnqueueMapBuffer(clCommandQueue, sampleDensityPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(float), 0, NULL, NULL, &ciErrNum);
-  */
-  
-  OCL_ERRCK_VAR(ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  if (sampleDensity == NULL || gridData == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  
-  //Interface function to GPU implementation of gridding
-  OpenCL_interface(&timers, n, params, samples, LUT, sizeLUT, gridData, sampleDensity, clContext, clCommandQueue, clDevice, workItemSizes);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  int passed=1;
-  for (int i=0; i<gridNumElems; i++){
-    if(sampleDensity[i] != sampleDensity_gold[i]) {
-      passed=0;
-      break;
-    }
-  }
-  //(passed) ? printf("Comparing GPU and Gold results... PASSED\n"):printf("Comparing GPU and Gold results... FAILED\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  FILE* outfile;
-  if(!(outfile=fopen(prms->outFile,"w")))
-  {
-        printf("Cannot open output file!\n");
-  } else {
-        fwrite(&passed,sizeof(int),1,outfile);
-        fclose(outfile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  if (params.useLUT){
-    free(LUT);
-  }
-  
-  /*
-  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, samplesPin, samples, 0, NULL, NULL) );
-  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, gridDataPin, gridData, 0, NULL, NULL) );
-  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, sampleDensityPin, sampleDensity, 0, NULL, NULL) );
-  
-  clReleaseMemObject(samplesPin);
-  clReleaseMemObject(gridDataPin);
-  clReleaseMemObject(sampleDensityPin);
-  */
-  
-  free(samples);
-  free(gridData);
-  free(sampleDensity);
-  
-  
-  free(gridData_gold);
-  free(sampleDensity_gold);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(prms);
-
-  return 0;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.cl b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.cl
deleted file mode 100644
index c45978a38b..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.cl
+++ /dev/null
@@ -1,198 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
- 
-#define BLOCK_SIZE 1024
-#define GRID_SIZE 65535
-#define NUM_BANKS 16
-#define LOG_NUM_BANKS 4
-
-//#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS + (index) >> (2*LOG_NUM_BANKS))
-#define LNB LOG_NUM_BANKS
-#define CONFLICT_FREE_OFFSET(index) (((unsigned int)(index) >> min((unsigned int)(LNB)+(index), (unsigned int)(32-(2*LNB))))>>(2*LNB))
-#define EXPANDED_SIZE(__x) (__x+(__x>>LOG_NUM_BANKS)+(__x>>(2*LOG_NUM_BANKS)))
-
-////////////////////////////////////////////////////////////////////////////////
-// Kernels
-////////////////////////////////////////////////////////////////////////////////
-__kernel void scan_L1_kernel(unsigned int n, __global unsigned int* dataBase, unsigned int data_offset, __global unsigned int* interBase, unsigned int inter_offset)
-{
-    __local unsigned int s_data[EXPANDED_SIZE(BLOCK_SIZE)]; 
-    
-    __global unsigned int *data = dataBase + data_offset;
-    __global unsigned int *inter = interBase + inter_offset;
-
-    unsigned int thid = get_local_id(0);
-    unsigned int g_ai = get_group_id(0)*2*get_local_size(0) + get_local_id(0);
-    unsigned int g_bi = g_ai + get_local_size(0);
-
-    unsigned int s_ai = thid;
-    unsigned int s_bi = thid + get_local_size(0);
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = (g_ai < n) ? data[g_ai] : 0;
-    s_data[s_bi] = (g_bi < n) ? data[g_bi] : 0;
-
-    unsigned int stride = 1;
-    for (unsigned int d = get_local_size(0); d > 0; d >>= 1) {
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-
-        s_data[bi] += s_data[ai];
-      }
-
-        stride *= 2;
-    }
-
-    if (thid == 0) {
-      unsigned int last = get_local_size(0)*2 -1;
-      last += CONFLICT_FREE_OFFSET(last);
-      inter[get_group_id(0)] = s_data[last];
-      s_data[last] = 0;
-    }
-
-    for (unsigned int d = 1; d <= get_local_size(0); d *= 2) {
-      stride >>= 1;
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-
-        unsigned int t  = s_data[ai];
-        s_data[ai] = s_data[bi];
-        s_data[bi] += t;
-      }
-    }
-    
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    if (g_ai < n) { data[g_ai] = s_data[s_ai]; }
-    if (g_bi < n) { data[g_bi] = s_data[s_bi]; }
-}
-
-
-
-__kernel void scan_inter1_kernel(__global unsigned int* data, unsigned int iter)
-{
-    __local unsigned int s_data[DYN_LOCAL_MEM_SIZE];
-
-    unsigned int thid = get_local_id(0);
-    unsigned int gthid = get_global_id(0);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = 1;
-    for (unsigned int d = get_local_size(0); d > 0; d >>= 1) {
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-        s_data[bi] += s_data[ai];
-      }
-
-      stride *= 2;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-__kernel void scan_inter2_kernel(__global unsigned int* data, unsigned int iter)
-{
-    __local unsigned int s_data[DYN_LOCAL_MEM_SIZE];
-
-    unsigned int thid = get_local_id(0);
-    unsigned int gthid = get_global_id(0);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = get_local_size(0)*2;
-
-    for (unsigned int d = 1; d <= get_local_size(0); d *= 2) {
-      stride >>= 1;
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-
-        unsigned int t  = s_data[ai];
-        s_data[ai] = s_data[bi];
-        s_data[bi] += t;
-      }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-
-__kernel void uniformAdd(unsigned int n, __global unsigned int *dataBase, unsigned int data_offset, __global unsigned int *interBase, unsigned int inter_offset)
-{
-    __local unsigned int uni;
-    
-    __global unsigned int *data = dataBase + data_offset;
-    __global unsigned int *inter = interBase + inter_offset;
-       
-    if (get_local_id(0) == 0) { uni = inter[get_group_id(0)]; }
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    unsigned int g_ai = get_group_id(0)*2*get_local_size(0) + get_local_id(0);
-    unsigned int g_bi = g_ai + get_local_size(0);
-
-    if (g_ai < n) { data[g_ai] += uni; }
-    if (g_bi < n) { data[g_bi] += uni; }
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.cpp
deleted file mode 100644
index 9816308d0c..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <CL/cl.h>
-#include "OpenCL_common.h"
-
-#define GRID_SIZE 65535
-#define NUM_BANKS 16
-#define LOG_NUM_BANKS 4
-
-#define EXPANDED_SIZE(__x) (__x+(__x>>LOG_NUM_BANKS)+(__x>>(2*LOG_NUM_BANKS)))
-
-void scanLargeArray( unsigned int gridNumElems, cl_mem data_d, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes) {
-
-    size_t blockSize = (workItemSizes[0]*2 < 1024) ? workItemSizes[0]*2 : 1024;
-
-    // Run the prescan
-    unsigned int size = (gridNumElems+blockSize-1)/blockSize;
-    
-    unsigned int dim_block;
-    unsigned int current_max = size*blockSize;
-    for (int block_size_lcv = 128; block_size_lcv <= blockSize; block_size_lcv *= 2){
-      unsigned int array_size = block_size_lcv;
-      while(array_size < size){
-        array_size *= block_size_lcv;
-      }
-      if (array_size <= current_max){
-        current_max = array_size;
-        dim_block = block_size_lcv;
-      }
-    }    
-
-    cl_mem inter_d;
-    cl_int ciErrNum;
-    cl_program scanLargeArray_program;
-
-    cl_kernel scan_L1_kernel;
-    cl_kernel scan_inter1_kernel;
-    cl_kernel scan_inter2_kernel;
-    cl_kernel uniformAdd;
-    
-    // allocate device memory input and output arrays
-    unsigned int *zeroData;
-    zeroData = (unsigned int *)calloc( current_max, sizeof(unsigned int) );
-    if (zeroData == NULL) { fprintf(stderr, "Could not allocate host memory! (%s)\n", __FILE__); exit(1); }
-
-    inter_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, current_max*sizeof(unsigned int), zeroData, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-    
-    free(zeroData);
-    
-    char compileOptions[128];
-    //                -cl-nv-verbose // Provides register info for NVIDIA devices
-    // Set all Macros referenced by kernels
-    sprintf(compileOptions, "\
-                -D DYN_LOCAL_MEM_SIZE=%lu",
-                EXPANDED_SIZE(dim_block)
-            );
-  
-    size_t program_length;
-    const char *source_path = "src/opencl_nvidia/scanLargeArray.cl";
-    char *source;
-
-    // Dynamically allocate buffer for source
-    source = oclLoadProgSource(source_path, "", &program_length);
-    if(!source) {
-      fprintf(stderr, "Could not load program source! (%s)\n", __FILE__); exit(1);
-    }
-  	
-    scanLargeArray_program = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-
-    free(source);
-    OCL_ERRCK_RETVAL ( clBuildProgram(scanLargeArray_program, 1, &clDevice, compileOptions, NULL, NULL) ); 
-      
-  /*
-    // Uncomment for build log from compiler for debugging
-    char *build_log;
-    size_t ret_val_size;
-    ciErrNum = clGetProgramBuildInfo(scanLargeArray_program, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-    build_log = (char *)malloc(ret_val_size+1);
-    ciErrNum = clGetProgramBuildInfo(scanLargeArray_program, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-    OCL_ERRCK_VAR(ciErrNum);
-    
-    // to be carefully, terminate with \0
-    // there's no information in the reference whether the string is 0 terminated or not
-    build_log[ret_val_size] = '\0';
-
-    fprintf(stderr, "%s\n", build_log );
-    */   
-        
-    scan_L1_kernel = clCreateKernel(scanLargeArray_program, "scan_L1_kernel", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-      
-    scan_inter1_kernel = clCreateKernel(scanLargeArray_program, "scan_inter1_kernel", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-    scan_inter2_kernel = clCreateKernel(scanLargeArray_program, "scan_inter2_kernel", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);  
-      
-    uniformAdd = clCreateKernel(scanLargeArray_program, "uniformAdd", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-    
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 1, sizeof(cl_mem), (void *)&data_d) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 3, sizeof(cl_mem), (void *)&inter_d) );
-    
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter1_kernel, 0, sizeof(cl_mem), (void *)&inter_d) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter2_kernel, 0, sizeof(cl_mem), (void *)&inter_d) );
-    
-    OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 1, sizeof(cl_mem), (void *)&data_d) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 3, sizeof(cl_mem), (void *)&inter_d) );
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++) {
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*blockSize)) > (GRID_SIZE*blockSize)) ? (GRID_SIZE*blockSize) : (gridNumElems-(i*GRID_SIZE*blockSize));
-        
-        unsigned int data_offset = i*GRID_SIZE*blockSize;
-        unsigned int inter_offset = i*GRID_SIZE;
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 0, sizeof(unsigned int), &numElems) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 2, sizeof(unsigned int), &data_offset) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 4, sizeof(unsigned int), &inter_offset) );
-               
-        size_t block[1] = { blockSize/2 };
-        size_t grid[1] = { gridSize * block[0] };
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, scan_L1_kernel, 1, 0,
-                            grid, block, 0, 0, 0) );
-    }
-
-    unsigned int stride = 1;
-    for (unsigned int d = current_max; d > 1; d /= dim_block) {        
-        size_t block[1] = { dim_block/2 };
-        size_t grid[1] = { (d/dim_block) * block[0] };
-        
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter1_kernel, 1, sizeof(unsigned int), &stride) );
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, scan_inter1_kernel, 1, 0,
-                            grid, block, 0, 0, 0) );
-        
-        stride *= dim_block;
-    }
-    
-    unsigned int singleZero = 0;
-    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, inter_d, CL_TRUE, 
-                          (current_max-1)*sizeof(unsigned int), // Offset in bytes
-                          sizeof(unsigned int), // Size of data to write
-                          &singleZero, // Host Source
-                          0, NULL, NULL) );
-
-    for (unsigned int d = dim_block; d <= current_max; d *= dim_block) {
-        stride /= dim_block;
-        
-        size_t block[1] = { dim_block/2 };
-        size_t grid[1] = { (d/dim_block) * block[0] };
-        
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter2_kernel, 1, sizeof(unsigned int), &stride) );
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, scan_inter2_kernel, 1, 0,
-                            grid, block, 0, 0, 0) );                       
-    }
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++) {
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*blockSize)) > (GRID_SIZE*blockSize)) ? (GRID_SIZE*blockSize) : (gridNumElems-(i*GRID_SIZE*blockSize));
-        
-        unsigned int data_offset = i*GRID_SIZE*blockSize;
-        unsigned int inter_offset = i*GRID_SIZE;
-        OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 0, sizeof(unsigned int), &numElems) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 2, sizeof(unsigned int), &data_offset) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 4, sizeof(unsigned int), &inter_offset) );
-        
-        size_t block[1] = { blockSize/2 };
-        size_t grid[1] = { gridSize * block[0] };
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, uniformAdd, 1, 0,
-                            grid, block, 0, 0, 0) ); 
-    }
-
-    OCL_ERRCK_RETVAL ( clReleaseMemObject(inter_d) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(scan_L1_kernel) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(scan_inter1_kernel) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(scan_inter2_kernel) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(uniformAdd) );
-
-    OCL_ERRCK_RETVAL ( clReleaseProgram(scanLargeArray_program) );
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.h
deleted file mode 100644
index dc4ff0a04a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/scanLargeArray.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-void scanLargeArray( unsigned int gridNumElements, cl_mem data_d, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.cl b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.cl
deleted file mode 100644
index 2da677119c..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.cl
+++ /dev/null
@@ -1,225 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-
-#define UINT32_MAX 4294967295
-#define BITS 4
-#define LNB 4
-
-#define SORT_BS 256
-
-//#define CONFLICT_FREE_OFFSET(index) ((index) >> LNB + (index) >> (2*LNB))
-#define CONFLICT_FREE_OFFSET(index) (((unsigned int)(index) >> min((unsigned int)(LNB)+(index), (unsigned int)(32-(2*LNB))))>>(2*LNB))
-#define BLOCK_P_OFFSET (4*SORT_BS+1+(4*SORT_BS+1)/16+(4*SORT_BS+1)/64)
-
-void scan (__local unsigned int s_data[BLOCK_P_OFFSET]){
-  unsigned int thid = get_local_id(0);
-
-  barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*(get_local_size(0)+thid)+1+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid)+1)] += s_data[2*(get_local_size(0)+thid)+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid))];
-
-  unsigned int stride = 2;
-  for (unsigned int d = get_local_size(0); d > 0; d >>= 1)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      s_data[bi] += s_data[ai];
-    }
-
-    stride *= 2;
-  }
-
-  if (thid == 0){
-    unsigned int last = 4*get_local_size(0)-1;
-    last += CONFLICT_FREE_OFFSET(last);
-    s_data[4*get_local_size(0)+CONFLICT_FREE_OFFSET(4*get_local_size(0))] = s_data[last];
-    s_data[last] = 0;
-  }
-
-  for (unsigned int d = 1; d <= get_local_size(0); d *= 2)
-  {
-    stride >>= 1;
-
-    barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      unsigned int t  = s_data[ai];
-      s_data[ai] = s_data[bi];
-      s_data[bi] += t;
-    }
-  }
-  barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads();
-
-  unsigned int temp = s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)] = s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)];
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += temp;
-
-  unsigned int temp2 = s_data[2*(get_local_size(0)+thid)+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid))];
-  s_data[2*(get_local_size(0)+thid)+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid))] = s_data[2*(get_local_size(0)+thid)+1+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid)+1)];
-  s_data[2*(get_local_size(0)+thid)+1+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid)+1)] += temp2;
-
-  barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads();
-}
-
-__kernel void splitSort(int numElems, int iter, 
-                                 __global unsigned int* keys, 
-                                 __global unsigned int* values, 
-                                 __global unsigned int* histo)
-{
-    __local unsigned int flags[BLOCK_P_OFFSET];
-    __local unsigned int histo_s[1<<BITS];
-
-    const unsigned int tid = get_local_id(0);
-    const unsigned int gid = get_group_id(0)*4*SORT_BS+4*get_local_id(0);
-
-    // Copy input to shared mem. Assumes input is always even numbered
-    uint4 lkey = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX};
-    uint4 lvalue;
-    if (gid < numElems){
-      lkey = *((__global uint4*)(keys+gid));
-      lvalue = *((__global uint4*)(values+gid));
-    }
-
-    if(tid < (1<<BITS)){
-      histo_s[tid] = 0;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    atom_add(histo_s+((lkey.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atom_add(histo_s+((lkey.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atom_add(histo_s+((lkey.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atom_add(histo_s+((lkey.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-
-    uint4 index = (uint4) (4*tid, 4*tid+1, 4*tid+2, 4*tid+3);
-
-    for (int i=BITS*iter; i<BITS*(iter+1);i++){
-      const uint4 flag = (uint4) ( (lkey.x>>i)&0x1,(lkey.y>>i)&0x1,(lkey.z>>i)&0x1,(lkey.w>>i)&0x1 );
-
-      flags[index.x+CONFLICT_FREE_OFFSET(index.x)] = 1<<(16*flag.x);
-      flags[index.y+CONFLICT_FREE_OFFSET(index.y)] = 1<<(16*flag.y);
-      flags[index.z+CONFLICT_FREE_OFFSET(index.z)] = 1<<(16*flag.z);
-      flags[index.w+CONFLICT_FREE_OFFSET(index.w)] = 1<<(16*flag.w);
-
-      scan (flags);
-
-      index.x = (flags[index.x+CONFLICT_FREE_OFFSET(index.x)]>>(16*flag.x))&0xFFFF;
-      index.y = (flags[index.y+CONFLICT_FREE_OFFSET(index.y)]>>(16*flag.y))&0xFFFF;
-      index.z = (flags[index.z+CONFLICT_FREE_OFFSET(index.z)]>>(16*flag.z))&0xFFFF;
-      index.w = (flags[index.w+CONFLICT_FREE_OFFSET(index.w)]>>(16*flag.w))&0xFFFF;
-
-      unsigned short offset = flags[4*get_local_size(0)+CONFLICT_FREE_OFFSET(4*get_local_size(0))]&0xFFFF;
-      index.x += (flag.x) ? offset : 0;
-      index.y += (flag.y) ? offset : 0;
-      index.z += (flag.z) ? offset : 0;
-      index.w += (flag.w) ? offset : 0;
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-    }
-
-    // Write result.
-    if (gid < numElems){
-      keys[get_group_id(0)*4*SORT_BS+index.x] = lkey.x;
-      keys[get_group_id(0)*4*SORT_BS+index.y] = lkey.y;
-      keys[get_group_id(0)*4*SORT_BS+index.z] = lkey.z;
-      keys[get_group_id(0)*4*SORT_BS+index.w] = lkey.w;
-
-      values[get_group_id(0)*4*SORT_BS+index.x] = lvalue.x;
-      values[get_group_id(0)*4*SORT_BS+index.y] = lvalue.y;
-      values[get_group_id(0)*4*SORT_BS+index.z] = lvalue.z;
-      values[get_group_id(0)*4*SORT_BS+index.w] = lvalue.w;
-    }
-    if (tid < (1<<BITS)){
-      histo[get_num_groups(0)*get_local_id(0)+get_group_id(0)] = histo_s[tid];
-    }
-}
-
-__kernel void splitRearrange (int numElems, int iter, 
-                                __global unsigned int* keys_i, 
-                                __global unsigned int* keys_o, 
-                                __global unsigned int* values_i, 
-                                __global unsigned int* values_o, 
-                                __global unsigned int* histo){
-  __local unsigned int histo_s[(1<<BITS)];
-  __local unsigned int array_s[4*SORT_BS];
-  int index = get_group_id(0)*4*SORT_BS + 4*get_local_id(0);
-
-  if (get_local_id(0) < (1<<BITS)){
-    histo_s[get_local_id(0)] = histo[get_num_groups(0)*get_local_id(0)+get_group_id(0)];
-  }
-
-  uint4 mine, value;
-  if (index < numElems){
-    mine = *((__global uint4*)(keys_i+index));
-    value = *((__global uint4*)(values_i+index));
-  } else {
-    mine.x = UINT32_MAX;
-    mine.y = UINT32_MAX;
-    mine.z = UINT32_MAX;
-    mine.w = UINT32_MAX;
-  }
-  
-  uint4 masks = (uint4) ( (mine.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter) );
-
-  ((__local uint4*)array_s)[get_local_id(0)] = masks;
-  barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-  uint4 new_index = (uint4) ( histo_s[masks.x],histo_s[masks.y],histo_s[masks.z],histo_s[masks.w] );
-
-  int i = 4*get_local_id(0)-1;
-  
-  while (i >= 0){
-    if (array_s[i] == masks.x){
-      new_index.x++;
-      i--;
-    } else {
-      break;
-    }
-  }
-
-  new_index.y = (masks.y == masks.x) ? new_index.x+1 : new_index.y;
-  new_index.z = (masks.z == masks.y) ? new_index.y+1 : new_index.z;
-  new_index.w = (masks.w == masks.z) ? new_index.z+1 : new_index.w;
-
-  if (index < numElems){
-    keys_o[new_index.x] = mine.x;
-    values_o[new_index.x] = value.x;
-
-    keys_o[new_index.y] = mine.y;
-    values_o[new_index.y] = value.y;
-
-    keys_o[new_index.z] = mine.z;
-    values_o[new_index.z] = value.z;
-
-    keys_o[new_index.w] = mine.w;
-    values_o[new_index.w] = value.w; 
-  }  
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.cpp
deleted file mode 100644
index 71d4760b98..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include "scanLargeArray.h"
-#include "OpenCL_common.h"
-
-#define UINT32_MAX 4294967295
-#define BITS 4
-#define LNB 4
-
-#define SORT_BS 256
-
-void sort (int numElems, unsigned int max_value, cl_mem* &dkeysPtr, cl_mem* &dvaluesPtr, cl_mem* &dkeys_oPtr, cl_mem* &dvalues_oPtr, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes){
-  
-  size_t block[1] = { SORT_BS };
-  size_t grid[1] = { ((numElems+4*SORT_BS-1)/(4*SORT_BS)) * block[0] };
-
-  unsigned int iterations = 0;
-  while(max_value > 0){
-    max_value >>= BITS;
-    iterations++;
-  }
-
-  cl_int ciErrNum;
-  
-  cl_program sort_program;
-  cl_kernel splitSort;
-  cl_kernel splitRearrange;
-  
-  cl_mem dhisto;
-  cl_mem* original = dkeysPtr;
-
-  unsigned int *zeroData;
-  zeroData = (unsigned int *) calloc( (1<<BITS)*grid[0], sizeof(unsigned int) );
-  if (zeroData == NULL) { fprintf(stderr, "Could not allocate host memory! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-
-  dhisto = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (1<<BITS)*((numElems+4*SORT_BS-1)/(4*SORT_BS))*sizeof(unsigned int), zeroData, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  
-  free(zeroData);
-  
-  //char compileOptions[256];
-  //                -cl-nv-verbose // Provides register info for NVIDIA devices
-  // Set all Macros referenced by kernels
-  /*  sprintf(compileOptions, "\
-                -D CUTOFF2_VAL=%f -D CUTOFF_VAL=%f\
-                -D GRIDSIZE_VAL1=%d -D GRIDSIZE_VAL2=%d -D GRIDSIZE_VAL3=%d\
-                -D SIZE_XY_VAL=%d -D ONE_OVER_CUTOFF2_VAL=%f",
-                cutoff2, cutoff,
-                params.gridSize[0], params.gridSize[1], params.gridSize[2],
-                size_xy, _1overCutoff2
-            );*/ 
-  
-  size_t program_length;
-  const char *source_path = "src/opencl_nvidia/sort.cl";
-  char *source;
-
-  // Dynamically allocate buffer for source
-  source = oclLoadProgSource(source_path, "", &program_length);
-  if(!source) {
-    fprintf(stderr, "Could not load program source (%s)\n", __FILE__); exit(1);
-  }
-  	
-  sort_program = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  	  	
-  free(source);
-  
-  OCL_ERRCK_RETVAL ( clBuildProgram(sort_program, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) );  
-  
-  /*
-  // Uncomment to get build log from compiler for debugging
-  char *build_log;
-       size_t ret_val_size;
-       ciErrNum = clGetProgramBuildInfo(sort_program, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-       build_log = (char *)malloc(ret_val_size+1);
-       ciErrNum = clGetProgramBuildInfo(sort_program, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-       	OCL_ERRCK_VAR(ciErrNum);
-       	
-
-       // to be carefully, terminate with \0
-       // there's no information in the reference whether the string is 0 terminated or not
-       build_log[ret_val_size] = '\0';
-
-       fprintf(stderr, "%s\n", build_log );
-  */
-  
-  splitSort = clCreateKernel(sort_program, "splitSort", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  splitRearrange = clCreateKernel(sort_program, "splitRearrange", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);      
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 0, sizeof(int), &numElems) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 3, sizeof(cl_mem), (void *)dvaluesPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 4, sizeof(cl_mem), (void *)&dhisto) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 0, sizeof(int), &numElems) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 3, sizeof(cl_mem), (void *)dkeys_oPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 4, sizeof(cl_mem), (void *)dvaluesPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 5, sizeof(cl_mem), (void *)dvalues_oPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 6, sizeof(cl_mem), (void *)&dhisto) );
-
-  for (int i=0; i<iterations; i++){
-  
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 1, sizeof(int), &i) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 3, sizeof(cl_mem), (void *)dvaluesPtr) );    
-    OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, splitSort, 1, 0,
-                            grid, block, 0, 0, 0) );
-    
-    scanLargeArray(((numElems+4*SORT_BS-1)/(4*SORT_BS))*(1<<BITS), dhisto, clContext, clCommandQueue, clDevice, workItemSizes);
-
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 1, sizeof(int), &i ) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 3, sizeof(cl_mem), (void *)dkeys_oPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 4, sizeof(cl_mem), (void *)dvaluesPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 5, sizeof(cl_mem), (void *)dvalues_oPtr) );
-
-    OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, splitRearrange, 1, 0,
-                            grid, block, 0, 0, 0) );
-
-    cl_mem* temp = dkeysPtr;
-    dkeysPtr = dkeys_oPtr;
-    dkeys_oPtr = temp;
-
-    temp = dvaluesPtr;
-    dvaluesPtr = dvalues_oPtr;
-    dvalues_oPtr = temp;
-  }
-  
-  OCL_ERRCK_RETVAL ( clReleaseKernel(splitSort) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(splitRearrange) );
-  
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*dkeys_oPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*dvalues_oPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(dhisto) );
-  
-  OCL_ERRCK_RETVAL ( clReleaseProgram(sort_program) );
-
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.h
deleted file mode 100644
index 2f7113bac9..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/opencl_nvidia/sort.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-void sort (int numElems, unsigned int max_value, cl_mem* &dkeys, cl_mem* &dvalues, cl_mem* &dkeys_o, cl_mem* &dvalues_o, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/CPU_kernels.c b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/CPU_kernels.c
deleted file mode 100644
index 87f4c0cbec..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/CPU_kernels.c
+++ /dev/null
@@ -1,353 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "UDTypes.h"
-
-#define max(x,y) ((x<y)?y:x)
-#define min(x,y) ((x>y)?y:x)
-
-#define PI 3.14159265359
-
-float kernel_value_CPU(float v){
-
-  float rValue = 0;
-
-  const float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-  (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-   0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-   0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-   0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-   0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-   0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-   0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-   0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = -num/den;
-
-  return rValue;
-}
-
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT){
-  float v;
-  float cutoff2 = (width*width)/4.0;
-
-  unsigned int size;
-
-  if(width > 0){
-    // compute size of LUT based on kernel width
-    size = (unsigned int)(10000*width);
-
-    // allocate memory
-    (*LUT) = (float*) malloc (size*sizeof(float));
-
-    unsigned int k;
-    for(k=0; k<size; ++k){
-      // compute value to evaluate kernel at
-      // v in the range 0:(_width/2)^2
-      v = (((float)k)/((float)size))*cutoff2;
-
-      // compute kernel value and store
-      (*LUT)[k] = kernel_value_CPU(beta*sqrt(1.0-(v/cutoff2)));
-    }
-    (*sizeLUT) = size;
-  }
-}
-
-float kernel_value_LUT(float v, float* LUT, int sizeLUT, float _1overCutoff2)
-{
-  unsigned int k0;
-  float v0;
-
-  v *= (float)sizeLUT;
-  k0=(unsigned int)(v*_1overCutoff2);
-  v0 = ((float)k0)/_1overCutoff2;
-  return  LUT[k0] + ((v-v0)*(LUT[k0+1]-LUT[k0])/_1overCutoff2);
-}
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity){
-
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  float cutoff = (float)(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int i;
-  for (i=0; i < n; i++){
-    ReconstructionSample pt = sample[i];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0.0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0.0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0.0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* kernel weighting value */
-                  if (params.useLUT){
-        		    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-		          } else {
-		            w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-		          }
-
-                  /* grid data */
-                  gridData[idx].real += (w*pt.real);
-                  gridData[idx].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-                  sampleDensity[idx] += 1.0;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-int gridding_CPU(unsigned int n, parameters params, ReconstructionSample* sample, int* CPUbin, int CPUbin_size, 
-		 float* LUT, int sizeLUT, cmplx* gridData[], float* sampleDensity[], int* indeces[]){
-  unsigned int NxL, NxH;
-  unsigned int NyL, NyH;
-  unsigned int NzL, NzH;
-
-  int nx;
-  int ny;
-  int nz;
-
-  float w;
-  unsigned int idx;
-  unsigned int idx0;
-
-  unsigned int idxZ;
-  unsigned int idxY;
-
-  float Dx2[100];
-  float Dy2[100];
-  float Dz2[100];
-  float *dx2=NULL;
-  float *dy2=NULL;
-  float *dz2=NULL;
-
-  float dy2dz2;
-  float v;
-
-  unsigned int size_x = params.gridSize[0];
-  unsigned int size_y = params.gridSize[1];
-  unsigned int size_z = params.gridSize[2];
-
-  int gridNumElems = size_x*size_y*size_z;
-
-  float cutoff = (float)(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  int pos = 0;
-  int* posArray = (int*) malloc (gridNumElems*sizeof(int));
-  memset(posArray, 0xFF, gridNumElems*sizeof(int));
-  (*indeces) = (int*) malloc (gridNumElems*sizeof(int));
-  (*gridData) = (cmplx*) calloc (gridNumElems,sizeof(cmplx));
-  (*sampleDensity) = (float*) calloc (gridNumElems,sizeof(float));
-
-  if (*gridData == NULL || *sampleDensity == NULL || *indeces == NULL){
-    printf("unable to allocate temporary CPU space\n");
-    exit(1);
-  }
-
-  int i;
-  for (i=0; i < CPUbin_size; i++){
-    ReconstructionSample pt = sample[CPUbin[i]];
-
-    float kx = pt.kX;
-    float ky = pt.kY;
-    float kz = pt.kZ;
-
-    NxL = max((kx - cutoff), 0);
-    NxH = min((kx + cutoff), size_x-1.0);
-
-    NyL = max((ky - cutoff), 0);
-    NyH = min((ky + cutoff), size_y-1.0);
-
-    NzL = max((kz - cutoff), 0);
-    NzH = min((kz + cutoff), size_z-1.0);
-
-    if((pt.real != 0.0 || pt.imag != 0.0) && pt.sdc!=0.0)
-    {
-      for(dz2 = Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        *dz2 = ((kz-nz)*(kz-nz));
-      }
-      for(dx2=Dx2,nx=NxL; nx<=NxH; ++nx,++dx2)
-      {
-        *dx2 = ((kx-nx)*(kx-nx));
-      }
-      for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny,++dy2)
-      {
-        *dy2 = ((ky-ny)*(ky-ny));
-      }
-
-      idxZ = (NzL-1)*size_x*size_y;
-      for(dz2=Dz2, nz=NzL; nz<=NzH; ++nz, ++dz2)
-      {
-        /* linear offset into 3-D matrix to get to zposition */
-        idxZ += size_x*size_y;
-
-        idxY = (NyL-1)*size_x;
-
-        /* loop over x indexes, but only if curent distance is close enough (distance will increase by adding x&y distance) */
-        if((*dz2)<cutoff2)
-        {
-          for(dy2=Dy2, ny=NyL; ny<=NyH; ++ny, ++dy2)
-          {
-            /* linear offset IN ADDITION to idxZ to get to Y position */
-            idxY += size_x;
-
-            dy2dz2=(*dz2)+(*dy2);
-
-            idx0 = idxY + idxZ;
-
-            /* loop over y indexes, but only if curent distance is close enough (distance will increase by adding y distance) */
-            if(dy2dz2<cutoff2)
-            {
-              for(dx2=Dx2, nx=NxL; nx<=NxH; ++nx, ++dx2)
-              {
-                /* value to evaluate kernel at */
-                v = dy2dz2+(*dx2);
-
-                if(v<cutoff2)
-                {
-                  /* kernel weighting value */
-                  if (params.useLUT){
-                    w = kernel_value_LUT(v, LUT, sizeLUT, _1overCutoff2) * pt.sdc;
-                  } else {
-                    w = kernel_value_CPU(beta*sqrt(1.0-(v*_1overCutoff2))) * pt.sdc;
-                  }
-
-                  /* linear index of (x,y,z) point */
-                  idx = nx + idx0;
-
-                  /* grid data */
-                  if(posArray[idx] == -1){
-                    posArray[idx] = pos;
-                    (*indeces)[pos] = idx;
-                    pos++;
-                  }
-
-                  (*gridData)[posArray[idx]].real += (w*pt.real);
-                  (*gridData)[posArray[idx]].imag += (w*pt.imag);
-
-                  /* estimate sample density */
-                  (*sampleDensity)[posArray[idx]] += 1;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  free(posArray);
-  return pos;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/CPU_kernels.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/CPU_kernels.h
deleted file mode 100644
index 1d883f00f7..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/CPU_kernels.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include "stdio.h"
-#include "UDTypes.h"
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-void calculateLUT(float beta, float width, float** LUT, unsigned int* sizeLUT);
-
-int gridding_Gold(unsigned int n, parameters params, ReconstructionSample* sample, float* LUT, unsigned int sizeLUT, cmplx* gridData, float* sampleDensity);
-
-int gridding_CPU(unsigned int n, parameters params, ReconstructionSample* sample, int* CPUbin, int CPUbin_size,
-                 float* LUT, int sizeLUT, cmplx* gridData[], float* sampleDensity[], int* indeces[]);
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/GPU_kernels.cl b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/GPU_kernels.cl
deleted file mode 100644
index 91985ec8e5..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/GPU_kernels.cl
+++ /dev/null
@@ -1,264 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-
-#define TILE 64
-#define LOG_TILE 6
-
-typedef struct{
-  __global float2* data;
-  __global float4* loc;
-} sampleArrayStruct;
-
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-__kernel void binning_kernel (unsigned int n, 
-                              __global ReconstructionSample* sample_g, 
-                              __global unsigned int* idxKey_g,
-                              __global unsigned int* idxValue_g, 
-                              __global unsigned int* binCount_g, 
-                              unsigned int binsize, unsigned int gridNumElems){
-  unsigned int key;
-  unsigned int sampleIdx = get_group_id(0)*get_local_size(0) + get_local_id(0); //blockIdx.x*blockDim.x+threadIdx.x;
-  ReconstructionSample pt;
-  unsigned int binIdx;
-  unsigned int count;
-
-  if (sampleIdx < n){
-    pt = sample_g[sampleIdx];
-
-    binIdx = (unsigned int)(pt.kZ)*SIZE_XY_VAL + (unsigned int)(pt.kY)*GRIDSIZE_VAL1 + (unsigned int)(pt.kX);
-    if (binCount_g[binIdx]<binsize){
-      count = atom_add(binCount_g+binIdx, 1);
-      if (count < binsize){
-        key = binIdx;
-      } else {
-        atom_sub(binCount_g+binIdx, 1);
-        key = gridNumElems;
-      }
-    } else {
-      key = gridNumElems;
-    }
-
-    idxKey_g[sampleIdx] = key;
-    idxValue_g[sampleIdx] = sampleIdx;
-  }
-}
-
-__kernel void reorder_kernel(int n, 
-                               __global unsigned int* idxValue_g, 
-                               __global ReconstructionSample* samples_g, 
- //                              sampleArrayStruct sortedSampleSoA_g
-                               
-                               __global float2* dataptr_g,
-                               unsigned int f2_offset
-//                               __global float4* locptr_g
-                               
-                               ){
-  unsigned int index = get_group_id(0)*get_local_size(0) + get_local_id(0);
-  unsigned int old_index;
-  ReconstructionSample pt;
-
-  if (index < n){
-    old_index = idxValue_g[index];
-    pt = samples_g[old_index];
-
-    float2 data = (float2) (pt.real, pt.imag);
-    float4 loc = (float4) (pt.kX, pt.kY, pt.kZ, pt.sdc);
-
- //   sortedSampleSoA_g.data[index] = data;
- //   sortedSampleSoA_g.loc[index] = loc;
-    
-    dataptr_g[index] = data;
-    ((__global float4*)(dataptr_g+f2_offset))[index] = loc;
-    
-  }
-}
-
-float kernel_value(float v){
-
-  float rValue = 0;
-
-  float z = v*v;
-
-  // polynomials taken from http://ccrma.stanford.edu/CCRMA/Courses/422/projects/kbd/kbdwindow.cpp
-  float num = (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z* (z*
-                (z* 0.210580722890567e-22f  + 0.380715242345326e-19f ) +
-                 0.479440257548300e-16f) + 0.435125971262668e-13f ) +
-                 0.300931127112960e-10f) + 0.160224679395361e-7f  ) +
-                 0.654858370096785e-5f)  + 0.202591084143397e-2f  ) +
-                 0.463076284721000e0f)   + 0.754337328948189e2f   ) +
-                 0.830792541809429e4f)   + 0.571661130563785e6f   ) +
-                 0.216415572361227e8f)   + 0.356644482244025e9f   ) +
-                 0.144048298227235e10f);
-
-  float den = (z*(z*(z-0.307646912682801e4f)+0.347626332405882e7f)-0.144048298227235e10f);
-
-  rValue = native_divide(-num,den);
-  //rValue = __fdividef(-num,den);
-
-  return rValue;
-}
-
-__kernel void gridding_GPU (//sampleArrayStruct sortedSampleSoA_g, 
-                               __global float2* dataptr_g,
-                               unsigned int f2_offset,
-//                               __global float4* locptr_g
-
-                              __global unsigned int* binStartAddr_g, 
-                              __global float2* gridData_g, 
-                              __global float* sampleDensity_g, 
-                              float beta){
-  __local float real_s[TILE];
-  __local float imag_s[TILE];
-  __local float kx_s[TILE];
-  __local float ky_s[TILE];
-  __local float kz_s[TILE];
-  __local float sdc_s[TILE];
-
-  const int flatIdx = 
-  get_local_id(2)*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0);
-  //threadIdx.z*blockDim.y*blockDim.x+threadIdx.y*blockDim.x+threadIdx.x;
-
-  // figure out starting point of the tile
-  const int z0 = (4*get_local_size(2))*(get_group_id(1)/(GRIDSIZE_VAL2/get_local_size(1)));
-  const int y0 = get_local_size(1)*(get_group_id(1)%(GRIDSIZE_VAL2/get_local_size(1)));
-  const int x0 = get_group_id(0)*get_local_size(0);
-
-  const int X  = x0+get_local_id(0);
-  const int Y  = y0+get_local_id(1);
-  const int Z  = z0+get_local_id(2);
-  const int Z1 = Z+get_local_size(2);
-  const int Z2 = Z1+get_local_size(2);
-  const int Z3 = Z2+get_local_size(2);
-
-  const int xl = x0-CEIL_CUTOFF_VAL;
-  const int xL = (xl < 0) ? 0 : xl;
-  const int xh = x0+get_local_size(0)+CUTOFF_VAL;
-  const int xH = (xh >= GRIDSIZE_VAL1) ? GRIDSIZE_VAL1-1 : xh;
-
-  const int yl = y0-CEIL_CUTOFF_VAL;
-  const int yL = (yl < 0) ? 0 : yl;
-  const int yh = y0+get_local_size(1)+CUTOFF_VAL;
-  const int yH = (yh >= GRIDSIZE_VAL2) ? GRIDSIZE_VAL2-1 : yh;
-
-  const int zl = z0-CEIL_CUTOFF_VAL;
-  const int zL = (zl < 0) ? 0 : zl;
-  const int zh = z0+(4*get_local_size(2))+CUTOFF_VAL;
-  const int zH = (zh >= GRIDSIZE_VAL3) ? GRIDSIZE_VAL3-1 : zh;
-
-  const int idx = Z*SIZE_XY_VAL + Y*GRIDSIZE_VAL1 + X;
-  const int idx1 = idx+get_local_size(2)*SIZE_XY_VAL;
-  const int idx2 = idx1+get_local_size(2)*SIZE_XY_VAL;
-  const int idx3 = idx2+get_local_size(2)*SIZE_XY_VAL;
-
-  float2 pt = (float2) (0.0f, 0.0f);
-  float density = 0.0f;
-
-  float2 pt1 = (float2) (0.0f, 0.0f);
-  float density1 = 0.0f;  
-
-  float2 pt2 = (float2) (0.0f, 0.0f);
-  float density2 = 0.0f;
-
-  float2 pt3 = (float2) (0.0f, 0.0f);
-  float density3 = 0.0f;
-
-  for (int z = zL; z <= zH; z++){
-    for (int y = yL; y <= yH; y++){
-      __global const unsigned int *addr = binStartAddr_g+z*SIZE_XY_VAL+ y*GRIDSIZE_VAL1;
-      const unsigned int start = *(addr+xL);
-      const unsigned int end   = *(addr+xH+1);
-      const unsigned int delta = end-start;
-      for (int x = 0; x < ((delta+TILE-1)>>LOG_TILE); x++){
-        int tileSize = ((delta-(x<<LOG_TILE)) > TILE) ? TILE : (delta-(x<<LOG_TILE));
-        int globalIdx = flatIdx+(x<<LOG_TILE);
-        barrier(CLK_LOCAL_MEM_FENCE );
-        if(flatIdx < tileSize){
-          //const float2 data = sortedSampleSoA_g.data[start+globalIdx];
-          //const float4 loc  = sortedSampleSoA_g.loc [start+globalIdx];
-          
-          const float2 data = dataptr_g[start+globalIdx];          
-          const float4 loc  = ((__global float4*)(dataptr_g+f2_offset))[start+globalIdx];                       
-
-          real_s[flatIdx] = data.x;
-          imag_s[flatIdx] = data.y;
-          kx_s  [flatIdx] = loc.x;
-          ky_s  [flatIdx] = loc.y;
-          kz_s  [flatIdx] = loc.z;
-          sdc_s [flatIdx] = loc.w;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE );
-
-        for (int j=0; j< tileSize; j++){
-          const float real = real_s[j];
-          const float imag = imag_s[j];
-          const float sdc = sdc_s[j];
-
-          if((real != 0.0f || imag != 0.0f) && sdc != 0.0f){
-            float v0 = (kx_s[j]-X)*(kx_s[j]-X);
-            v0 += (ky_s[j]-Y)*(ky_s[j]-Y);
-
-            const float v = v0 + (kz_s[j]-Z)*(kz_s[j]-Z);
-            if(v<CUTOFF2_VAL){
-              const float w = kernel_value(beta*sqrt(1.0f-(v*ONE_OVER_CUTOFF2_VAL))) *sdc;
-              pt.x += w*real;
-              pt.y += w*imag;
-              density += 1.0f;
-            }
-
-            const float v1 = v0 + (kz_s[j]-Z1)*(kz_s[j]-Z1);
-            if(v1<CUTOFF2_VAL){
-              const float w = kernel_value(beta*sqrt(1.0f-(v1*ONE_OVER_CUTOFF2_VAL))) *sdc;
-              pt1.x += w*real;
-              pt1.y += w*imag;
-              density1 += 1.0f;
-            }
-
-            const float v2 = v0 + (kz_s[j]-Z2)*(kz_s[j]-Z2);
-            if(v2<CUTOFF2_VAL){
-              const float w = kernel_value(beta*sqrt(1.0f-(v2*ONE_OVER_CUTOFF2_VAL))) *sdc;
-              pt2.x += w*real;
-              pt2.y += w*imag;
-              density2 += 1.0f;
-            }
-
-            const float v3 = v0 + (kz_s[j]-Z3)*(kz_s[j]-Z3);
-            if(v3<CUTOFF2_VAL){
-              const float w = kernel_value(beta*sqrt(1.0f-(v3*ONE_OVER_CUTOFF2_VAL))) *sdc;
-              pt3.x += w*real;
-              pt3.y += w*imag;
-              density3 += 1.0f;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  gridData_g[idx] = pt;
-  sampleDensity_g[idx] = density;
-
-  gridData_g[idx1] = pt1;
-  sampleDensity_g[idx1] = density1;
-
-  gridData_g[idx2] = pt2;
-  sampleDensity_g[idx2] = density2;
-
-  gridData_g[idx3] = pt3;
-  sampleDensity_g[idx3] = density3;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/Makefile b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/Makefile
deleted file mode 100644
index 46bafdc413..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=CPU_kernels.o main.o OpenCL_interface.o scanLargeArray.o sort.o OpenCL_common.o
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_common.cpp
deleted file mode 100644
index 57368eda9a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_common.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-
-
-#include "OpenCL_common.h"
-#include <string.h>
-
-// -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
-  cl_uint numEntries = 16;
-  cl_platform_id clPlatforms[numEntries];
-  cl_uint numPlatforms;
-  
-  cl_device_id clDevices[numEntries];
-  cl_uint numDevices;
-
-  OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
-  fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
-  bool needDevice = true;
-  
-  for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
-
-    cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
-      cl_device_id clDevice = clDevices[id];
-      cl_device_type clDeviceType;
-
-      bool canSatisfy = true;
-      
-      if (reqDeviceType != NULL) {
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
-        if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
-          if (*reqDeviceType != clDeviceType) {
-            canSatisfy = false;
-          }
-        }
-      }
-
-      va_list paramList;
-      va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
-        cl_bool clInfoBool;
-        size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
-        if (clInfoBool != true) {
-          canSatisfy = false;
-        }
-      }
-      
-      va_end(paramList);
-      if (canSatisfy) {
-        *device = clDevice;
-        *platform = clPlatform;
-        needDevice = false;
-        fprintf(stderr, "Chose Device Type: %s\n",
-          (clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other"
-          );
-        if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
-          *reqDeviceType = clDeviceType;
-        }
-      }
-    } // End checking all devices for a platform
-  } // End checking all platforms
-
-  int retVal = -1;
-  if (needDevice) {
-    retVal = -1;
-  } else {
-    retVal = 0;
-  }
-  
-  return retVal;
-}
-
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	  cl_uint maxMemAlloc = 0;
-	  OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_common.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_common.h
deleted file mode 100644
index b063d9c696..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_common.h
+++ /dev/null
@@ -1,26 +0,0 @@
-
-#ifndef __OPENCL_COMMON_H_
-#define __OPENCL_COMMON_H_
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <CL/cl.h>
-
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); }
-    
-#define OCL_SIMPLE_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_interface.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_interface.cpp
deleted file mode 100644
index 5eb7d2a420..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_interface.cpp
+++ /dev/null
@@ -1,382 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <CL/cl.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "scanLargeArray.h"
-#include "CPU_kernels.h"
-
-#include "sort.h"
-#include "scanLargeArray.h"
-#include "OpenCL_common.h"
-
-extern char *oclOverhead;
-
-#define PI 3.14159265359
-
-typedef struct{
-  cl_float2* data;
-  cl_float4* loc;
-} sampleArrayStruct;
-
-// Compare function used for Qsort for CPU computation
-int compare (const void * a, const void * b)
-{
-  return ( *(int*)a - *(int*)b );
-}
-
-/***********************************************************************
- * CUDA_interface is the main function for GPU execution. This
- * implementation uses compact binning to distribute input elements
- * into unit-cubed sized bins. The bins are then visited by GPU
- * threads, where every thread computes the value of one (or small set)
- * of output elements by computing the contributions of elements in 
- * neighboring bins to these output elements.
- *
- * The bins have a limited bin size and everything beyond that bin size
- * is offloaded to the CPU to be computed in parallel with the GPU
- * gridding.
- ***********************************************************************/
-void OpenCL_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity,  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-  cl_context clContext,
-  cl_command_queue clCommandQueue, //const cl_device clDevice
-  const cl_device_id clDevice,
-  size_t *workItemSizes
-){
-
-  /* Initializing all variables */
-  int dims[3] = {8,4,2}; //size of a gridding block on the GPU
-  size_t blockSize = workItemSizes[0];
-
-  /* x, y, z dimensions of the output grid (gridData) */
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-  int size_xy = size_y*size_x;
-
-  int gridNumElems = size_x * size_y * size_z;  // Total number of grid points
-
-  float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-
-  float cutoff = float(params.kernelWidth)/2.0; // cutoff radius
-  float cutoff2 = cutoff*cutoff;                // square of cutoff radius
-  float _1overCutoff2 = 1/cutoff2;              // 1 over square of cutoff radius
-
-  // Padding used to align the structure of arrays used for the sorted input elements
-  int npad = 0;
-  if (n % 64 != 0){
-    npad = 64 - (n%64);
-  }
-
-  /* Declarations of host data structures */
-  cmplx* gridData_CPU;
-  float* sampleDensity_CPU;
-  int* indices_CPU;
-
-  /* Declarations of device data structures */
-  cl_int ciErrNum;
-  cl_mem sample_d;    // Device array for original input array
-  cl_mem sortedSample_d;             // Device array of the sorted (into bins) input elements.
-                                            // This array is accessed by sortedSampleSoA_d in a structure
-                                            //   of arrays manner.
-  cl_mem gridData_d;                // Device array for output grid
-  cl_mem sampleDensity_d;            // Device array for output sample density
-  cl_mem idxKey_d;            // Array of bin indeces generated in the binning kernel
-                                            //   and used to sort the input elements into their
-                                            //   corresponding bins
-  cl_mem idxValue_d;          // This array holds the indices of input elements in the
-                                            //   the original array. This array is sorted using the
-                                            //   the idxKey_d array, and once sorted, it is used in
-                                            //   the reorder kernel to move the actual elements into
-                                            //   their corresponding bins.
-  sampleArrayStruct sortedSampleSoA_d;      // Structure of Arrays which holds the sorted input elements.
-                                            //   Uses sortedSample_d as the underlying physical data
-                                            //   structures
-  //cl_mem binCount_d;          // Zero-initialized array which counts the number of elements
-                                            //   put in each bin. Based on this array, we determine which
-                                            //   elements get offloaded to the CPU
-  cl_mem binStartAddr_d;      // Array of start offset of each of the compact bins
-
-  cl_mem *idxValue_dPtr;
-  cl_mem *idxKey_dPtr;
-  
-  cl_program gpu_kernels;
-  cl_kernel binning_kernel;
-  cl_kernel reorder_kernel;
-  cl_kernel gridding_GPU;
-
-  /* Allocating device memory */
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  unsigned int *zeroData = NULL, *maxIntData = NULL;
-  
-  size_t sizeZeroData = sizeof(float)* 2 * gridNumElems;
-  if ( n*sizeof(ReconstructionSample) > sizeZeroData) {
-    sizeZeroData = n*sizeof(ReconstructionSample);
-  }    
-  if ( (sizeof(unsigned int) * (gridNumElems+1)) > sizeZeroData) {
-    // Not going to be taken, but included just in case since this is used for multiple variables
-    sizeZeroData = sizeof(unsigned int) * (gridNumElems+1);
-  }
-  if ( (((n+3)/4)*4)*sizeof(unsigned int) > sizeZeroData) {
-    sizeZeroData = (((n+3)/4)*4)*sizeof(unsigned int);
-  }
-  
-  zeroData = (unsigned int *) malloc(sizeZeroData);
-  if (zeroData == NULL) { fprintf(stderr, "Could not allocate dummy memset memory\n"); exit(1); }
-  maxIntData = (unsigned int *) malloc((((n+3)/4)*4)*sizeof(unsigned int));
-  if (maxIntData == NULL) { fprintf(stderr, "Could not allocate dummy memset memory\n"); exit(1); }
-  
-  memset(zeroData, 0, sizeZeroData);
-  memset(maxIntData+n, 0xFF, (((n+3)&~(3))-n)*sizeof(unsigned int));
-
-  sortedSample_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (n+npad)*sizeof(ReconstructionSample), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  binStartAddr_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (gridNumElems+1)*sizeof(unsigned int), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  sample_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n*sizeof(ReconstructionSample), sample, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  idxKey_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (((n+3)/4)*4)*sizeof(unsigned int), maxIntData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum); //Pad to nearest multiple of 4 to 
-  idxValue_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (((n+3)/4)*4)*sizeof(unsigned int), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum); //satisfy a property of the sorting kernel.
-
-  idxKey_dPtr = &idxKey_d;
-  idxValue_dPtr = &idxValue_d;
-  
-  free(maxIntData);
-  
-  pb_SwitchToSubTimer(timers, oclOverhead, pb_TimerID_KERNEL);
-
-  char compileOptions[1024];
-  //                -cl-nv-verbose // Provides register info for NVIDIA devices
-  // Set all Macros referenced by kernels
-  sprintf(compileOptions, "\
-                -D CUTOFF2_VAL=%f -D CUTOFF_VAL=%f -D CEIL_CUTOFF_VAL=%f\
-                -D GRIDSIZE_VAL1=%d -D GRIDSIZE_VAL2=%d -D GRIDSIZE_VAL3=%d\
-                -D SIZE_XY_VAL=%d -D ONE_OVER_CUTOFF2_VAL=%f",
-                cutoff2, cutoff, ceil(cutoff),
-                params.gridSize[0], params.gridSize[1], params.gridSize[2],
-                size_xy, _1overCutoff2
-            );
-  
-  size_t program_length;
-  const char *source_path = "src/opencl_nvidia/GPU_kernels.cl";
-  char *source;
-
-  // Dynamically allocate buffer for source
-  source = oclLoadProgSource(source_path, "", &program_length);
-  if(!source) {
-    fprintf(stderr, "Could not load program source (%s) \n", __FILE__); exit(1);
-  }  
-  	
-  gpu_kernels = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  	  	
-  free(source);
-  
-  OCL_ERRCK_RETVAL ( clBuildProgram(gpu_kernels, 1, &clDevice, compileOptions, NULL, NULL) );
-  
-  /*
-  char *build_log;
-  size_t ret_val_size;
-  ciErrNum = clGetProgramBuildInfo(gpu_kernels, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-  build_log = (char *)malloc(ret_val_size+1);
-  ciErrNum = clGetProgramBuildInfo(gpu_kernels, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-  OCL_ERRCK_VAR(ciErrNum);
-       	
-
-  // to be careful, terminate with \0
-  // there's no information in the reference whether the string is 0 terminated or not
-  build_log[ret_val_size] = '\0';
-
-  fprintf(stderr, "%s\n", build_log );
-  */
-  
-  binning_kernel = clCreateKernel(gpu_kernels, "binning_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  reorder_kernel = clCreateKernel(gpu_kernels, "reorder_kernel", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  gridding_GPU = clCreateKernel(gpu_kernels, "gridding_GPU", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-  //sortedSampleSoA_d.data = (cl_float2*)(sortedSample_d);
-  //sortedSampleSoA_d.loc = (cl_float4*)(((float*)sortedSample_d)+2*(n+npad));
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 0, sizeof(unsigned int), &n) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 1, sizeof(cl_mem), (void *)&sample_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 2, sizeof(cl_mem), (void *)idxKey_dPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 3, sizeof(cl_mem), (void *)idxValue_dPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 4, sizeof(cl_mem), (void *)&binStartAddr_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 5, sizeof(int), &(params.binsize)) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(binning_kernel, 6, sizeof(unsigned int), &gridNumElems) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 0, sizeof(unsigned int), &n) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 2, sizeof(cl_mem), (void *)&sample_d) );      
-
-  unsigned int num_float2_offset = (n+npad);
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 3, sizeof(cl_mem), (void *)&sortedSample_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 4, sizeof(unsigned int), &num_float2_offset) );
-  
-  size_t block1[1] = { blockSize };
-  size_t grid1[1] = { ((n+blockSize-1)/blockSize)*block1[0] };
-    
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 1: Perform binning. This kernel determines which output bin each input element
-   * goes into. Any excess (beyond binsize) is put in the CPU bin
-   */
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, binning_kernel, 1, 0,
-                            grid1, block1, 0, 0, 0) );
-
-  /* STEP 2: Sort the index-value pair generate in the binning kernel */
-  cl_mem dkeys_o = clCreateBuffer(clContext, CL_MEM_READ_WRITE, n*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  cl_mem dvalues_o = clCreateBuffer(clContext, CL_MEM_READ_WRITE, n*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-    
-  cl_mem *dkeys_oPtr = &dkeys_o;
-  cl_mem *dvalues_oPtr = &dvalues_o;
-  
-  cl_mem *beforePointer = idxKey_dPtr;
-
-  sort(n, gridNumElems+1, idxKey_dPtr, idxValue_dPtr, dkeys_oPtr, dvalues_oPtr, clContext, clCommandQueue, clDevice, workItemSizes);
-
-  /* STEP 3: Reorder the input data, based on the sorted values from Step 2.
-   * this step also involves changing the data from array of structs to a struct
-   * of arrays. Also in this kernel, we populate an array with the starting index
-   * of every output bin features in the input array, based on the sorted indices 
-   * from Step 2.
-   * At the end of this step, we copy the start address and list of input elements
-   * that will be computed on the CPU.
-   */
-  OCL_ERRCK_RETVAL( clSetKernelArg(reorder_kernel, 1, sizeof(cl_mem), (void *)idxValue_dPtr) );
-  
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, reorder_kernel, 1, 0,
-                            grid1, block1, 0, 0, 0) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-  
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*idxKey_dPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sample_d) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  /* STEP 4: In this step we generate the ADD scan of the array of starting indices
-   * of the output bins. The result is an array that contains the starting address of
-   * every output bin.
-   */
-  scanLargeArray(gridNumElems+1, binStartAddr_d, clContext, clCommandQueue, clDevice, workItemSizes);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  // Copy back to the CPU the indices of the input elements that will be processed on the CPU
-  unsigned int cpuStart;    
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, binStartAddr_d, CL_TRUE, 
-                          gridNumElems*sizeof(unsigned int), // Offset in bytes
-                          sizeof(unsigned int), // Size of data to read
-                          &cpuStart, // Host Source
-                          0, NULL, NULL) );
-
-  int CPUbin_size = int(n)-int(cpuStart);
-
-  int* CPUbin;
-  CPUbin = (int *) malloc(CPUbin_size*sizeof(unsigned int));
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, *idxValue_dPtr, CL_TRUE, 
-                          cpuStart*sizeof(unsigned int), // Offset in bytes
-                          CPUbin_size*sizeof(unsigned int), // Size of data to read
-                          CPUbin, // Host Source
-                          0, NULL, NULL) );
-
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*idxValue_dPtr) );
-
-  /* STEP 5: Perform the binning on the GPU. The results are computed in a gather fashion
-   * where each thread computes the value of one output element by reading the relevant
-   * bins.
-   */
-  gridData_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, gridNumElems*sizeof(cmplx), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-  sampleDensity_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, gridNumElems*sizeof(float), zeroData, &ciErrNum);  OCL_ERRCK_VAR(ciErrNum);
-
-  free(zeroData);
-
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-
-  size_t block2[3] = {dims[0], dims[1], dims[2]};
-  size_t grid2[3] = { (size_x/dims[0]) * block2[0], ((size_y*size_z)/(4*dims[1]*dims[2])) * block2[1], 1 * block2[2] };
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 0, sizeof(cl_mem), (void *)&sortedSample_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 1, sizeof(unsigned int), &num_float2_offset) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 2, sizeof(cl_mem), (void *)&binStartAddr_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 3, sizeof(cl_mem), (void *)&gridData_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 4, sizeof(cl_mem), (void *)&sampleDensity_d) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(gridding_GPU, 5, sizeof(float), &beta) );
-
-
-  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, gridding_GPU, 3, 0,
-                            grid2, block2, 0, 0, 0) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-
-  qsort(CPUbin, CPUbin_size, sizeof(int), compare); //Sorting helps cache locality of input element array
-  int num = gridding_CPU(n, params, sample, CPUbin, CPUbin_size, LUT, sizeLUT, &gridData_CPU, &sampleDensity_CPU, &indices_CPU);
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  /* Copying the results from the Device to the Host */
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, sampleDensity_d, CL_TRUE, 
-                          0, // Offset in bytes
-                          gridNumElems*sizeof(float), // Size of data to write
-                          sampleDensity, // Host Source
-                          0, NULL, NULL) );                          
-                          
-  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, gridData_d, CL_TRUE, 
-                          0, // Offset in bytes
-                          gridNumElems*sizeof(cmplx), // Size of data to write
-                          gridData, // Host Source
-                          0, NULL, NULL) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-
-  /* STEP 6: Computing the contributions of the sample points handled by the Host
-   * and adding those to the GPU results.
-   */
-  for (int i=0; i< num; i++){
-    gridData[indices_CPU[i]].real += gridData_CPU[i].real;
-    gridData[indices_CPU[i]].imag += gridData_CPU[i].imag;
-    sampleDensity[indices_CPU[i]] += sampleDensity_CPU[i];
-  }
-
-  if (gridData_CPU != NULL){
-    free(indices_CPU);
-    free(gridData_CPU);
-    free(sampleDensity_CPU);
-  }
-
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-  free(CPUbin);
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(gridData_d) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sampleDensity_d) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(binStartAddr_d) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(sortedSample_d) );
-
-  pb_SwitchToTimer(timers, pb_TimerID_NONE);
-
-  return;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_interface.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_interface.h
deleted file mode 100644
index 0d39e9bb1a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/OpenCL_interface.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-void OpenCL_interface (
-  struct pb_TimerSet* timers,
-  unsigned int n,       // Number of input elements
-  parameters params,    // Parameter struct which defines output gridSize, cutoff distance, etc.
-  ReconstructionSample* sample, // Array of input elements
-  float* LUT,           // Precomputed LUT table of Kaiser-Bessel function. 
-                          // Used for computation on CPU instead of using the function every time
-  int sizeLUT,          // Size of LUT
-  cmplx* gridData,      // Array of output grid points. Each element has a real and imaginary component
-  float* sampleDensity,  // Array of same size as gridData couting the number of contributions
-                          // to each grid point in the gridData array
-  const cl_context clContext,  // Pointer to OpenCL Context created by Host
-  const cl_command_queue clCommandQueue,
-  const cl_device_id clDevice,
-  size_t *workItemSizes
-);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/UDTypes.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/UDTypes.h
deleted file mode 100644
index 687fb50157..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/UDTypes.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#ifndef _UDTYPES_H__
-#define _UDTYPES_H__
-
-typedef struct{
-  int numSamples;
-  int aquisitionMatrixSize[3];
-  int reconstructionMatrixSize[3];
-  float kMax[3];
-  int gridSize[3];
-  float oversample;
-  float kernelWidth;
-  int binsize;
-  int useLUT;
-}parameters;
-
-typedef struct{
-  float real;
-  float imag;
-  float kX;
-  float kY;
-  float kZ;
-  float sdc;
-} ReconstructionSample;
-
-typedef struct{
-  float real;
-  float imag;
-} cmplx;
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/main.cpp
deleted file mode 100644
index a30172fd61..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/main.cpp
+++ /dev/null
@@ -1,355 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <CL/cl.h>
-#include "parboil.h"
-
-#include "UDTypes.h"
-#include "OpenCL_interface.h"
-#include "OpenCL_common.h"
-#include "CPU_kernels.h"
-
-#define PI 3.14159265
-
-char *oclOverhead = "OpenCL Overhead";
-
-/************************************************************ 
- * This function reads the parameters from the file provided
- * as a comman line argument.
- ************************************************************/
-void setParameters(FILE* file, parameters* p){
-  fscanf(file,"aquisition.numsamples=%d\n",&(p->numSamples));
-  fscanf(file,"aquisition.kmax=%f %f %f\n",&(p->kMax[0]), &(p->kMax[1]), &(p->kMax[2]));
-  fscanf(file,"aquisition.matrixSize=%d %d %d\n", &(p->aquisitionMatrixSize[0]), &(p->aquisitionMatrixSize[1]), &(p->aquisitionMatrixSize[2]));
-  fscanf(file,"reconstruction.matrixSize=%d %d %d\n", &(p->reconstructionMatrixSize[0]), &(p->reconstructionMatrixSize[1]), &(p->reconstructionMatrixSize[2]));
-  fscanf(file,"gridding.matrixSize=%d %d %d\n", &(p->gridSize[0]), &(p->gridSize[1]), &(p->gridSize[2]));
-  fscanf(file,"gridding.oversampling=%f\n", &(p->oversample));
-  fscanf(file,"kernel.width=%f\n", &(p->kernelWidth));
-  fscanf(file,"kernel.useLUT=%d\n", &(p->useLUT));
-
-  cl_int ciErrNum;
-  cl_platform_id clPlatform;
-  cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
-  cl_device_id clDevice;
-
-  int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0);
-  if (deviceFound < 0) {
-    fprintf(stderr, "No suitable device was found\n");
-    exit(1);
-  }
-  cl_ulong mem_size;
-  clGetDeviceInfo(clDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
-
-  printf("  Number of samples = %d\n", p->numSamples);
-  printf("  Total amount of GPU memory: %llu bytes\n", (unsigned long long) mem_size);
-  if (p->numSamples > 10000000 && mem_size/1024/1024 < 3000) {
-    printf("  Need at least 3GB of GPU memory for large dataset\n");
-    exit(1);
-  }
-  printf("  Grid Size = %dx%dx%d\n", p->gridSize[0], p->gridSize[1], p->gridSize[2]);
-  printf("  Input Matrix Size = %dx%dx%d\n", p->aquisitionMatrixSize[0], p->aquisitionMatrixSize[1], p->aquisitionMatrixSize[2]);
-  printf("  Recon Matrix Size = %dx%dx%d\n", p->reconstructionMatrixSize[0], p->reconstructionMatrixSize[1], p->reconstructionMatrixSize[2]);
-  printf("  Kernel Width = %f\n", p->kernelWidth);
-  printf("  KMax = %.2f %.2f %.2f\n", p->kMax[0], p->kMax[1], p->kMax[2]);
-  printf("  Oversampling = %f\n", p->oversample);
-  printf("  GPU Binsize = %d\n", p->binsize);
-  printf("  Use LUT = %s\n", (p->useLUT)?"Yes":"No");
-}
-
-/************************************************************ 
- * This function reads the sample point data from the kspace
- * and klocation files (and sdc file if provided) into the
- * sample array.
- * Returns the number of samples read successfully.
- ************************************************************/
-unsigned int readSampleData(parameters params, FILE* uksdata_f, ReconstructionSample* samples){
-  unsigned int i;
-
-  for(i=0; i<params.numSamples; i++){
-    if (feof(uksdata_f)){
-      break;
-    }
-    fread((void*) &(samples[i]), sizeof(ReconstructionSample), 1, uksdata_f);
-  }
-
-  float kScale[3];
-  kScale[0] = float(params.aquisitionMatrixSize[0])/(float(params.reconstructionMatrixSize[0])*float(params.kMax[0]));
-  kScale[1] = float(params.aquisitionMatrixSize[1])/(float(params.reconstructionMatrixSize[1])*float(params.kMax[1]));
-  kScale[2] = float(params.aquisitionMatrixSize[2])/(float(params.reconstructionMatrixSize[2])*float(params.kMax[2]));
-
-  int size_x = params.gridSize[0];
-  int size_y = params.gridSize[1];
-  int size_z = params.gridSize[2];
-
-  float ax = (kScale[0]*(size_x-1))/2.0;
-  float bx = (float)(size_x-1)/2.0;
-
-  float ay = (kScale[1]*(size_y-1))/2.0;
-  float by = (float)(size_y-1)/2.0;
-
-  float az = (kScale[2]*(size_z-1))/2.0;
-  float bz = (float)(size_z-1)/2.0;
-
-  for(int n=0; n<i; n++){
-    samples[n].kX = floor((samples[n].kX*ax)+bx);
-    samples[n].kY = floor((samples[n].kY*ay)+by);
-    samples[n].kZ = floor((samples[n].kZ*az)+bz);
-  }
-
-  return i;
-}
-
-
-int main (int argc, char* argv[]){
-  struct pb_Parameters* prms;
-  struct pb_TimerSet timers;
-
-  prms = pb_ReadParameters(&argc,argv);
-  pb_InitializeTimerSet(&timers);
-  
-  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  char uksdata[250];
-  parameters params;
-
-  FILE* uksfile_f = NULL;
-  FILE* uksdata_f = NULL;
-
-  strcpy(uksdata,prms->inpFiles[0]);
-  strcat(uksdata,".data");
-
-  uksfile_f = fopen(prms->inpFiles[0],"r");
-  if (uksfile_f == NULL){
-    printf("ERROR: Could not open %s\n",prms->inpFiles[0]);
-    exit(1);
-  }
-
-  printf("\nReading parameters\n");
-
-  if (argc >= 2){
-    params.binsize = atoi(argv[1]);
-  } else { //default binsize value;
-    params.binsize = 128;
-  }
-
-  setParameters(uksfile_f, &params);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  ReconstructionSample* samples; //Input Data
-//  cl_mem samplesPin; 
-  float* LUT; //use look-up table for faster execution on CPU (intermediate data)
-  unsigned int sizeLUT; //set in the function calculateLUT (intermediate data)
-
-  cmplx* gridData; //Output Data
-  float* sampleDensity; //Output Data
-//  cl_mem gridDataPin;
-//  cl_mem sampleDensityPin;
-
-  cmplx* gridData_gold; //Gold Output Data
-  float* sampleDensity_gold; //Gold Output Data
-  
-  cl_int ciErrNum;
-  cl_platform_id clPlatform;
-  cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-  cl_device_id clDevice;
-  cl_context clContext;
-
-  int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0);
-
-  size_t max_alloc_size = 0;
-  (void) clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, 0);
-  size_t global_mem_size = 0;
-  (void) clGetDeviceInfo(clDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &global_mem_size, 0);
-
-  size_t samples_size = params.numSamples*sizeof(ReconstructionSample);
-  int gridNumElems = params.gridSize[0] * params.gridSize[1] * params.gridSize[2];
-  size_t output_size = gridNumElems*sizeof(cmplx);
-
-  if ( (deviceFound < 0) ||
-       ((samples_size+output_size) > global_mem_size) ||
-       (samples_size > max_alloc_size) || 
-       (output_size > max_alloc_size ) ) {
-    fprintf(stderr, "No suitable device was found\n");
-    if(deviceFound >= 0) {
-      fprintf(stderr, "Memory requirements for this dataset exceed device capabilities\n");
-    }
-    exit(1);
-  }
-  
-  cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0};
-  clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  cl_uint workItemDimensions;
-  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
-  
-  size_t workItemSizes[workItemDimensions];
-  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
-  
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-    
-    /*
-  samplesPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
-      params.numSamples*sizeof(ReconstructionSample),
-      NULL, &ciErrNum);
-*/
-  samples = (ReconstructionSample *) malloc ( params.numSamples*sizeof(ReconstructionSample) );
-  
-  /*(ReconstructionSample *) clEnqueueMapBuffer(clCommandQueue, samplesPin, CL_TRUE, CL_MAP_WRITE, 0, params.numSamples*sizeof(ReconstructionSample), 0, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-*/
-  if (samples == NULL){
-    printf("ERROR: Unable to allocate and map memory for input data\n");
-    exit(1);
-  }
-
-
-  uksdata_f = fopen(uksdata,"rb");
-
-  if(uksdata_f == NULL){
-    printf("ERROR: Could not open data file\n");
-    exit(1);
-  }
-
-  printf("Reading input data from files\n");
-
-  unsigned int n = readSampleData(params, uksdata_f, samples);
-  fclose(uksdata_f);
-
-  if (params.useLUT){
-    printf("Generating Look-Up Table\n");
-    float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8);
-    calculateLUT(beta, params.kernelWidth, &LUT, &sizeLUT);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  gridData_gold = (cmplx*) calloc (gridNumElems, sizeof(cmplx));
-  sampleDensity_gold = (float*) calloc (gridNumElems, sizeof(float));
-  if (sampleDensity_gold == NULL || gridData_gold == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  printf("Running gold version\n");
-
-  gridding_Gold(n, params, samples, LUT, sizeLUT, gridData_gold, sampleDensity_gold);
-
-  printf("Running OpenCL version\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-/*
-  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, samplesPin, CL_TRUE, 
-                          0, // Offset in bytes
-                          n*sizeof(ReconstructionSample), // Size of data to write
-                          samples, // Host Source
-  
-                          0, NULL, NULL) );*/
- // OCL_ERRCK_RETVAL ( clFinish(clCommandQueue) );
- 
- /*
-  gridDataPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
-      gridNumElems*sizeof(cmplx), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  */
-  gridData = (cmplx *) malloc ( gridNumElems*sizeof(cmplx) );
-  if (gridData == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-  
-  /*(cmplx *) clEnqueueMapBuffer(clCommandQueue, gridDataPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(cmplx), 0, NULL, NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  */
-  
-  /*
-  sampleDensityPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, 
-      gridNumElems*sizeof(float), NULL, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  */
-  
-  sampleDensity = (float *) malloc ( gridNumElems*sizeof(float) );
-  if (sampleDensity == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-  
-  /*(float *) clEnqueueMapBuffer(clCommandQueue, sampleDensityPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(float), 0, NULL, NULL, &ciErrNum);
-  */
-  
-  OCL_ERRCK_VAR(ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  
-  if (sampleDensity == NULL || gridData == NULL){
-    printf("ERROR: Unable to allocate memory for output data\n");
-    exit(1);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  
-  //Interface function to GPU implementation of gridding
-  OpenCL_interface(&timers, n, params, samples, LUT, sizeLUT, gridData, sampleDensity, clContext, clCommandQueue, clDevice, workItemSizes);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  int passed=1;
-  for (int i=0; i<gridNumElems; i++){
-    if(sampleDensity[i] != sampleDensity_gold[i]) {
-      passed=0;
-      break;
-    }
-  }
-  //(passed) ? printf("Comparing GPU and Gold results... PASSED\n"):printf("Comparing GPU and Gold results... FAILED\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  FILE* outfile;
-  if(!(outfile=fopen(prms->outFile,"w")))
-  {
-        printf("Cannot open output file!\n");
-  } else {
-        fwrite(&passed,sizeof(int),1,outfile);
-        fclose(outfile);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  if (params.useLUT){
-    free(LUT);
-  }
-  
-  /*
-  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, samplesPin, samples, 0, NULL, NULL) );
-  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, gridDataPin, gridData, 0, NULL, NULL) );
-  OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, sampleDensityPin, sampleDensity, 0, NULL, NULL) );
-  
-  clReleaseMemObject(samplesPin);
-  clReleaseMemObject(gridDataPin);
-  clReleaseMemObject(sampleDensityPin);
-  */
-  
-  free(samples);
-  free(gridData);
-  free(sampleDensity);
-  
-  
-  free(gridData_gold);
-  free(sampleDensity_gold);
-
-  printf("\n");
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(prms);
-
-  return 0;
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.cl b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.cl
deleted file mode 100644
index c45978a38b..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.cl
+++ /dev/null
@@ -1,198 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
- 
-#define BLOCK_SIZE 1024
-#define GRID_SIZE 65535
-#define NUM_BANKS 16
-#define LOG_NUM_BANKS 4
-
-//#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS + (index) >> (2*LOG_NUM_BANKS))
-#define LNB LOG_NUM_BANKS
-#define CONFLICT_FREE_OFFSET(index) (((unsigned int)(index) >> min((unsigned int)(LNB)+(index), (unsigned int)(32-(2*LNB))))>>(2*LNB))
-#define EXPANDED_SIZE(__x) (__x+(__x>>LOG_NUM_BANKS)+(__x>>(2*LOG_NUM_BANKS)))
-
-////////////////////////////////////////////////////////////////////////////////
-// Kernels
-////////////////////////////////////////////////////////////////////////////////
-__kernel void scan_L1_kernel(unsigned int n, __global unsigned int* dataBase, unsigned int data_offset, __global unsigned int* interBase, unsigned int inter_offset)
-{
-    __local unsigned int s_data[EXPANDED_SIZE(BLOCK_SIZE)]; 
-    
-    __global unsigned int *data = dataBase + data_offset;
-    __global unsigned int *inter = interBase + inter_offset;
-
-    unsigned int thid = get_local_id(0);
-    unsigned int g_ai = get_group_id(0)*2*get_local_size(0) + get_local_id(0);
-    unsigned int g_bi = g_ai + get_local_size(0);
-
-    unsigned int s_ai = thid;
-    unsigned int s_bi = thid + get_local_size(0);
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = (g_ai < n) ? data[g_ai] : 0;
-    s_data[s_bi] = (g_bi < n) ? data[g_bi] : 0;
-
-    unsigned int stride = 1;
-    for (unsigned int d = get_local_size(0); d > 0; d >>= 1) {
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-
-        s_data[bi] += s_data[ai];
-      }
-
-        stride *= 2;
-    }
-
-    if (thid == 0) {
-      unsigned int last = get_local_size(0)*2 -1;
-      last += CONFLICT_FREE_OFFSET(last);
-      inter[get_group_id(0)] = s_data[last];
-      s_data[last] = 0;
-    }
-
-    for (unsigned int d = 1; d <= get_local_size(0); d *= 2) {
-      stride >>= 1;
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-
-        unsigned int t  = s_data[ai];
-        s_data[ai] = s_data[bi];
-        s_data[bi] += t;
-      }
-    }
-    
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    if (g_ai < n) { data[g_ai] = s_data[s_ai]; }
-    if (g_bi < n) { data[g_bi] = s_data[s_bi]; }
-}
-
-
-
-__kernel void scan_inter1_kernel(__global unsigned int* data, unsigned int iter)
-{
-    __local unsigned int s_data[DYN_LOCAL_MEM_SIZE];
-
-    unsigned int thid = get_local_id(0);
-    unsigned int gthid = get_global_id(0);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = 1;
-    for (unsigned int d = get_local_size(0); d > 0; d >>= 1) {
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-        s_data[bi] += s_data[ai];
-      }
-
-      stride *= 2;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-__kernel void scan_inter2_kernel(__global unsigned int* data, unsigned int iter)
-{
-    __local unsigned int s_data[DYN_LOCAL_MEM_SIZE];
-
-    unsigned int thid = get_local_id(0);
-    unsigned int gthid = get_global_id(0);
-    unsigned int gi = 2*iter*gthid;
-    unsigned int g_ai = gi + iter - 1;
-    unsigned int g_bi = g_ai + iter;
-
-    unsigned int s_ai = 2*thid;
-    unsigned int s_bi = 2*thid + 1;
-
-    s_ai += CONFLICT_FREE_OFFSET(s_ai);
-    s_bi += CONFLICT_FREE_OFFSET(s_bi);
-
-    s_data[s_ai] = data[g_ai];
-    s_data[s_bi] = data[g_bi];
-
-    unsigned int stride = get_local_size(0)*2;
-
-    for (unsigned int d = 1; d <= get_local_size(0); d *= 2) {
-      stride >>= 1;
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-      if (thid < d) {
-        unsigned int i  = 2*stride*thid;
-        unsigned int ai = i + stride - 1;
-        unsigned int bi = ai + stride;
-
-        ai += CONFLICT_FREE_OFFSET(ai);
-        bi += CONFLICT_FREE_OFFSET(bi);
-
-        unsigned int t  = s_data[ai];
-        s_data[ai] = s_data[bi];
-        s_data[bi] += t;
-      }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    data[g_ai] = s_data[s_ai];
-    data[g_bi] = s_data[s_bi];
-}
-
-
-__kernel void uniformAdd(unsigned int n, __global unsigned int *dataBase, unsigned int data_offset, __global unsigned int *interBase, unsigned int inter_offset)
-{
-    __local unsigned int uni;
-    
-    __global unsigned int *data = dataBase + data_offset;
-    __global unsigned int *inter = interBase + inter_offset;
-       
-    if (get_local_id(0) == 0) { uni = inter[get_group_id(0)]; }
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    unsigned int g_ai = get_group_id(0)*2*get_local_size(0) + get_local_id(0);
-    unsigned int g_bi = g_ai + get_local_size(0);
-
-    if (g_ai < n) { data[g_ai] += uni; }
-    if (g_bi < n) { data[g_bi] += uni; }
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.cpp
deleted file mode 100644
index 9816308d0c..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <CL/cl.h>
-#include "OpenCL_common.h"
-
-#define GRID_SIZE 65535
-#define NUM_BANKS 16
-#define LOG_NUM_BANKS 4
-
-#define EXPANDED_SIZE(__x) (__x+(__x>>LOG_NUM_BANKS)+(__x>>(2*LOG_NUM_BANKS)))
-
-void scanLargeArray( unsigned int gridNumElems, cl_mem data_d, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes) {
-
-    size_t blockSize = (workItemSizes[0]*2 < 1024) ? workItemSizes[0]*2 : 1024;
-
-    // Run the prescan
-    unsigned int size = (gridNumElems+blockSize-1)/blockSize;
-    
-    unsigned int dim_block;
-    unsigned int current_max = size*blockSize;
-    for (int block_size_lcv = 128; block_size_lcv <= blockSize; block_size_lcv *= 2){
-      unsigned int array_size = block_size_lcv;
-      while(array_size < size){
-        array_size *= block_size_lcv;
-      }
-      if (array_size <= current_max){
-        current_max = array_size;
-        dim_block = block_size_lcv;
-      }
-    }    
-
-    cl_mem inter_d;
-    cl_int ciErrNum;
-    cl_program scanLargeArray_program;
-
-    cl_kernel scan_L1_kernel;
-    cl_kernel scan_inter1_kernel;
-    cl_kernel scan_inter2_kernel;
-    cl_kernel uniformAdd;
-    
-    // allocate device memory input and output arrays
-    unsigned int *zeroData;
-    zeroData = (unsigned int *)calloc( current_max, sizeof(unsigned int) );
-    if (zeroData == NULL) { fprintf(stderr, "Could not allocate host memory! (%s)\n", __FILE__); exit(1); }
-
-    inter_d = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, current_max*sizeof(unsigned int), zeroData, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-    
-    free(zeroData);
-    
-    char compileOptions[128];
-    //                -cl-nv-verbose // Provides register info for NVIDIA devices
-    // Set all Macros referenced by kernels
-    sprintf(compileOptions, "\
-                -D DYN_LOCAL_MEM_SIZE=%lu",
-                EXPANDED_SIZE(dim_block)
-            );
-  
-    size_t program_length;
-    const char *source_path = "src/opencl_nvidia/scanLargeArray.cl";
-    char *source;
-
-    // Dynamically allocate buffer for source
-    source = oclLoadProgSource(source_path, "", &program_length);
-    if(!source) {
-      fprintf(stderr, "Could not load program source! (%s)\n", __FILE__); exit(1);
-    }
-  	
-    scanLargeArray_program = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-
-    free(source);
-    OCL_ERRCK_RETVAL ( clBuildProgram(scanLargeArray_program, 1, &clDevice, compileOptions, NULL, NULL) ); 
-      
-  /*
-    // Uncomment for build log from compiler for debugging
-    char *build_log;
-    size_t ret_val_size;
-    ciErrNum = clGetProgramBuildInfo(scanLargeArray_program, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-    build_log = (char *)malloc(ret_val_size+1);
-    ciErrNum = clGetProgramBuildInfo(scanLargeArray_program, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-    OCL_ERRCK_VAR(ciErrNum);
-    
-    // to be carefully, terminate with \0
-    // there's no information in the reference whether the string is 0 terminated or not
-    build_log[ret_val_size] = '\0';
-
-    fprintf(stderr, "%s\n", build_log );
-    */   
-        
-    scan_L1_kernel = clCreateKernel(scanLargeArray_program, "scan_L1_kernel", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-      
-    scan_inter1_kernel = clCreateKernel(scanLargeArray_program, "scan_inter1_kernel", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-    scan_inter2_kernel = clCreateKernel(scanLargeArray_program, "scan_inter2_kernel", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);  
-      
-    uniformAdd = clCreateKernel(scanLargeArray_program, "uniformAdd", &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-    
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 1, sizeof(cl_mem), (void *)&data_d) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 3, sizeof(cl_mem), (void *)&inter_d) );
-    
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter1_kernel, 0, sizeof(cl_mem), (void *)&inter_d) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter2_kernel, 0, sizeof(cl_mem), (void *)&inter_d) );
-    
-    OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 1, sizeof(cl_mem), (void *)&data_d) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 3, sizeof(cl_mem), (void *)&inter_d) );
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++) {
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*blockSize)) > (GRID_SIZE*blockSize)) ? (GRID_SIZE*blockSize) : (gridNumElems-(i*GRID_SIZE*blockSize));
-        
-        unsigned int data_offset = i*GRID_SIZE*blockSize;
-        unsigned int inter_offset = i*GRID_SIZE;
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 0, sizeof(unsigned int), &numElems) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 2, sizeof(unsigned int), &data_offset) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_L1_kernel, 4, sizeof(unsigned int), &inter_offset) );
-               
-        size_t block[1] = { blockSize/2 };
-        size_t grid[1] = { gridSize * block[0] };
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, scan_L1_kernel, 1, 0,
-                            grid, block, 0, 0, 0) );
-    }
-
-    unsigned int stride = 1;
-    for (unsigned int d = current_max; d > 1; d /= dim_block) {        
-        size_t block[1] = { dim_block/2 };
-        size_t grid[1] = { (d/dim_block) * block[0] };
-        
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter1_kernel, 1, sizeof(unsigned int), &stride) );
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, scan_inter1_kernel, 1, 0,
-                            grid, block, 0, 0, 0) );
-        
-        stride *= dim_block;
-    }
-    
-    unsigned int singleZero = 0;
-    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, inter_d, CL_TRUE, 
-                          (current_max-1)*sizeof(unsigned int), // Offset in bytes
-                          sizeof(unsigned int), // Size of data to write
-                          &singleZero, // Host Source
-                          0, NULL, NULL) );
-
-    for (unsigned int d = dim_block; d <= current_max; d *= dim_block) {
-        stride /= dim_block;
-        
-        size_t block[1] = { dim_block/2 };
-        size_t grid[1] = { (d/dim_block) * block[0] };
-        
-        OCL_ERRCK_RETVAL( clSetKernelArg(scan_inter2_kernel, 1, sizeof(unsigned int), &stride) );
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, scan_inter2_kernel, 1, 0,
-                            grid, block, 0, 0, 0) );                       
-    }
-
-    for (unsigned int i=0; i < (size+GRID_SIZE-1)/GRID_SIZE; i++) {
-        unsigned int gridSize = ((size-(i*GRID_SIZE)) > GRID_SIZE) ? GRID_SIZE : (size-i*GRID_SIZE);
-        unsigned int numElems = ((gridNumElems-(i*GRID_SIZE*blockSize)) > (GRID_SIZE*blockSize)) ? (GRID_SIZE*blockSize) : (gridNumElems-(i*GRID_SIZE*blockSize));
-        
-        unsigned int data_offset = i*GRID_SIZE*blockSize;
-        unsigned int inter_offset = i*GRID_SIZE;
-        OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 0, sizeof(unsigned int), &numElems) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 2, sizeof(unsigned int), &data_offset) );
-        OCL_ERRCK_RETVAL( clSetKernelArg(uniformAdd, 4, sizeof(unsigned int), &inter_offset) );
-        
-        size_t block[1] = { blockSize/2 };
-        size_t grid[1] = { gridSize * block[0] };
-        
-        OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, uniformAdd, 1, 0,
-                            grid, block, 0, 0, 0) ); 
-    }
-
-    OCL_ERRCK_RETVAL ( clReleaseMemObject(inter_d) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(scan_L1_kernel) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(scan_inter1_kernel) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(scan_inter2_kernel) );
-    OCL_ERRCK_RETVAL ( clReleaseKernel(uniformAdd) );
-
-    OCL_ERRCK_RETVAL ( clReleaseProgram(scanLargeArray_program) );
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.h
deleted file mode 100644
index dc4ff0a04a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/scanLargeArray.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-void scanLargeArray( unsigned int gridNumElements, cl_mem data_d, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.cl b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.cl
deleted file mode 100644
index 2da677119c..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.cl
+++ /dev/null
@@ -1,225 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-
-#define UINT32_MAX 4294967295
-#define BITS 4
-#define LNB 4
-
-#define SORT_BS 256
-
-//#define CONFLICT_FREE_OFFSET(index) ((index) >> LNB + (index) >> (2*LNB))
-#define CONFLICT_FREE_OFFSET(index) (((unsigned int)(index) >> min((unsigned int)(LNB)+(index), (unsigned int)(32-(2*LNB))))>>(2*LNB))
-#define BLOCK_P_OFFSET (4*SORT_BS+1+(4*SORT_BS+1)/16+(4*SORT_BS+1)/64)
-
-void scan (__local unsigned int s_data[BLOCK_P_OFFSET]){
-  unsigned int thid = get_local_id(0);
-
-  barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*(get_local_size(0)+thid)+1+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid)+1)] += s_data[2*(get_local_size(0)+thid)+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid))];
-
-  unsigned int stride = 2;
-  for (unsigned int d = get_local_size(0); d > 0; d >>= 1)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      s_data[bi] += s_data[ai];
-    }
-
-    stride *= 2;
-  }
-
-  if (thid == 0){
-    unsigned int last = 4*get_local_size(0)-1;
-    last += CONFLICT_FREE_OFFSET(last);
-    s_data[4*get_local_size(0)+CONFLICT_FREE_OFFSET(4*get_local_size(0))] = s_data[last];
-    s_data[last] = 0;
-  }
-
-  for (unsigned int d = 1; d <= get_local_size(0); d *= 2)
-  {
-    stride >>= 1;
-
-    barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads();
-
-    if (thid < d)
-    {
-      unsigned int i  = 2*stride*thid;
-      unsigned int ai = i + stride - 1;
-      unsigned int bi = ai + stride;
-
-      ai += CONFLICT_FREE_OFFSET(ai);
-      bi += CONFLICT_FREE_OFFSET(bi);
-
-      unsigned int t  = s_data[ai];
-      s_data[ai] = s_data[bi];
-      s_data[bi] += t;
-    }
-  }
-  barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads();
-
-  unsigned int temp = s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)];
-  s_data[2*thid+CONFLICT_FREE_OFFSET(2*thid)] = s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)];
-  s_data[2*thid+1+CONFLICT_FREE_OFFSET(2*thid+1)] += temp;
-
-  unsigned int temp2 = s_data[2*(get_local_size(0)+thid)+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid))];
-  s_data[2*(get_local_size(0)+thid)+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid))] = s_data[2*(get_local_size(0)+thid)+1+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid)+1)];
-  s_data[2*(get_local_size(0)+thid)+1+CONFLICT_FREE_OFFSET(2*(get_local_size(0)+thid)+1)] += temp2;
-
-  barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads();
-}
-
-__kernel void splitSort(int numElems, int iter, 
-                                 __global unsigned int* keys, 
-                                 __global unsigned int* values, 
-                                 __global unsigned int* histo)
-{
-    __local unsigned int flags[BLOCK_P_OFFSET];
-    __local unsigned int histo_s[1<<BITS];
-
-    const unsigned int tid = get_local_id(0);
-    const unsigned int gid = get_group_id(0)*4*SORT_BS+4*get_local_id(0);
-
-    // Copy input to shared mem. Assumes input is always even numbered
-    uint4 lkey = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX};
-    uint4 lvalue;
-    if (gid < numElems){
-      lkey = *((__global uint4*)(keys+gid));
-      lvalue = *((__global uint4*)(values+gid));
-    }
-
-    if(tid < (1<<BITS)){
-      histo_s[tid] = 0;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-    atom_add(histo_s+((lkey.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atom_add(histo_s+((lkey.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atom_add(histo_s+((lkey.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-    atom_add(histo_s+((lkey.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter)),1);
-
-    uint4 index = (uint4) (4*tid, 4*tid+1, 4*tid+2, 4*tid+3);
-
-    for (int i=BITS*iter; i<BITS*(iter+1);i++){
-      const uint4 flag = (uint4) ( (lkey.x>>i)&0x1,(lkey.y>>i)&0x1,(lkey.z>>i)&0x1,(lkey.w>>i)&0x1 );
-
-      flags[index.x+CONFLICT_FREE_OFFSET(index.x)] = 1<<(16*flag.x);
-      flags[index.y+CONFLICT_FREE_OFFSET(index.y)] = 1<<(16*flag.y);
-      flags[index.z+CONFLICT_FREE_OFFSET(index.z)] = 1<<(16*flag.z);
-      flags[index.w+CONFLICT_FREE_OFFSET(index.w)] = 1<<(16*flag.w);
-
-      scan (flags);
-
-      index.x = (flags[index.x+CONFLICT_FREE_OFFSET(index.x)]>>(16*flag.x))&0xFFFF;
-      index.y = (flags[index.y+CONFLICT_FREE_OFFSET(index.y)]>>(16*flag.y))&0xFFFF;
-      index.z = (flags[index.z+CONFLICT_FREE_OFFSET(index.z)]>>(16*flag.z))&0xFFFF;
-      index.w = (flags[index.w+CONFLICT_FREE_OFFSET(index.w)]>>(16*flag.w))&0xFFFF;
-
-      unsigned short offset = flags[4*get_local_size(0)+CONFLICT_FREE_OFFSET(4*get_local_size(0))]&0xFFFF;
-      index.x += (flag.x) ? offset : 0;
-      index.y += (flag.y) ? offset : 0;
-      index.z += (flag.z) ? offset : 0;
-      index.w += (flag.w) ? offset : 0;
-
-      barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-    }
-
-    // Write result.
-    if (gid < numElems){
-      keys[get_group_id(0)*4*SORT_BS+index.x] = lkey.x;
-      keys[get_group_id(0)*4*SORT_BS+index.y] = lkey.y;
-      keys[get_group_id(0)*4*SORT_BS+index.z] = lkey.z;
-      keys[get_group_id(0)*4*SORT_BS+index.w] = lkey.w;
-
-      values[get_group_id(0)*4*SORT_BS+index.x] = lvalue.x;
-      values[get_group_id(0)*4*SORT_BS+index.y] = lvalue.y;
-      values[get_group_id(0)*4*SORT_BS+index.z] = lvalue.z;
-      values[get_group_id(0)*4*SORT_BS+index.w] = lvalue.w;
-    }
-    if (tid < (1<<BITS)){
-      histo[get_num_groups(0)*get_local_id(0)+get_group_id(0)] = histo_s[tid];
-    }
-}
-
-__kernel void splitRearrange (int numElems, int iter, 
-                                __global unsigned int* keys_i, 
-                                __global unsigned int* keys_o, 
-                                __global unsigned int* values_i, 
-                                __global unsigned int* values_o, 
-                                __global unsigned int* histo){
-  __local unsigned int histo_s[(1<<BITS)];
-  __local unsigned int array_s[4*SORT_BS];
-  int index = get_group_id(0)*4*SORT_BS + 4*get_local_id(0);
-
-  if (get_local_id(0) < (1<<BITS)){
-    histo_s[get_local_id(0)] = histo[get_num_groups(0)*get_local_id(0)+get_group_id(0)];
-  }
-
-  uint4 mine, value;
-  if (index < numElems){
-    mine = *((__global uint4*)(keys_i+index));
-    value = *((__global uint4*)(values_i+index));
-  } else {
-    mine.x = UINT32_MAX;
-    mine.y = UINT32_MAX;
-    mine.z = UINT32_MAX;
-    mine.w = UINT32_MAX;
-  }
-  
-  uint4 masks = (uint4) ( (mine.x&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.y&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.z&((1<<(BITS*(iter+1)))-1))>>(BITS*iter),
-                 (mine.w&((1<<(BITS*(iter+1)))-1))>>(BITS*iter) );
-
-  ((__local uint4*)array_s)[get_local_id(0)] = masks;
-  barrier(CLK_LOCAL_MEM_FENCE ); //__syncthreads();
-
-  uint4 new_index = (uint4) ( histo_s[masks.x],histo_s[masks.y],histo_s[masks.z],histo_s[masks.w] );
-
-  int i = 4*get_local_id(0)-1;
-  
-  while (i >= 0){
-    if (array_s[i] == masks.x){
-      new_index.x++;
-      i--;
-    } else {
-      break;
-    }
-  }
-
-  new_index.y = (masks.y == masks.x) ? new_index.x+1 : new_index.y;
-  new_index.z = (masks.z == masks.y) ? new_index.y+1 : new_index.z;
-  new_index.w = (masks.w == masks.z) ? new_index.z+1 : new_index.w;
-
-  if (index < numElems){
-    keys_o[new_index.x] = mine.x;
-    values_o[new_index.x] = value.x;
-
-    keys_o[new_index.y] = mine.y;
-    values_o[new_index.y] = value.y;
-
-    keys_o[new_index.z] = mine.z;
-    values_o[new_index.z] = value.z;
-
-    keys_o[new_index.w] = mine.w;
-    values_o[new_index.w] = value.w; 
-  }  
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.cpp b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.cpp
deleted file mode 100644
index 71d4760b98..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include "scanLargeArray.h"
-#include "OpenCL_common.h"
-
-#define UINT32_MAX 4294967295
-#define BITS 4
-#define LNB 4
-
-#define SORT_BS 256
-
-void sort (int numElems, unsigned int max_value, cl_mem* &dkeysPtr, cl_mem* &dvaluesPtr, cl_mem* &dkeys_oPtr, cl_mem* &dvalues_oPtr, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes){
-  
-  size_t block[1] = { SORT_BS };
-  size_t grid[1] = { ((numElems+4*SORT_BS-1)/(4*SORT_BS)) * block[0] };
-
-  unsigned int iterations = 0;
-  while(max_value > 0){
-    max_value >>= BITS;
-    iterations++;
-  }
-
-  cl_int ciErrNum;
-  
-  cl_program sort_program;
-  cl_kernel splitSort;
-  cl_kernel splitRearrange;
-  
-  cl_mem dhisto;
-  cl_mem* original = dkeysPtr;
-
-  unsigned int *zeroData;
-  zeroData = (unsigned int *) calloc( (1<<BITS)*grid[0], sizeof(unsigned int) );
-  if (zeroData == NULL) { fprintf(stderr, "Could not allocate host memory! (%s: %d)\n", __FILE__, __LINE__); exit(1); }
-
-  dhisto = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (1<<BITS)*((numElems+4*SORT_BS-1)/(4*SORT_BS))*sizeof(unsigned int), zeroData, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
-  
-  free(zeroData);
-  
-  //char compileOptions[256];
-  //                -cl-nv-verbose // Provides register info for NVIDIA devices
-  // Set all Macros referenced by kernels
-  /*  sprintf(compileOptions, "\
-                -D CUTOFF2_VAL=%f -D CUTOFF_VAL=%f\
-                -D GRIDSIZE_VAL1=%d -D GRIDSIZE_VAL2=%d -D GRIDSIZE_VAL3=%d\
-                -D SIZE_XY_VAL=%d -D ONE_OVER_CUTOFF2_VAL=%f",
-                cutoff2, cutoff,
-                params.gridSize[0], params.gridSize[1], params.gridSize[2],
-                size_xy, _1overCutoff2
-            );*/ 
-  
-  size_t program_length;
-  const char *source_path = "src/opencl_nvidia/sort.cl";
-  char *source;
-
-  // Dynamically allocate buffer for source
-  source = oclLoadProgSource(source_path, "", &program_length);
-  if(!source) {
-    fprintf(stderr, "Could not load program source (%s)\n", __FILE__); exit(1);
-  }
-  	
-  sort_program = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  	  	
-  free(source);
-  
-  OCL_ERRCK_RETVAL ( clBuildProgram(sort_program, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) );  
-  
-  /*
-  // Uncomment to get build log from compiler for debugging
-  char *build_log;
-       size_t ret_val_size;
-       ciErrNum = clGetProgramBuildInfo(sort_program, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
-       build_log = (char *)malloc(ret_val_size+1);
-       ciErrNum = clGetProgramBuildInfo(sort_program, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-       	OCL_ERRCK_VAR(ciErrNum);
-       	
-
-       // to be carefully, terminate with \0
-       // there's no information in the reference whether the string is 0 terminated or not
-       build_log[ret_val_size] = '\0';
-
-       fprintf(stderr, "%s\n", build_log );
-  */
-  
-  splitSort = clCreateKernel(sort_program, "splitSort", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);
-  splitRearrange = clCreateKernel(sort_program, "splitRearrange", &ciErrNum);
-  OCL_ERRCK_VAR(ciErrNum);      
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 0, sizeof(int), &numElems) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 3, sizeof(cl_mem), (void *)dvaluesPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 4, sizeof(cl_mem), (void *)&dhisto) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 0, sizeof(int), &numElems) );
-  
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 3, sizeof(cl_mem), (void *)dkeys_oPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 4, sizeof(cl_mem), (void *)dvaluesPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 5, sizeof(cl_mem), (void *)dvalues_oPtr) );
-  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 6, sizeof(cl_mem), (void *)&dhisto) );
-
-  for (int i=0; i<iterations; i++){
-  
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 1, sizeof(int), &i) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 3, sizeof(cl_mem), (void *)dvaluesPtr) );    
-    OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, splitSort, 1, 0,
-                            grid, block, 0, 0, 0) );
-    
-    scanLargeArray(((numElems+4*SORT_BS-1)/(4*SORT_BS))*(1<<BITS), dhisto, clContext, clCommandQueue, clDevice, workItemSizes);
-
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 1, sizeof(int), &i ) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 2, sizeof(cl_mem), (void *)dkeysPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 3, sizeof(cl_mem), (void *)dkeys_oPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 4, sizeof(cl_mem), (void *)dvaluesPtr) );
-    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 5, sizeof(cl_mem), (void *)dvalues_oPtr) );
-
-    OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, splitRearrange, 1, 0,
-                            grid, block, 0, 0, 0) );
-
-    cl_mem* temp = dkeysPtr;
-    dkeysPtr = dkeys_oPtr;
-    dkeys_oPtr = temp;
-
-    temp = dvaluesPtr;
-    dvaluesPtr = dvalues_oPtr;
-    dvalues_oPtr = temp;
-  }
-  
-  OCL_ERRCK_RETVAL ( clReleaseKernel(splitSort) );
-  OCL_ERRCK_RETVAL ( clReleaseKernel(splitRearrange) );
-  
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*dkeys_oPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(*dvalues_oPtr) );
-  OCL_ERRCK_RETVAL ( clReleaseMemObject(dhisto) );
-  
-  OCL_ERRCK_RETVAL ( clReleaseProgram(sort_program) );
-
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.h b/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.h
deleted file mode 100644
index 2f7113bac9..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/src/visc/sort.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/***************************************************************************
- *
- *            (C) Copyright 2010 The Board of Trustees of the
- *                        University of Illinois
- *                         All Rights Reserved
- *
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-void sort (int numElems, unsigned int max_value, cl_mem* &dkeys, cl_mem* &dvalues, cl_mem* &dkeys_o, cl_mem* &dvalues_o, cl_context clContext, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes);
diff --git a/hpvm/test/parboil/benchmarks/mri-gridding/tools/compare-output b/hpvm/test/parboil/benchmarks/mri-gridding/tools/compare-output
deleted file mode 100755
index 168ab45244..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-gridding/tools/compare-output
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import sys
-sys.path.insert(0, '../../common/python')
-import struct
-
-tol_diff = 0.001
-tol_ratio = 0.002
-
-def Exit(b):
-  if b:
-    print "Pass"
-    sys.exit(0)
-  else:
-    print "Mismatch"
-    sys.exit(1)
-
-def Run():
-  try:
-    hx = open(sys.argv[1], 'rb')
-    hy = open(sys.argv[2], 'rb')
-  except:
-    Exit(False)
-
-  # size (int)
-  dx = hx.read(4)
-  dy = hy.read(4)
-
-  ty = struct.unpack("i",dy)[0]
-
-  if ty == 1:
-    Exit(True)
-
-  Exit(False)
-
-#  dx = hx.read(4)
-#  dy = hy.read(4)
-
-#  lx = struct.unpack("i", dx)[0]
-#  ly = struct.unpack("i", dy)[0]
-
-#  data_r = hx.read()
-#  data_c = hy.read()
-
-#  hx.close()
-#  hy.close()
-
-#  if lx != ly:
-#    print "Reference and compare are different in size"
-#    Exit(False)
-#  if len(data_r) != 12 * lx:
-#    print "Reference: sanity check failed"
-#    Exit(False)
-#  if len(data_c) != 12 * ly:
-#    print "Compare: sanity check failed"
-#    Exit(False)
-
-#  for i in range(0, lx, 4):
-#    r = struct.unpack('f', data_r[i:i+4])[0]
-#    c = struct.unpack('f', data_c[i:i+4])[0]
-
-#    diff = abs(r - c)
-#    if not (diff <= tol_diff or diff < tol_ratio * abs(r)):
-#      print r, c, i
-#      Exit(False)
-
-#  Exit(True)
-
-Run()
-
diff --git a/hpvm/test/parboil/benchmarks/mri-q/DESCRIPTION b/hpvm/test/parboil/benchmarks/mri-q/DESCRIPTION
deleted file mode 100644
index 5a673b5838..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/DESCRIPTION
+++ /dev/null
@@ -1,5 +0,0 @@
-Computation of a matrix Q, representing the scanner configuration, used in a 3D magnetic resonance image reconstruction algorithm in non-Cartesian space.
-
-See also:
-  Sam S. Stone, Justin P. Haldar, Stephanie C. Tsao, Wen-Mei W. Hwu, Zhi-Pei Liang, and Bradley P. Sutton.  "Accelerating Advanced MRI Reconstructions on GPUs."  In Computing Frontiers, 2008.
-
diff --git a/hpvm/test/parboil/benchmarks/mri-q/Makefile b/hpvm/test/parboil/benchmarks/mri-q/Makefile
deleted file mode 100644
index 0d50832387..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/Makefile
+++ /dev/null
@@ -1,35 +0,0 @@
-PARBOIL_ROOT = /home/psrivas2/current-test/parboil
-APP = mri-q
-
-# Default compile visc
-ifeq ($(VERSION),)
-  VERSION = visc
-endif
-
-# Default use small test case
-ifeq ($(TEST),)
-  TEST = small
-endif
-
-BIN = $(addsuffix -$(VERSION), $(APP))
-
-SRCDIR = src/$(VERSION)
-BUILDDIR = build/$(VERSION)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
-
-ifeq ($(TEST),small)
-  INPUT = $(DATASET_DIR)/small/input/32_32_32_dataset.bin
-  REF_OUTPUT = $(DATASET_DIR)/small/output/32_32_32_dataset.out
-  RUNDIR = run/$(VERSION)/small
-  OUTPUT = $(RUNDIR)/32_32_32_dataset.out
-else
-  INPUT = $(DATASET_DIR)/large/input/64_64_64_dataset.bin
-  REF_OUTPUT = $(DATASET_DIR)/large/output/64_64_64_dataset.out
-  RUNDIR = run/$(VERSION)/large
-  OUTPUT = $(RUNDIR)/64_64_64_dataset.out
-endif
-
-ARGS = -i $(INPUT) -o $(OUTPUT)
-TOOL = tools/compare-output
-
-include $(PARBOIL_ROOT)/common/mk/Makefile
diff --git a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/Makefile b/hpvm/test/parboil/benchmarks/mri-q/recycle/base/Makefile
deleted file mode 100644
index 4dfb7fb473..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=c
-SRCDIR_OBJS=main.o file.o
-APP_CFLAGS=-ffast-math
-APP_CXXFLAGS=-ffast-math
-APP_LDFLAGS=-lm -lstdc++
-
diff --git a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/computeQ.cc b/hpvm/test/parboil/benchmarks/mri-q/recycle/base/computeQ.cc
deleted file mode 100644
index 3cf9919a47..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/computeQ.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#define PI   3.1415926535897932384626433832795029f
-#define PIx2 6.2831853071795864769252867665590058f
-
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define K_ELEMS_PER_GRID 2048
-
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-
-inline
-void 
-ComputePhiMagCPU(int numK, 
-                 float* phiR, float* phiI, float* phiMag) {
-  int indexK = 0;
-  for (indexK = 0; indexK < numK; indexK++) {
-    float real = phiR[indexK];
-    float imag = phiI[indexK];
-    phiMag[indexK] = real*real + imag*imag;
-  }
-}
-
-inline
-void
-ComputeQCPU(int numK, int numX,
-            struct kValues *kVals,
-            float* x, float* y, float* z,
-            float *Qr, float *Qi) {
-  float expArg;
-  float cosArg;
-  float sinArg;
-
-  int indexK, indexX;
-  for (indexK = 0; indexK < numK; indexK++) {
-    for (indexX = 0; indexX < numX; indexX++) {
-      expArg = PIx2 * (kVals[indexK].Kx * x[indexX] +
-                       kVals[indexK].Ky * y[indexX] +
-                       kVals[indexK].Kz * z[indexX]);
-
-      cosArg = cosf(expArg);
-      sinArg = sinf(expArg);
-
-      float phi = kVals[indexK].PhiMag;
-      Qr[indexX] += phi * cosArg;
-      Qi[indexX] += phi * sinArg;
-    }
-  }
-}
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 float** Qr, float** Qi)
-{
-  *phiMag = (float* ) memalign(16, numK * sizeof(float));
-  *Qr = (float*) memalign(16, numX * sizeof (float));
-  memset((void *)*Qr, 0, numX * sizeof(float));
-  *Qi = (float*) memalign(16, numX * sizeof (float));
-  memset((void *)*Qi, 0, numX * sizeof(float));
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/file.cc b/hpvm/test/parboil/benchmarks/mri-q/recycle/base/file.cc
deleted file mode 100644
index 3463e759bf..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/file.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <endian.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <inttypes.h>
-
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
-#endif
-
-extern "C"
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI)
-{
-  int numK, numX;
-  FILE* fid = fopen(fName, "r");
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-  fread (&numK, sizeof (int), 1, fid);
-  *_numK = numK;
-  fread (&numX, sizeof (int), 1, fid);
-  *_numX = numX;
-  *kx = (float *) memalign(16, numK * sizeof (float));
-  fread (*kx, sizeof (float), numK, fid);
-  *ky = (float *) memalign(16, numK * sizeof (float));
-  fread (*ky, sizeof (float), numK, fid);
-  *kz = (float *) memalign(16, numK * sizeof (float));
-  fread (*kz, sizeof (float), numK, fid);
-  *x = (float *) memalign(16, numX * sizeof (float));
-  fread (*x, sizeof (float), numX, fid);
-  *y = (float *) memalign(16, numX * sizeof (float));
-  fread (*y, sizeof (float), numX, fid);
-  *z = (float *) memalign(16, numX * sizeof (float));
-  fread (*z, sizeof (float), numX, fid);
-  *phiR = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiR, sizeof (float), numK, fid);
-  *phiI = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiI, sizeof (float), numK, fid);
-  fclose (fid); 
-}
-
-extern "C"
-void outputData(char* fName, float* outR, float* outI, int numX)
-{
-  FILE* fid = fopen(fName, "w");
-  uint32_t tmp32;
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the data size */
-  tmp32 = numX;
-  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
-
-  /* Write the reconstructed data */
-  fwrite (outR, sizeof (float), numX, fid);
-  fwrite (outI, sizeof (float), numX, fid);
-  fclose (fid);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/file.h b/hpvm/test/parboil/benchmarks/mri-q/recycle/base/file.h
deleted file mode 100644
index c6a61ef4cd..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI);
-
-void outputData(char* fName, float* outR, float* outI, int numX);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/main.c b/hpvm/test/parboil/benchmarks/mri-q/recycle/base/main.c
deleted file mode 100644
index 6c7de560f1..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/recycle/base/main.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * C code for creating the Q data structure for fast convolution-based 
- * Hessian multiplication for arbitrary k-space trajectories.
- *
- * Inputs:
- * kx - VECTOR of kx values, same length as ky and kz
- * ky - VECTOR of ky values, same length as kx and kz
- * kz - VECTOR of kz values, same length as kx and ky
- * x  - VECTOR of x values, same length as y and z
- * y  - VECTOR of y values, same length as x and z
- * z  - VECTOR of z values, same length as x and y
- * phi - VECTOR of the Fourier transform of the spatial basis 
- *      function, evaluated at [kx, ky, kz].  Same length as kx, ky, and kz.
- *
- * recommended g++ options:
- *  -O3 -lm -ffast-math -funroll-all-loops
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-
-#include <parboil.h>
-
-#include "file.h"
-#include "computeQ.cc"
-
-int
-main (int argc, char *argv[]) {
-  int numX, numK;		/* Number of X and K values */
-  int original_numK;		/* Number of K values in input file */
-  float *kx, *ky, *kz;		/* K trajectory (3D vectors) */
-  float *x, *y, *z;		/* X coordinates (3D vectors) */
-  float *phiR, *phiI;		/* Phi values (complex) */
-  float *phiMag;		/* Magnitude of Phi */
-  float *Qr, *Qi;		/* Q signal (complex) */
-  struct kValues* kVals;
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  pb_InitializeTimerSet(&timers);
-
-  /* Read command line */
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-    {
-      fprintf(stderr, "Expecting one input filename\n");
-      exit(-1);
-    }
-  
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  inputData(params->inpFiles[0],
-	    &original_numK, &numX,
-	    &kx, &ky, &kz,
-	    &x, &y, &z,
-	    &phiR, &phiI);
-
-  /* Reduce the number of k-space samples if a number is given
-   * on the command line */
-  if (argc < 2)
-    numK = original_numK;
-  else
-    {
-      int inputK;
-      char *end;
-      inputK = strtol(argv[1], &end, 10);
-      if (end == argv[1])
-	{
-	  fprintf(stderr, "Expecting an integer parameter\n");
-	  exit(-1);
-	}
-
-      numK = MIN(inputK, original_numK);
-    }
-
-  printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
-         numX, original_numK, numK);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  /* Create CPU data structures */
-  createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);
-
-  ComputePhiMagCPU(numK, phiR, phiI, phiMag);
-
-  kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));
-  int k;
-  for (k = 0; k < numK; k++) {
-    kVals[k].Kx = kx[k];
-    kVals[k].Ky = ky[k];
-    kVals[k].Kz = kz[k];
-    kVals[k].PhiMag = phiMag[k];
-  }
-  ComputeQCPU(numK, numX, kVals, x, y, z, Qr, Qi);
-
-  if (params->outFile)
-    {
-      /* Write Q to file */
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      outputData(params->outFile, Qr, Qi, numX);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-  free (kx);
-  free (ky);
-  free (kz);
-  free (x);
-  free (y);
-  free (z);
-  free (phiR);
-  free (phiI);
-  free (phiMag);
-  free (kVals);
-  free (Qr);
-  free (Qi);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/Makefile b/hpvm/test/parboil/benchmarks/mri-q/src/cpu/Makefile
deleted file mode 100644
index 57dcaf9c34..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=c
-SRCDIR_OBJS=main.o file.o
-APP_CFLAGS=-ffast-math
-APP_CXXFLAGS=-ffast-math
-APP_LDFLAGS=-lm -lstdc++
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/computeQ.cc b/hpvm/test/parboil/benchmarks/mri-q/src/cpu/computeQ.cc
deleted file mode 100644
index 3cf9919a47..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/computeQ.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#define PI   3.1415926535897932384626433832795029f
-#define PIx2 6.2831853071795864769252867665590058f
-
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define K_ELEMS_PER_GRID 2048
-
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-
-inline
-void 
-ComputePhiMagCPU(int numK, 
-                 float* phiR, float* phiI, float* phiMag) {
-  int indexK = 0;
-  for (indexK = 0; indexK < numK; indexK++) {
-    float real = phiR[indexK];
-    float imag = phiI[indexK];
-    phiMag[indexK] = real*real + imag*imag;
-  }
-}
-
-inline
-void
-ComputeQCPU(int numK, int numX,
-            struct kValues *kVals,
-            float* x, float* y, float* z,
-            float *Qr, float *Qi) {
-  float expArg;
-  float cosArg;
-  float sinArg;
-
-  int indexK, indexX;
-  for (indexK = 0; indexK < numK; indexK++) {
-    for (indexX = 0; indexX < numX; indexX++) {
-      expArg = PIx2 * (kVals[indexK].Kx * x[indexX] +
-                       kVals[indexK].Ky * y[indexX] +
-                       kVals[indexK].Kz * z[indexX]);
-
-      cosArg = cosf(expArg);
-      sinArg = sinf(expArg);
-
-      float phi = kVals[indexK].PhiMag;
-      Qr[indexX] += phi * cosArg;
-      Qi[indexX] += phi * sinArg;
-    }
-  }
-}
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 float** Qr, float** Qi)
-{
-  *phiMag = (float* ) memalign(16, numK * sizeof(float));
-  *Qr = (float*) memalign(16, numX * sizeof (float));
-  memset((void *)*Qr, 0, numX * sizeof(float));
-  *Qi = (float*) memalign(16, numX * sizeof (float));
-  memset((void *)*Qi, 0, numX * sizeof(float));
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/file.cc b/hpvm/test/parboil/benchmarks/mri-q/src/cpu/file.cc
deleted file mode 100644
index 3463e759bf..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/file.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <endian.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <inttypes.h>
-
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
-#endif
-
-extern "C"
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI)
-{
-  int numK, numX;
-  FILE* fid = fopen(fName, "r");
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-  fread (&numK, sizeof (int), 1, fid);
-  *_numK = numK;
-  fread (&numX, sizeof (int), 1, fid);
-  *_numX = numX;
-  *kx = (float *) memalign(16, numK * sizeof (float));
-  fread (*kx, sizeof (float), numK, fid);
-  *ky = (float *) memalign(16, numK * sizeof (float));
-  fread (*ky, sizeof (float), numK, fid);
-  *kz = (float *) memalign(16, numK * sizeof (float));
-  fread (*kz, sizeof (float), numK, fid);
-  *x = (float *) memalign(16, numX * sizeof (float));
-  fread (*x, sizeof (float), numX, fid);
-  *y = (float *) memalign(16, numX * sizeof (float));
-  fread (*y, sizeof (float), numX, fid);
-  *z = (float *) memalign(16, numX * sizeof (float));
-  fread (*z, sizeof (float), numX, fid);
-  *phiR = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiR, sizeof (float), numK, fid);
-  *phiI = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiI, sizeof (float), numK, fid);
-  fclose (fid); 
-}
-
-extern "C"
-void outputData(char* fName, float* outR, float* outI, int numX)
-{
-  FILE* fid = fopen(fName, "w");
-  uint32_t tmp32;
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the data size */
-  tmp32 = numX;
-  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
-
-  /* Write the reconstructed data */
-  fwrite (outR, sizeof (float), numX, fid);
-  fwrite (outI, sizeof (float), numX, fid);
-  fclose (fid);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/file.h b/hpvm/test/parboil/benchmarks/mri-q/src/cpu/file.h
deleted file mode 100644
index c6a61ef4cd..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI);
-
-void outputData(char* fName, float* outR, float* outI, int numX);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/main.c b/hpvm/test/parboil/benchmarks/mri-q/src/cpu/main.c
deleted file mode 100644
index 6c7de560f1..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cpu/main.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * C code for creating the Q data structure for fast convolution-based 
- * Hessian multiplication for arbitrary k-space trajectories.
- *
- * Inputs:
- * kx - VECTOR of kx values, same length as ky and kz
- * ky - VECTOR of ky values, same length as kx and kz
- * kz - VECTOR of kz values, same length as kx and ky
- * x  - VECTOR of x values, same length as y and z
- * y  - VECTOR of y values, same length as x and z
- * z  - VECTOR of z values, same length as x and y
- * phi - VECTOR of the Fourier transform of the spatial basis 
- *      function, evaluated at [kx, ky, kz].  Same length as kx, ky, and kz.
- *
- * recommended g++ options:
- *  -O3 -lm -ffast-math -funroll-all-loops
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-
-#include <parboil.h>
-
-#include "file.h"
-#include "computeQ.cc"
-
-int
-main (int argc, char *argv[]) {
-  int numX, numK;		/* Number of X and K values */
-  int original_numK;		/* Number of K values in input file */
-  float *kx, *ky, *kz;		/* K trajectory (3D vectors) */
-  float *x, *y, *z;		/* X coordinates (3D vectors) */
-  float *phiR, *phiI;		/* Phi values (complex) */
-  float *phiMag;		/* Magnitude of Phi */
-  float *Qr, *Qi;		/* Q signal (complex) */
-  struct kValues* kVals;
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  pb_InitializeTimerSet(&timers);
-
-  /* Read command line */
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-    {
-      fprintf(stderr, "Expecting one input filename\n");
-      exit(-1);
-    }
-  
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  inputData(params->inpFiles[0],
-	    &original_numK, &numX,
-	    &kx, &ky, &kz,
-	    &x, &y, &z,
-	    &phiR, &phiI);
-
-  /* Reduce the number of k-space samples if a number is given
-   * on the command line */
-  if (argc < 2)
-    numK = original_numK;
-  else
-    {
-      int inputK;
-      char *end;
-      inputK = strtol(argv[1], &end, 10);
-      if (end == argv[1])
-	{
-	  fprintf(stderr, "Expecting an integer parameter\n");
-	  exit(-1);
-	}
-
-      numK = MIN(inputK, original_numK);
-    }
-
-  printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
-         numX, original_numK, numK);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  /* Create CPU data structures */
-  createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);
-
-  ComputePhiMagCPU(numK, phiR, phiI, phiMag);
-
-  kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));
-  int k;
-  for (k = 0; k < numK; k++) {
-    kVals[k].Kx = kx[k];
-    kVals[k].Ky = ky[k];
-    kVals[k].Kz = kz[k];
-    kVals[k].PhiMag = phiMag[k];
-  }
-  ComputeQCPU(numK, numX, kVals, x, y, z, Qr, Qi);
-
-  if (params->outFile)
-    {
-      /* Write Q to file */
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      outputData(params->outFile, Qr, Qi, numX);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-  free (kx);
-  free (ky);
-  free (kz);
-  free (x);
-  free (y);
-  free (z);
-  free (phiR);
-  free (phiI);
-  free (phiMag);
-  free (kVals);
-  free (Qr);
-  free (Qi);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/Makefile b/hpvm/test/parboil/benchmarks/mri-q/src/cuda/Makefile
deleted file mode 100644
index edd50283a8..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=cuda
-
-SRCDIR_OBJS=file.o main.o
-APP_LDFLAGS=-lm -lstdc++
-APP_CUDACFLAGS=--use_fast_math
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/computeQ.cu b/hpvm/test/parboil/benchmarks/mri-q/src/cuda/computeQ.cu
deleted file mode 100644
index 6d1567e008..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/computeQ.cu
+++ /dev/null
@@ -1,145 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#define PI   3.1415926535897932384626433832795029f
-#define PIx2 6.2831853071795864769252867665590058f
-
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define K_ELEMS_PER_GRID 2048
-
-#define KERNEL_PHI_MAG_THREADS_PER_BLOCK 512
-#define KERNEL_Q_THREADS_PER_BLOCK 256
-#define KERNEL_Q_K_ELEMS_PER_GRID 1024
-
-#define CUDA_ERRCK							\
-  {cudaError_t err;							\
-    if ((err = cudaGetLastError()) != cudaSuccess) {			\
-      fprintf(stderr, "CUDA error on line %d: %s\n", __LINE__, cudaGetErrorString(err)); \
-      exit(-1);								\
-    }									\
-  }
-
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-
-/* Values in the k-space coordinate system are stored in constant memory
- * on the GPU */
-__constant__ __device__ kValues ck[KERNEL_Q_K_ELEMS_PER_GRID];
-
-__global__ void
-ComputePhiMag_GPU(float* phiR, float* phiI, float* phiMag, int numK) {
-  int indexK = blockIdx.x*KERNEL_PHI_MAG_THREADS_PER_BLOCK + threadIdx.x;
-  if (indexK < numK) {
-    float real = phiR[indexK];
-    float imag = phiI[indexK];
-    phiMag[indexK] = real*real + imag*imag;
-  }
-}
-
-__global__ void
-ComputeQ_GPU(int numK, int kGlobalIndex,
-	     float* x, float* y, float* z, float* Qr , float* Qi)
-{
-  float sX;
-  float sY;
-  float sZ;
-  float sQr;
-  float sQi;
-
-  // Determine the element of the X arrays computed by this thread
-  int xIndex = blockIdx.x*KERNEL_Q_THREADS_PER_BLOCK + threadIdx.x;
-
-  // Read block's X values from global mem to shared mem
-  sX = x[xIndex];
-  sY = y[xIndex];
-  sZ = z[xIndex];
-  sQr = Qr[xIndex];
-  sQi = Qi[xIndex];
-
-  // Loop over all elements of K in constant mem to compute a partial value
-  // for X.
-  int kIndex = 0;
-  if (numK % 2) {
-    float expArg = PIx2 * (ck[0].Kx * sX + ck[0].Ky * sY + ck[0].Kz * sZ);
-    sQr += ck[0].PhiMag * cos(expArg);
-    sQi += ck[0].PhiMag * sin(expArg);
-    kIndex++;
-    kGlobalIndex++;
-  }
-
-  for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK);
-       kIndex += 2, kGlobalIndex += 2) {
-    float expArg = PIx2 * (ck[kIndex].Kx * sX +
-			   ck[kIndex].Ky * sY +
-			   ck[kIndex].Kz * sZ);
-    sQr += ck[kIndex].PhiMag * cos(expArg);
-    sQi += ck[kIndex].PhiMag * sin(expArg);
-
-    int kIndex1 = kIndex + 1;
-    float expArg1 = PIx2 * (ck[kIndex1].Kx * sX +
-			    ck[kIndex1].Ky * sY +
-			    ck[kIndex1].Kz * sZ);
-    sQr += ck[kIndex1].PhiMag * cos(expArg1);
-    sQi += ck[kIndex1].PhiMag * sin(expArg1);
-  }
-
-  Qr[xIndex] = sQr;
-  Qi[xIndex] = sQi;
-}
-
-void computePhiMag_GPU(int numK, float* phiR_d, float* phiI_d, float* phiMag_d)
-{
-  int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-  if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
-    phiMagBlocks++;
-  dim3 DimPhiMagBlock(KERNEL_PHI_MAG_THREADS_PER_BLOCK, 1);
-  dim3 DimPhiMagGrid(phiMagBlocks, 1);
-
-  ComputePhiMag_GPU <<< DimPhiMagGrid, DimPhiMagBlock >>> 
-    (phiR_d, phiI_d, phiMag_d, numK);
-}
-
-void computeQ_GPU(int numK, int numX,
-                  float* x_d, float* y_d, float* z_d,
-                  kValues* kVals,
-                  float* Qr_d, float* Qi_d)
-{
-  int QGrids = numK / KERNEL_Q_K_ELEMS_PER_GRID;
-  if (numK % KERNEL_Q_K_ELEMS_PER_GRID)
-    QGrids++;
-  int QBlocks = numX / KERNEL_Q_THREADS_PER_BLOCK;
-  if (numX % KERNEL_Q_THREADS_PER_BLOCK)
-    QBlocks++;
-  dim3 DimQBlock(KERNEL_Q_THREADS_PER_BLOCK, 1);
-  dim3 DimQGrid(QBlocks, 1);
-
-  for (int QGrid = 0; QGrid < QGrids; QGrid++) {
-    // Put the tile of K values into constant mem
-    int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
-    kValues* kValsTile = kVals + QGridBase;
-    int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
-
-    cudaMemcpyToSymbol(ck, kValsTile, numElems * sizeof(kValues), 0);
-
-    ComputeQ_GPU <<< DimQGrid, DimQBlock >>>
-      (numK, QGridBase, x_d, y_d, z_d, Qr_d, Qi_d);
-  }
-}
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 float** Qr, float** Qi)
-{
-  *phiMag = (float* ) memalign(16, numK * sizeof(float));
-  *Qr = (float*) memalign(16, numX * sizeof (float));
-  *Qi = (float*) memalign(16, numX * sizeof (float));
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/file.cc b/hpvm/test/parboil/benchmarks/mri-q/src/cuda/file.cc
deleted file mode 100644
index 3463e759bf..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/file.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <endian.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <inttypes.h>
-
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
-#endif
-
-extern "C"
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI)
-{
-  int numK, numX;
-  FILE* fid = fopen(fName, "r");
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-  fread (&numK, sizeof (int), 1, fid);
-  *_numK = numK;
-  fread (&numX, sizeof (int), 1, fid);
-  *_numX = numX;
-  *kx = (float *) memalign(16, numK * sizeof (float));
-  fread (*kx, sizeof (float), numK, fid);
-  *ky = (float *) memalign(16, numK * sizeof (float));
-  fread (*ky, sizeof (float), numK, fid);
-  *kz = (float *) memalign(16, numK * sizeof (float));
-  fread (*kz, sizeof (float), numK, fid);
-  *x = (float *) memalign(16, numX * sizeof (float));
-  fread (*x, sizeof (float), numX, fid);
-  *y = (float *) memalign(16, numX * sizeof (float));
-  fread (*y, sizeof (float), numX, fid);
-  *z = (float *) memalign(16, numX * sizeof (float));
-  fread (*z, sizeof (float), numX, fid);
-  *phiR = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiR, sizeof (float), numK, fid);
-  *phiI = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiI, sizeof (float), numK, fid);
-  fclose (fid); 
-}
-
-extern "C"
-void outputData(char* fName, float* outR, float* outI, int numX)
-{
-  FILE* fid = fopen(fName, "w");
-  uint32_t tmp32;
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the data size */
-  tmp32 = numX;
-  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
-
-  /* Write the reconstructed data */
-  fwrite (outR, sizeof (float), numX, fid);
-  fwrite (outI, sizeof (float), numX, fid);
-  fclose (fid);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/file.h b/hpvm/test/parboil/benchmarks/mri-q/src/cuda/file.h
deleted file mode 100644
index c6a61ef4cd..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI);
-
-void outputData(char* fName, float* outR, float* outI, int numX);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/main.cu b/hpvm/test/parboil/benchmarks/mri-q/src/cuda/main.cu
deleted file mode 100644
index 094c496f77..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/cuda/main.cu
+++ /dev/null
@@ -1,212 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * C code for creating the Q data structure for fast convolution-based 
- * Hessian multiplication for arbitrary k-space trajectories.
- *
- * Inputs:
- * kx - VECTOR of kx values, same length as ky and kz
- * ky - VECTOR of ky values, same length as kx and kz
- * kz - VECTOR of kz values, same length as kx and ky
- * x  - VECTOR of x values, same length as y and z
- * y  - VECTOR of y values, same length as x and z
- * z  - VECTOR of z values, same length as x and y
- * phi - VECTOR of the Fourier transform of the spatial basis 
- *      function, evaluated at [kx, ky, kz].  Same length as kx, ky, and kz.
- *
- * recommended g++ options:
- *  -O3 -lm -ffast-math -funroll-all-loops
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include <malloc.h>
-
-#include <parboil.h>
-
-#include "file.h"
-#include "computeQ.cu"
-
-static void
-setupMemoryGPU(int num, int size, float*& dev_ptr, float*& host_ptr)
-{
-  cudaMalloc ((void **) &dev_ptr, num * size);
-  CUDA_ERRCK;
-  cudaMemcpy (dev_ptr, host_ptr, num * size, cudaMemcpyHostToDevice);
-  CUDA_ERRCK;
-}
-
-static void
-cleanupMemoryGPU(int num, int size, float *& dev_ptr, float * host_ptr)
-{
-  cudaMemcpy (host_ptr, dev_ptr, num * size, cudaMemcpyDeviceToHost);
-  CUDA_ERRCK;
-  cudaFree(dev_ptr);
-  CUDA_ERRCK;
-}
-
-int
-main (int argc, char *argv[]) {
-  int numX, numK;		/* Number of X and K values */
-  int original_numK;		/* Number of K values in input file */
-  float *kx, *ky, *kz;		/* K trajectory (3D vectors) */
-  float *x, *y, *z;		/* X coordinates (3D vectors) */
-  float *phiR, *phiI;		/* Phi values (complex) */
-  float *phiMag;		/* Magnitude of Phi */
-  float *Qr, *Qi;		/* Q signal (complex) */
-
-  struct kValues* kVals;
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  pb_InitializeTimerSet(&timers);
-
-  /* Read command line */
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-    {
-      fprintf(stderr, "Expecting one input filename\n");
-      exit(-1);
-    }
-  
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  inputData(params->inpFiles[0],
-	    &original_numK, &numX,
-	    &kx, &ky, &kz,
-	    &x, &y, &z,
-	    &phiR, &phiI);
-
-  /* Reduce the number of k-space samples if a number is given
-   * on the command line */
-  if (argc < 2)
-    numK = original_numK;
-  else
-    {
-      int inputK;
-      char *end;
-      inputK = strtol(argv[1], &end, 10);
-      if (end == argv[1])
-	{
-	  fprintf(stderr, "Expecting an integer parameter\n");
-	  exit(-1);
-	}
-
-      numK = MIN(inputK, original_numK);
-    }
-
-  printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
-         numX, original_numK, numK);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  /* Create CPU data structures */
-  createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);
-
-  /* GPU section 1 (precompute PhiMag) */
-  {
-    /* Mirror several data structures on the device */
-    float *phiR_d, *phiI_d;
-    float *phiMag_d;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    setupMemoryGPU(numK, sizeof(float), phiR_d, phiR);
-    setupMemoryGPU(numK, sizeof(float), phiI_d, phiI);
-    cudaMalloc((void **)&phiMag_d, numK * sizeof(float));
-    CUDA_ERRCK;
-
-    cudaThreadSynchronize();
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    computePhiMag_GPU(numK, phiR_d, phiI_d, phiMag_d);
-
-    cudaThreadSynchronize();
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    cleanupMemoryGPU(numK, sizeof(float), phiMag_d, phiMag);
-    cudaFree(phiR_d);
-    cudaFree(phiI_d);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));
-  for (int k = 0; k < numK; k++) {
-    kVals[k].Kx = kx[k];
-    kVals[k].Ky = ky[k];
-    kVals[k].Kz = kz[k];
-    kVals[k].PhiMag = phiMag[k];
-  }
-
-  free(phiMag);
-
-  /* GPU section 2 */
-  {
-    float *x_d, *y_d, *z_d;
-    float *Qr_d, *Qi_d;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    setupMemoryGPU(numX, sizeof(float), x_d, x);
-    setupMemoryGPU(numX, sizeof(float), y_d, y);
-    setupMemoryGPU(numX, sizeof(float), z_d, z);
-    cudaMalloc((void **)&Qr_d, numX * sizeof(float));
-    CUDA_ERRCK;
-    cudaMemset((void *)Qr_d, 0, numX * sizeof(float));
-    cudaMalloc((void **)&Qi_d, numX * sizeof(float));
-    CUDA_ERRCK;
-    cudaMemset((void *)Qi_d, 0, numX * sizeof(float));
-
-    cudaThreadSynchronize();
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    computeQ_GPU(numK, numX, x_d, y_d, z_d, kVals, Qr_d, Qi_d);
-
-    cudaThreadSynchronize();
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    cudaFree(x_d);
-    cudaFree(y_d);
-    cudaFree(z_d);
-    cleanupMemoryGPU(numX, sizeof(float), Qr_d, Qr);
-    cleanupMemoryGPU(numX, sizeof(float), Qi_d, Qi);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if (params->outFile)
-    {
-      /* Write Q to file */
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      outputData(params->outFile, Qr, Qi, numX);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-  free (kx);
-  free (ky);
-  free (kz);
-  free (x);
-  free (y);
-  free (z);
-  free (phiR);
-  free (phiI);
-  free (kVals);
-  free (Qr);
-  free (Qi);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/Makefile b/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/Makefile
deleted file mode 100644
index 27d6359f31..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# (c) 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=c
-SRCDIR_OBJS=main.o file.o
-APP_CFLAGS=-ffast-math -fopenmp
-APP_CXXFLAGS=-ffast-math -fopenmp
-APP_LDFLAGS=-lm -lstdc++ -lgomp
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/computeQ.cc b/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/computeQ.cc
deleted file mode 100644
index 295452a3ce..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/computeQ.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#define PI   3.1415926535897932384626433832795029f
-#define PIx2 6.2831853071795864769252867665590058f
-
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define K_ELEMS_PER_GRID 2048
-
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-
-inline
-void 
-ComputePhiMagCPU(int numK, 
-                 float* phiR, float* phiI, float* phiMag) {
-  int indexK = 0;
-  #pragma omp parallel for
-  for (indexK = 0; indexK < numK; indexK++) {
-    float real = phiR[indexK];
-    float imag = phiI[indexK];
-    phiMag[indexK] = real*real + imag*imag;
-  }
-}
-
-inline
-void
-ComputeQCPU(int numK, int numX,
-            struct kValues *kVals,
-            float* x, float* y, float* z,
-            float *Qr, float *Qi) {
-  float expArg;
-  float cosArg;
-  float sinArg;
-
-  int indexK, indexX;
-  #pragma omp paralel for
-  for (indexK = 0; indexK < numK; indexK++) {
-    for (indexX = 0; indexX < numX; indexX++) {
-      expArg = PIx2 * (kVals[indexK].Kx * x[indexX] +
-                       kVals[indexK].Ky * y[indexX] +
-                       kVals[indexK].Kz * z[indexX]);
-
-      cosArg = cosf(expArg);
-      sinArg = sinf(expArg);
-
-      float phi = kVals[indexK].PhiMag;
-      Qr[indexX] += phi * cosArg;
-      Qi[indexX] += phi * sinArg;
-    }
-  }
-}
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 float** Qr, float** Qi)
-{
-  *phiMag = (float* ) memalign(16, numK * sizeof(float));
-  *Qr = (float*) memalign(16, numX * sizeof (float));
-  memset((void *)*Qr, 0, numX * sizeof(float));
-  *Qi = (float*) memalign(16, numX * sizeof (float));
-  memset((void *)*Qi, 0, numX * sizeof(float));
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/file.cc b/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/file.cc
deleted file mode 100644
index 3463e759bf..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/file.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <endian.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <inttypes.h>
-
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
-#endif
-
-extern "C"
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI)
-{
-  int numK, numX;
-  FILE* fid = fopen(fName, "r");
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-  fread (&numK, sizeof (int), 1, fid);
-  *_numK = numK;
-  fread (&numX, sizeof (int), 1, fid);
-  *_numX = numX;
-  *kx = (float *) memalign(16, numK * sizeof (float));
-  fread (*kx, sizeof (float), numK, fid);
-  *ky = (float *) memalign(16, numK * sizeof (float));
-  fread (*ky, sizeof (float), numK, fid);
-  *kz = (float *) memalign(16, numK * sizeof (float));
-  fread (*kz, sizeof (float), numK, fid);
-  *x = (float *) memalign(16, numX * sizeof (float));
-  fread (*x, sizeof (float), numX, fid);
-  *y = (float *) memalign(16, numX * sizeof (float));
-  fread (*y, sizeof (float), numX, fid);
-  *z = (float *) memalign(16, numX * sizeof (float));
-  fread (*z, sizeof (float), numX, fid);
-  *phiR = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiR, sizeof (float), numK, fid);
-  *phiI = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiI, sizeof (float), numK, fid);
-  fclose (fid); 
-}
-
-extern "C"
-void outputData(char* fName, float* outR, float* outI, int numX)
-{
-  FILE* fid = fopen(fName, "w");
-  uint32_t tmp32;
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the data size */
-  tmp32 = numX;
-  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
-
-  /* Write the reconstructed data */
-  fwrite (outR, sizeof (float), numX, fid);
-  fwrite (outI, sizeof (float), numX, fid);
-  fclose (fid);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/file.h b/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/file.h
deleted file mode 100644
index c6a61ef4cd..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI);
-
-void outputData(char* fName, float* outR, float* outI, int numX);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/main.c
deleted file mode 100644
index 6b8f6823d8..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/omp_base/main.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * C code for creating the Q data structure for fast convolution-based 
- * Hessian multiplication for arbitrary k-space trajectories.
- *
- * Inputs:
- * kx - VECTOR of kx values, same length as ky and kz
- * ky - VECTOR of ky values, same length as kx and kz
- * kz - VECTOR of kz values, same length as kx and ky
- * x  - VECTOR of x values, same length as y and z
- * y  - VECTOR of y values, same length as x and z
- * z  - VECTOR of z values, same length as x and y
- * phi - VECTOR of the Fourier transform of the spatial basis 
- *      function, evaluated at [kx, ky, kz].  Same length as kx, ky, and kz.
- *
- * recommended g++ options:
- *  -O3 -lm -ffast-math -funroll-all-loops
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-
-#include <parboil.h>
-
-#include "file.h"
-#include "computeQ.cc"
-
-int
-main (int argc, char *argv[]) {
-  int numX, numK;		/* Number of X and K values */
-  int original_numK;		/* Number of K values in input file */
-  float *kx, *ky, *kz;		/* K trajectory (3D vectors) */
-  float *x, *y, *z;		/* X coordinates (3D vectors) */
-  float *phiR, *phiI;		/* Phi values (complex) */
-  float *phiMag;		/* Magnitude of Phi */
-  float *Qr, *Qi;		/* Q signal (complex) */
-  struct kValues* kVals;
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  pb_InitializeTimerSet(&timers);
-
-  /* Read command line */
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-    {
-      fprintf(stderr, "Expecting one input filename\n");
-      exit(-1);
-    }
-  
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  inputData(params->inpFiles[0],
-	    &original_numK, &numX,
-	    &kx, &ky, &kz,
-	    &x, &y, &z,
-	    &phiR, &phiI);
-
-  /* Reduce the number of k-space samples if a number is given
-   * on the command line */
-  if (argc < 2)
-    numK = original_numK;
-  else
-    {
-      int inputK;
-      char *end;
-      inputK = strtol(argv[1], &end, 10);
-      if (end == argv[1])
-	{
-	  fprintf(stderr, "Expecting an integer parameter\n");
-	  exit(-1);
-	}
-
-      numK = MIN(inputK, original_numK);
-    }
-
-  printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
-         numX, original_numK, numK);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  /* Create CPU data structures */
-  createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);
-
-  ComputePhiMagCPU(numK, phiR, phiI, phiMag);
-
-  kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));
-  int k;
-  #pragma omp parallel for
-  for (k = 0; k < numK; k++) {
-    kVals[k].Kx = kx[k];
-    kVals[k].Ky = ky[k];
-    kVals[k].Kz = kz[k];
-    kVals[k].PhiMag = phiMag[k];
-  }
-  ComputeQCPU(numK, numX, kVals, x, y, z, Qr, Qi);
-
-  if (params->outFile)
-    {
-      /* Write Q to file */
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      outputData(params->outFile, Qr, Qi, numX);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-  free (kx);
-  free (ky);
-  free (kz);
-  free (x);
-  free (y);
-  free (z);
-  free (phiR);
-  free (phiI);
-  free (phiMag);
-  free (kVals);
-  free (Qr);
-  free (Qi);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/Makefile b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/Makefile
deleted file mode 100644
index d896069f4b..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=main.o file.o computeQ.o ocl.o
-APP_CUDALDFLAGS=-lm
-APP_CFLAGS=-ffast-math -g3 -O3
-APP_CXXFLAGS=-ffast-math -g3 -O3
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/computeQ.c b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/computeQ.c
deleted file mode 100644
index cf38fe4089..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/computeQ.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <malloc.h>
-#include <CL/cl.h>
-#include "ocl.h"
-#include "macros.h"
-#include "computeQ.h"
-
-#define NC 4
-
-void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm)
-{
-  int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-  if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
-    phiMagBlocks++;
-  
-  size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-  size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-
-  cl_int clStatus;
-  clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(cl_mem),&phiR_d);
-  clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(cl_mem),&phiI_d);
-  clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&phiMag_d);
-  clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(int),&numK);
-  CHECK_ERROR("clSetKernelArg")
-
-  clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimPhiMagGrid,&DimPhiMagBlock,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueNDRangeKernel")
-}
-
-static
-unsigned long long int
-readElapsedTime(cl_event internal)
-{
-  cl_int status;
-  cl_ulong t_begin, t_end;
-  status = clGetEventProfilingInfo(internal, CL_PROFILING_COMMAND_START,
-    sizeof(cl_ulong), &t_begin, NULL);
-  if (status != CL_SUCCESS) return 0;
-  status = clGetEventProfilingInfo(internal, CL_PROFILING_COMMAND_END,
-  sizeof(cl_ulong), &t_end, NULL);
-  if (status != CL_SUCCESS) return 0;
-  return (unsigned long long int)(t_end - t_begin);
-}
-
-
-void computeQ_GPU (int numK,int numX,
-		   cl_mem x_d, cl_mem y_d, cl_mem z_d,
-		   struct kValues* kVals,
-		   cl_mem Qr_d, cl_mem Qi_d,
-		   clPrmtr* clPrm)
-{
-  int QGrids = numK / KERNEL_Q_K_ELEMS_PER_GRID;
-  if (numK % KERNEL_Q_K_ELEMS_PER_GRID)
-    QGrids++;
-  int QBlocks = numX / KERNEL_Q_THREADS_PER_BLOCK;
-  if (numX % KERNEL_Q_THREADS_PER_BLOCK)
-    QBlocks++;
-
-  size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC;
-  size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC;
-
-  cl_int clStatus;
-  cl_mem ck;
-  ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
-
-  int QGrid;
-  for (QGrid = 0; QGrid < QGrids; QGrid++) {
-    printf("Kernel Q call %d\n", QGrid);
-    // Put the tile of K values into constant mem
-    int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
-    struct kValues* kValsTile = kVals + QGridBase;
-    int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
-
-    clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,ck,CL_TRUE,0,numElems*sizeof(struct kValues),kValsTile,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    
-    clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(int),&numK);
-    clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(int),&QGridBase);
-    clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&x_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(cl_mem),&y_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,4,sizeof(cl_mem),&z_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,5,sizeof(cl_mem),&Qr_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,6,sizeof(cl_mem),&Qi_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,7,sizeof(cl_mem),&ck);
-    CHECK_ERROR("clSetKernelArg")
-
-    printf ("Grid: %d, Block: %d\n", DimQGrid, DimQBlock);
-
-    #define TIMED_EXECUTION
-    #ifdef TIMED_EXECUTION
-    cl_event e;
-    clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,&e);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
-    clWaitForEvents(1, &e);
-    printf ("%llu\n", readElapsedTime(e));
-    #else
-    clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
-    #endif
-  }
-}
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 float** Qr, float** Qi)
-{
-  *phiMag = (float* ) memalign(16, numK * sizeof(float));
-  *Qr = (float*) memalign(16, numX * sizeof (float));
-  *Qi = (float*) memalign(16, numX * sizeof (float));
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/computeQ.h b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/computeQ.h
deleted file mode 100644
index ec9192201a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/computeQ.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __COMPUTEQ__
-#define __COMPUTEQ__
-
-void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm);
-void computeQ_GPU (int numK,int numX,
-		   cl_mem x_d, cl_mem y_d, cl_mem z_d,
-		   struct kValues* kVals,
-		   cl_mem Qr_d, cl_mem Qi_d,
-		   clPrmtr* clPrm);
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 		  float** Qr, float** Qi);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/file.cc b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/file.cc
deleted file mode 100644
index cbfdb32937..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/file.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <endian.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <inttypes.h>
-
-#include "file.h"
-
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
-#endif
-
-extern "C"
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI)
-{
-  int numK, numX;
-  FILE* fid = fopen(fName, "r");
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-  fread (&numK, sizeof (int), 1, fid);
-  *_numK = numK;
-  fread (&numX, sizeof (int), 1, fid);
-  *_numX = numX;
-  *kx = (float *) memalign(16, numK * sizeof (float));
-  fread (*kx, sizeof (float), numK, fid);
-  *ky = (float *) memalign(16, numK * sizeof (float));
-  fread (*ky, sizeof (float), numK, fid);
-  *kz = (float *) memalign(16, numK * sizeof (float));
-  fread (*kz, sizeof (float), numK, fid);
-  *x = (float *) memalign(16, numX * sizeof (float));
-  fread (*x, sizeof (float), numX, fid);
-  *y = (float *) memalign(16, numX * sizeof (float));
-  fread (*y, sizeof (float), numX, fid);
-  *z = (float *) memalign(16, numX * sizeof (float));
-  fread (*z, sizeof (float), numX, fid);
-  *phiR = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiR, sizeof (float), numK, fid);
-  *phiI = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiI, sizeof (float), numK, fid);
-  fclose (fid); 
-}
-
-extern "C"
-void outputData(char* fName, float* outR, float* outI, int numX)
-{
-  FILE* fid = fopen(fName, "w");
-  uint32_t tmp32;
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the data size */
-  tmp32 = numX;
-  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
-
-  /* Write the reconstructed data */
-  fwrite (outR, sizeof (float), numX, fid);
-  fwrite (outI, sizeof (float), numX, fid);
-  fclose (fid);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/file.h b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/file.h
deleted file mode 100644
index c6a61ef4cd..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI);
-
-void outputData(char* fName, float* outR, float* outI, int numX);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/kernels.cl b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/kernels.cl
deleted file mode 100644
index 9c66c69124..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/kernels.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-#include "macros.h"
-
-#define NC  4
-#define KERNEL_Q_K_ELEMS_PER_GRID 1024
-#define COARSE_GENERAL
-// #define COARSE_SPEC NC
-
-__kernel void
-ComputePhiMag_GPU(__global float* phiR, __global float* phiI, __global float* phiMag, int numK) {
-  int indexK = get_global_id(0);
-  if (indexK < numK) {
-    float real = phiR[indexK];
-    float imag = phiI[indexK];
-    phiMag[indexK] = real*real + imag*imag;
-  }
-}
-
-__kernel void
-ComputeQ_GPU(int numK, int kGlobalIndex,
-	     __global float* x, __global float* y, __global float* z,
-	     __global float* Qr, __global float* Qi, __global struct kValues* ck) 
-{
-
-  float sX[NC];
-  float sY[NC];
-  float sZ[NC];
-  float sQr[NC];
-  float sQi[NC];
-
-  #pragma unroll
-  for (int tx = 0; tx < NC; tx++) {
-    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
-
-    sX[tx] = x[xIndex];
-    sY[tx] = y[xIndex];
-    sZ[tx] = z[xIndex];
-    sQr[tx] = Qr[xIndex];
-    sQi[tx] = Qi[xIndex];
-  }
-
-  // Loop over all elements of K in constant mem to compute a partial value
-  // for X.
-  int kIndex = 0;
-  for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK);
-       kIndex ++, kGlobalIndex ++) {
-    float kx = ck[kIndex].Kx;
-    float ky = ck[kIndex].Ky;
-    float kz = ck[kIndex].Kz;
-    float pm = ck[kIndex].PhiMag;
-
-    #pragma unroll
-    for (int tx = 0; tx < NC; tx++) {
-      float expArg = PIx2 *
-                   (kx * sX[tx] +
-                    ky * sY[tx] +
-                    kz * sZ[tx]);
-      sQr[tx] += pm * cos(expArg);
-      sQi[tx] += pm * sin(expArg);
-    }
-  }
-
-  #pragma unroll
-  for (int tx = 0; tx < NC; tx++) {
-    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
-    Qr[xIndex] = sQr[tx];
-    Qi[xIndex] = sQi[tx];
-  }
-
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/macros.h b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/macros.h
deleted file mode 100644
index d844cabfdd..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/macros.h
+++ /dev/null
@@ -1,21 +0,0 @@
-//#ifndef __MACROS__
-//#define __MACROS__
-
-#define PI   3.1415926535897932384626433832795029f
-#define PIx2 6.2831853071795864769252867665590058f
-
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define K_ELEMS_PER_GRID 2048
-
-#define KERNEL_PHI_MAG_THREADS_PER_BLOCK 256 /* 512 */
-#define KERNEL_Q_THREADS_PER_BLOCK 256
-#define KERNEL_Q_K_ELEMS_PER_GRID 1024
-
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-
-//#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/main.c b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/main.c
deleted file mode 100644
index 452762267e..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/main.c
+++ /dev/null
@@ -1,285 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * C code for creating the Q data structure for fast convolution-based 
- * Hessian multiplication for arbitrary k-space trajectories.
- *
- * Inputs:
- * kx - VECTOR of kx values, same length as ky and kz
- * ky - VECTOR of ky values, same length as kx and kz
- * kz - VECTOR of kz values, same length as kx and ky
- * x  - VECTOR of x values, same length as y and z
- * y  - VECTOR of y values, same length as x and z
- * z  - VECTOR of z values, same length as x and y
- * phi - VECTOR of the Fourier transform of the spatial basis 
- *      function, evaluated at [kx, ky, kz].  Same length as kx, ky, and kz.
- *
- * recommended g++ options:
- *  -O3 -lm -ffast-math -funroll-all-loops
- */
-
-#include <stdio.h>
-#include <sys/time.h>
-#include <parboil.h>
-#include <CL/cl.h>
-
-#include "ocl.h"
-#include "file.h"
-#include "macros.h"
-#include "computeQ.h"
-
-static void
-setupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr,clPrmtr* clPrm)
-{
-  cl_int clStatus;
-  *dev_ptr = clCreateBuffer(clPrm->clContext,CL_MEM_READ_ONLY,num*size,NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer");
-  clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
-  CHECK_ERROR("clEnequeueWriteBuffer");
-}
-
-static void
-cleanupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr, clPrmtr* clPrm)
-{
-  cl_int clStatus;
-  clStatus = clEnqueueReadBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueReadBuffer")
-  clStatus = clReleaseMemObject(*dev_ptr);
-  CHECK_ERROR("clReleaseMemObject")
-}
-
-int
-main (int argc, char *argv[]) {
-  int numX, numK;		/* Number of X and K values */
-  int original_numK;		/* Number of K values in input file */
-  float *kx, *ky, *kz;		/* K trajectory (3D vectors) */
-  float *x, *y, *z;		/* X coordinates (3D vectors) */
-  float *phiR, *phiI;		/* Phi values (complex) */
-  float *phiMag;		/* Magnitude of Phi */
-  float *Qr, *Qi;		/* Q signal (complex) */
-
-  struct kValues* kVals;
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  pb_InitializeTimerSet(&timers);
-
-  /* Read command line */
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-    {
-      fprintf(stderr, "Expecting one input filename\n");
-      exit(-1);
-    }
-  
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  inputData(params->inpFiles[0],
-	    &original_numK, &numX,
-	    &kx, &ky, &kz,
-	    &x, &y, &z,
-	    &phiR, &phiI);
-
-  /* Reduce the number of k-space samples if a number is given
-   * on the command line */
-  if (argc < 2)
-    numK = original_numK;
-  else
-    {
-      int inputK;
-      char *end;
-      inputK = strtol(argv[1], &end, 10);
-      if (end == argv[1])
-	{
-	  fprintf(stderr, "Expecting an integer parameter\n");
-	  exit(-1);
-	}
-
-      numK = MIN(inputK, original_numK);
-    }
-
-  printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
-         numX, original_numK, numK);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  clPrmtr clPrm;
-
-  cl_int clStatus;
-  cl_platform_id cpPlatform;
-  clStatus = clGetPlatformIDs(1,&cpPlatform,NULL);
-  CHECK_ERROR("clGetPlatformIDs")
-
-  cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)cpPlatform,0};
-
-  cl_device_id cdDevice;
-  clStatus = clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&cdDevice,NULL);
-  CHECK_ERROR("clGetDeviceIDs")
-
-  clPrm.clContext = clCreateContextFromType(cps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-  CHECK_ERROR("clCreateContextFromType")
-
-  clPrm.clCommandQueue = clCreateCommandQueue(clPrm.clContext,cdDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-  CHECK_ERROR("clCreateCommandQueue")
-
-  pb_SetOpenCL(&(clPrm.clContext), &(clPrm.clCommandQueue));
-
-  const char* clSource[] = {readFile("src/opencl/kernels.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
-  CHECK_ERROR("clCreateProgramWithSource")
-
-  char options[50];
-  sprintf(options,"-I src/opencl");
-  clStatus = clBuildProgram(clProgram,0,NULL,options,NULL,NULL);
-  if (clStatus != CL_SUCCESS) {
-    char buf[4096];
-    clGetProgramBuildInfo(clProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 4096, buf, NULL);
-    printf ("%s\n", buf);
-    CHECK_ERROR("clBuildProgram")
-  }
-
-  /* Create CPU data structures */
-  createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);
-
-  /* GPU section 1 (precompute PhiMag) */
-  {
-    clPrm.clKernel = clCreateKernel(clProgram,"ComputePhiMag_GPU",&clStatus);
-    CHECK_ERROR("clCreateKernel")    
-
-    /* Mirror several data structures on the device */
-    cl_mem phiR_d;
-    cl_mem phiI_d;
-    cl_mem phiMag_d;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    
-    setupMemoryGPU(numK,sizeof(float),&phiR_d,phiR,&clPrm);
-    setupMemoryGPU(numK,sizeof(float),&phiI_d,phiI,&clPrm);
-    phiMag_d = clCreateBuffer(clPrm.clContext,CL_MEM_WRITE_ONLY,numK*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    computePhiMag_GPU(numK, phiR_d, phiI_d, phiMag_d, &clPrm);
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    cleanupMemoryGPU(numK,sizeof(float),&phiMag_d,phiMag,&clPrm);
-
-    clStatus = clReleaseMemObject(phiR_d);
-    CHECK_ERROR("clReleaseMemObject")
-    clStatus = clReleaseMemObject(phiI_d);
-    CHECK_ERROR("clReleaseMemObject")
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));
-
-  int k;
-  for (k = 0; k < numK; k++) {
-    kVals[k].Kx = kx[k];
-    kVals[k].Ky = ky[k];
-    kVals[k].Kz = kz[k];
-    kVals[k].PhiMag = phiMag[k];
-  }
-
-  free(phiMag);
-  
-  clStatus = clReleaseKernel(clPrm.clKernel);
-
-  /* GPU section 2 */
-  {
-    clPrm.clKernel = clCreateKernel(clProgram,"ComputeQ_GPU",&clStatus);
-    CHECK_ERROR("clCreateKernel")
-
-    cl_mem x_d;
-    cl_mem y_d;
-    cl_mem z_d;
-    cl_mem Qr_d;
-    cl_mem Qi_d;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    setupMemoryGPU(numX,sizeof(float),&x_d,x,&clPrm);
-    setupMemoryGPU(numX,sizeof(float),&y_d,y,&clPrm);
-    setupMemoryGPU(numX,sizeof(float),&z_d,z,&clPrm);
-
-    Qr_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    clMemSet(&clPrm,Qr_d,0,numX*sizeof(float));
-    Qi_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    clMemSet(&clPrm,Qi_d,0,numX*sizeof(float));
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    computeQ_GPU(numK, numX, x_d, y_d, z_d, kVals, Qr_d, Qi_d, &clPrm);
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    clStatus = clReleaseMemObject(x_d);
-    CHECK_ERROR("clReleaseMemObject")
-    clStatus = clReleaseMemObject(y_d);
-    CHECK_ERROR("clReleaseMemObject")
-    clStatus = clReleaseMemObject(z_d);
-    CHECK_ERROR("clReleaseMemObject")
-    cleanupMemoryGPU(numX,sizeof(float),&Qr_d,Qr,&clPrm);
-    cleanupMemoryGPU(numX,sizeof(float),&Qi_d,Qi,&clPrm);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if (params->outFile)
-    {
-      /* Write Q to file */
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      outputData(params->outFile, Qr, Qi, numX);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-  free (kx);
-  free (ky);
-  free (kz);
-  free (x);
-  free (y);
-  free (z);
-  free (phiR);
-  free (phiI);
-  free (kVals);
-  free (Qr);
-  free (Qi);
-
-  free((void*)clSource[0]);
-
-  clStatus = clReleaseKernel(clPrm.clKernel);
-  clStatus = clReleaseProgram(clProgram);
-  clStatus = clReleaseCommandQueue(clPrm.clCommandQueue);
-  clStatus = clReleaseContext(clPrm.clContext);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/ocl.c b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/ocl.c
deleted file mode 100644
index e21a2c9ff7..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/ocl.c
+++ /dev/null
@@ -1,50 +0,0 @@
-#include <CL/cl.h>
-#include <stdio.h>
-#include <string.h>
-#include "ocl.h"
-
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
-
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
-
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-	buffer[size] = 0;
-        fclose(fp);
-        printf("Kernel file:\n%s\n", buffer);
-        return buffer;
-}
-
-void clMemSet(clPrmtr* clPrm, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/ocl.h b/hpvm/test/parboil/benchmarks/mri-q/src/opencl/ocl.h
deleted file mode 100644
index 51f152ccc0..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl/ocl.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __OCLH__
-#define __OCLH__
-
-typedef struct {
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_kernel clKernel;
-} clPrmtr;
-
-void clMemSet(clPrmtr*, cl_mem, int, size_t);
-char* readFile(const char*);
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
-  }
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/Makefile b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/Makefile
deleted file mode 100644
index d896069f4b..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=main.o file.o computeQ.o ocl.o
-APP_CUDALDFLAGS=-lm
-APP_CFLAGS=-ffast-math -g3 -O3
-APP_CXXFLAGS=-ffast-math -g3 -O3
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/computeQ.c b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/computeQ.c
deleted file mode 100644
index d9d60f9253..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/computeQ.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <malloc.h>
-#include <CL/cl.h>
-#include "ocl.h"
-#include "macros.h"
-#include "computeQ.h"
-
-void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm)
-{
-  int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-  if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
-    phiMagBlocks++;
-  
-  size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-  size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-
-  cl_int clStatus;
-  clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(cl_mem),&phiR_d);
-  clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(cl_mem),&phiI_d);
-  clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&phiMag_d);
-  clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(int),&numK);
-  CHECK_ERROR("clSetKernelArg")
-
-  clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimPhiMagGrid,&DimPhiMagBlock,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueNDRangeKernel")
-}
-
-void computeQ_GPU (int numK,int numX,
-		   cl_mem x_d, cl_mem y_d, cl_mem z_d,
-		   struct kValues* kVals,
-		   cl_mem Qr_d, cl_mem Qi_d,
-		   clPrmtr* clPrm)
-{
-  int QGrids = numK / KERNEL_Q_K_ELEMS_PER_GRID;
-  if (numK % KERNEL_Q_K_ELEMS_PER_GRID)
-    QGrids++;
-  int QBlocks = numX / KERNEL_Q_THREADS_PER_BLOCK;
-  if (numX % KERNEL_Q_THREADS_PER_BLOCK)
-    QBlocks++;
-
-  size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK;
-  size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK;
-
-  cl_int clStatus;
-  cl_mem ck;
-  ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_ONLY,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
-
-  int QGrid;
-  for (QGrid = 0; QGrid < QGrids; QGrid++) {
-    // Put the tile of K values into constant mem
-    int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
-    struct kValues* kValsTile = kVals + QGridBase;
-    int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
-
-    clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,ck,CL_TRUE,0,numElems*sizeof(struct kValues),kValsTile,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    
-    clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(int),&numK);
-    clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(int),&QGridBase);
-    clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&x_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(cl_mem),&y_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,4,sizeof(cl_mem),&z_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,5,sizeof(cl_mem),&Qr_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,6,sizeof(cl_mem),&Qi_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,7,sizeof(cl_mem),&ck);
-    CHECK_ERROR("clSetKernelArg")
-
-    clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
-  }
-}
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 float** Qr, float** Qi)
-{
-  *phiMag = (float* ) memalign(16, numK * sizeof(float));
-  *Qr = (float*) memalign(16, numX * sizeof (float));
-  *Qi = (float*) memalign(16, numX * sizeof (float));
-}
-
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/computeQ.h b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/computeQ.h
deleted file mode 100644
index ec9192201a..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/computeQ.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __COMPUTEQ__
-#define __COMPUTEQ__
-
-void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm);
-void computeQ_GPU (int numK,int numX,
-		   cl_mem x_d, cl_mem y_d, cl_mem z_d,
-		   struct kValues* kVals,
-		   cl_mem Qr_d, cl_mem Qi_d,
-		   clPrmtr* clPrm);
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 		  float** Qr, float** Qi);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/file.cc b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/file.cc
deleted file mode 100644
index cbfdb32937..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/file.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <endian.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <inttypes.h>
-
-#include "file.h"
-
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
-#endif
-
-extern "C"
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI)
-{
-  int numK, numX;
-  FILE* fid = fopen(fName, "r");
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-  fread (&numK, sizeof (int), 1, fid);
-  *_numK = numK;
-  fread (&numX, sizeof (int), 1, fid);
-  *_numX = numX;
-  *kx = (float *) memalign(16, numK * sizeof (float));
-  fread (*kx, sizeof (float), numK, fid);
-  *ky = (float *) memalign(16, numK * sizeof (float));
-  fread (*ky, sizeof (float), numK, fid);
-  *kz = (float *) memalign(16, numK * sizeof (float));
-  fread (*kz, sizeof (float), numK, fid);
-  *x = (float *) memalign(16, numX * sizeof (float));
-  fread (*x, sizeof (float), numX, fid);
-  *y = (float *) memalign(16, numX * sizeof (float));
-  fread (*y, sizeof (float), numX, fid);
-  *z = (float *) memalign(16, numX * sizeof (float));
-  fread (*z, sizeof (float), numX, fid);
-  *phiR = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiR, sizeof (float), numK, fid);
-  *phiI = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiI, sizeof (float), numK, fid);
-  fclose (fid); 
-}
-
-extern "C"
-void outputData(char* fName, float* outR, float* outI, int numX)
-{
-  FILE* fid = fopen(fName, "w");
-  uint32_t tmp32;
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the data size */
-  tmp32 = numX;
-  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
-
-  /* Write the reconstructed data */
-  fwrite (outR, sizeof (float), numX, fid);
-  fwrite (outI, sizeof (float), numX, fid);
-  fclose (fid);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/file.h
deleted file mode 100644
index c6a61ef4cd..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI);
-
-void outputData(char* fName, float* outR, float* outI, int numX);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/kernels.cl b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/kernels.cl
deleted file mode 100644
index 7633d9ec12..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/kernels.cl
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "macros.h"
-
-__kernel void
-ComputePhiMag_GPU(__global float* phiR, __global float* phiI, __global float* phiMag, int numK) {
-  int indexK = get_global_id(0);
-  if (indexK < numK) {
-    float real = phiR[indexK];
-    float imag = phiI[indexK];
-    phiMag[indexK] = real*real + imag*imag;
-  }
-}
-
-__kernel void
-ComputeQ_GPU(int numK, int kGlobalIndex,
-	     __global float* x, __global float* y, __global float* z,
-	     __global float* Qr, __global float* Qi, __constant struct kValues* ck) 
-{
-  float sX;
-  float sY;
-  float sZ;
-  float sQr;
-  float sQi;
-
-  // Determine the element of the X arrays computed by this thread
-  int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + get_local_id(0);
-
-  // Read block's X values from global mem to shared mem
-  sX = x[xIndex];
-  sY = y[xIndex];
-  sZ = z[xIndex];
-  sQr = Qr[xIndex];
-  sQi = Qi[xIndex];
-
-  // Loop over all elements of K in constant mem to compute a partial value
-  // for X.
-  int kIndex = 0;
-  if (numK % 2) {
-    float expArg = PIx2 * (ck[0].Kx * sX + ck[0].Ky * sY + ck[0].Kz * sZ);
-    sQr += ck[0].PhiMag * cos(expArg);
-    sQi += ck[0].PhiMag * sin(expArg);
-    kIndex++;
-    kGlobalIndex++;
-  }
-
-  for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK);
-       kIndex += 2, kGlobalIndex += 2) {
-    float expArg = PIx2 * (ck[kIndex].Kx * sX +
-			   ck[kIndex].Ky * sY +
-			   ck[kIndex].Kz * sZ);
-    sQr += ck[kIndex].PhiMag * cos(expArg);
-    sQi += ck[kIndex].PhiMag * sin(expArg);
-
-    int kIndex1 = kIndex + 1;
-    float expArg1 = PIx2 * (ck[kIndex1].Kx * sX +
-			    ck[kIndex1].Ky * sY +
-			    ck[kIndex1].Kz * sZ);
-    sQr += ck[kIndex1].PhiMag * cos(expArg1);
-    sQi += ck[kIndex1].PhiMag * sin(expArg1);
-  }
-
-  Qr[xIndex] = sQr;
-  Qi[xIndex] = sQi;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/macros.h b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/macros.h
deleted file mode 100644
index a95556541e..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/macros.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __MACROS__
-#define __MACROS__
-
-#define PI   3.1415926535897932384626433832795029f
-#define PIx2 6.2831853071795864769252867665590058f
-
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define K_ELEMS_PER_GRID 2048
-
-#define KERNEL_PHI_MAG_THREADS_PER_BLOCK 512
-#define KERNEL_Q_THREADS_PER_BLOCK 256
-#define KERNEL_Q_K_ELEMS_PER_GRID 1024
-
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/main.c
deleted file mode 100644
index 9f29df314d..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/main.c
+++ /dev/null
@@ -1,300 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * C code for creating the Q data structure for fast convolution-based 
- * Hessian multiplication for arbitrary k-space trajectories.
- *
- * Inputs:
- * kx - VECTOR of kx values, same length as ky and kz
- * ky - VECTOR of ky values, same length as kx and kz
- * kz - VECTOR of kz values, same length as kx and ky
- * x  - VECTOR of x values, same length as y and z
- * y  - VECTOR of y values, same length as x and z
- * z  - VECTOR of z values, same length as x and y
- * phi - VECTOR of the Fourier transform of the spatial basis 
- *      function, evaluated at [kx, ky, kz].  Same length as kx, ky, and kz.
- *
- * recommended g++ options:
- *  -O3 -lm -ffast-math -funroll-all-loops
- */
-
-#include <stdio.h>
-#include <sys/time.h>
-#include <parboil.h>
-#include <CL/cl.h>
-
-#include "ocl.h"
-#include "file.h"
-#include "macros.h"
-#include "computeQ.h"
-
-static void
-setupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr,clPrmtr* clPrm)
-{
-  cl_int clStatus;
-  *dev_ptr = clCreateBuffer(clPrm->clContext,CL_MEM_READ_ONLY,num*size,NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer");
-  clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
-  CHECK_ERROR("clEnequeueWriteBuffer");
-}
-
-static void
-cleanupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr, clPrmtr* clPrm)
-{
-  cl_int clStatus;
-  clStatus = clEnqueueReadBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueReadBuffer")
-  clStatus = clReleaseMemObject(*dev_ptr);
-  CHECK_ERROR("clReleaseMemObject")
-}
-
-int
-main (int argc, char *argv[]) {
-  int numX, numK;		/* Number of X and K values */
-  int original_numK;		/* Number of K values in input file */
-  float *kx, *ky, *kz;		/* K trajectory (3D vectors) */
-  float *x, *y, *z;		/* X coordinates (3D vectors) */
-  float *phiR, *phiI;		/* Phi values (complex) */
-  float *phiMag;		/* Magnitude of Phi */
-  float *Qr, *Qi;		/* Q signal (complex) */
-
-  struct kValues* kVals;
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  pb_InitializeTimerSet(&timers);
-
-  /* Read command line */
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-    {
-      fprintf(stderr, "Expecting one input filename\n");
-      exit(-1);
-    }
-  
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  inputData(params->inpFiles[0],
-	    &original_numK, &numX,
-	    &kx, &ky, &kz,
-	    &x, &y, &z,
-	    &phiR, &phiI);
-
-  /* Reduce the number of k-space samples if a number is given
-   * on the command line */
-  if (argc < 2)
-    numK = original_numK;
-  else
-    {
-      int inputK;
-      char *end;
-      inputK = strtol(argv[1], &end, 10);
-      if (end == argv[1])
-	{
-	  fprintf(stderr, "Expecting an integer parameter\n");
-	  exit(-1);
-	}
-
-      numK = MIN(inputK, original_numK);
-    }
-
-  printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
-         numX, original_numK, numK);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  clPrmtr clPrm;
-
-  cl_int clStatus;
-  cl_platform_id cpPlatform;
-  clStatus = clGetPlatformIDs(1,&cpPlatform,NULL);
-  CHECK_ERROR("clGetPlatformIDs")
-
-  cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)cpPlatform,0};
-
-  cl_device_id cdDevice;
-  clStatus = clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&cdDevice,NULL);
-  CHECK_ERROR("clGetDeviceIDs")
-
-  clPrm.clContext = clCreateContextFromType(cps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-  CHECK_ERROR("clCreateContextFromType")
-
-  clPrm.clCommandQueue = clCreateCommandQueue(clPrm.clContext,cdDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-  CHECK_ERROR("clCreateCommandQueue")
-
-  pb_SetOpenCL(&(clPrm.clContext), &(clPrm.clCommandQueue));
-
-  const char* clSource[] = {readFile("src/opencl_nvidia/kernels.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
-  CHECK_ERROR("clCreateProgramWithSource")
-
-  char options[50];
-  sprintf(options,"-I src/opencl_nvidia");
-  clStatus = clBuildProgram(clProgram,1,&cdDevice,options,NULL,NULL);
-  CHECK_ERROR("clBuildProgram")
-
-  /* Create CPU data structures */
-  createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);
-
-  /* GPU section 1 (precompute PhiMag) */
-  {
-    clPrm.clKernel = clCreateKernel(clProgram,"ComputePhiMag_GPU",&clStatus);
-    CHECK_ERROR("clCreateKernel")    
-  
-    // Query binary PTX size
-    size_t binarySize;
-    clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES,
-        sizeof(binarySize), &binarySize, NULL);
-    CHECK_ERROR("clGetProgramInfo");
-
-    printf("Binary Size = %ld\n", binarySize);
-    //  Read binary (PTX file) to memory buffer
-    unsigned char *binary = (unsigned char*) malloc(binarySize);
-    clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, sizeof(unsigned
-          char*), &binary, NULL);
-    CHECK_ERROR("clGetProgramInfo");
-
-    // Save PTX file to mysgemm.ptx
-    FILE* fp = fopen("mri-q.nvptx.s", "wb");
-    fwrite(binary, sizeof(char), binarySize, fp);
-    fclose(fp);
-    free(binary);
-
-
-    /* Mirror several data structures on the device */
-    cl_mem phiR_d;
-    cl_mem phiI_d;
-    cl_mem phiMag_d;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    
-    setupMemoryGPU(numK,sizeof(float),&phiR_d,phiR,&clPrm);
-    setupMemoryGPU(numK,sizeof(float),&phiI_d,phiI,&clPrm);
-    phiMag_d = clCreateBuffer(clPrm.clContext,CL_MEM_WRITE_ONLY,numK*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    computePhiMag_GPU(numK, phiR_d, phiI_d, phiMag_d, &clPrm);
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    cleanupMemoryGPU(numK,sizeof(float),&phiMag_d,phiMag,&clPrm);
-
-    clStatus = clReleaseMemObject(phiR_d);
-    CHECK_ERROR("clReleaseMemObject")
-    clStatus = clReleaseMemObject(phiI_d);
-    CHECK_ERROR("clReleaseMemObject")
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));
-
-  int k;
-  for (k = 0; k < numK; k++) {
-    kVals[k].Kx = kx[k];
-    kVals[k].Ky = ky[k];
-    kVals[k].Kz = kz[k];
-    kVals[k].PhiMag = phiMag[k];
-  }
-
-  free(phiMag);
-  
-  clStatus = clReleaseKernel(clPrm.clKernel);
-
-  /* GPU section 2 */
-  {
-    clPrm.clKernel = clCreateKernel(clProgram,"ComputeQ_GPU",&clStatus);
-    CHECK_ERROR("clCreateKernel")
-
-    cl_mem x_d;
-    cl_mem y_d;
-    cl_mem z_d;
-    cl_mem Qr_d;
-    cl_mem Qi_d;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    setupMemoryGPU(numX,sizeof(float),&x_d,x,&clPrm);
-    setupMemoryGPU(numX,sizeof(float),&y_d,y,&clPrm);
-    setupMemoryGPU(numX,sizeof(float),&z_d,z,&clPrm);
-
-    Qr_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    clMemSet(&clPrm,Qr_d,0,numX*sizeof(float));
-    Qi_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    clMemSet(&clPrm,Qi_d,0,numX*sizeof(float));
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    computeQ_GPU(numK, numX, x_d, y_d, z_d, kVals, Qr_d, Qi_d, &clPrm);
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    clStatus = clReleaseMemObject(x_d);
-    CHECK_ERROR("clReleaseMemObject")
-    clStatus = clReleaseMemObject(y_d);
-    CHECK_ERROR("clReleaseMemObject")
-    clStatus = clReleaseMemObject(z_d);
-    CHECK_ERROR("clReleaseMemObject")
-    cleanupMemoryGPU(numX,sizeof(float),&Qr_d,Qr,&clPrm);
-    cleanupMemoryGPU(numX,sizeof(float),&Qi_d,Qi,&clPrm);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if (params->outFile)
-    {
-      /* Write Q to file */
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      outputData(params->outFile, Qr, Qi, numX);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-  free (kx);
-  free (ky);
-  free (kz);
-  free (x);
-  free (y);
-  free (z);
-  free (phiR);
-  free (phiI);
-  free (kVals);
-  free (Qr);
-  free (Qi);
-
-  free((void*)clSource[0]);
-
-  clStatus = clReleaseKernel(clPrm.clKernel);
-  clStatus = clReleaseProgram(clProgram);
-  clStatus = clReleaseCommandQueue(clPrm.clCommandQueue);
-  clStatus = clReleaseContext(clPrm.clContext);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/ocl.c
deleted file mode 100644
index 2e6b21b908..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/ocl.c
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <CL/cl.h>
-#include <stdio.h>
-#include <string.h>
-#include "ocl.h"
-
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
-
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
-
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
-}
-
-void clMemSet(clPrmtr* clPrm, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
-}
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/ocl.h
deleted file mode 100644
index 51f152ccc0..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/opencl_nvidia/ocl.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __OCLH__
-#define __OCLH__
-
-typedef struct {
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_kernel clKernel;
-} clPrmtr;
-
-void clMemSet(clPrmtr*, cl_mem, int, size_t);
-char* readFile(const char*);
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
-  }
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/visc/Makefile b/hpvm/test/parboil/benchmarks/mri-q/src/visc/Makefile
deleted file mode 100644
index 7fe0c5ed24..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/visc/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-VISC_OBJS=mri-q.visc.ll
-APP_CUDALDFLAGS=-lm
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3 
diff --git a/hpvm/test/parboil/benchmarks/mri-q/src/visc/macros.h b/hpvm/test/parboil/benchmarks/mri-q/src/visc/macros.h
deleted file mode 100644
index 9ba2c5a3be..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/src/visc/macros.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __MACROS__
-#define __MACROS__
-
-#define PI   3.1415926535897932384626433832795029f
-#define PIx2 6.2831853071795864769252867665590058f
-
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define K_ELEMS_PER_GRID 2048
-
-#define KERNEL_PHI_MAG_THREADS_PER_BLOCK 256 /* 512 */
-#define KERNEL_Q_THREADS_PER_BLOCK 256
-#define KERNEL_Q_K_ELEMS_PER_GRID 1024
-
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/mri-q/tools/compare-output b/hpvm/test/parboil/benchmarks/mri-q/tools/compare-output
deleted file mode 100755
index 1e7cbe8de7..0000000000
--- a/hpvm/test/parboil/benchmarks/mri-q/tools/compare-output
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env python
-
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-import sys
-sys.path.insert(0, '../../common/python')
-
-import itertools
-import filecompare as fc
-import binaryfilecompare as bfc
-
-def compare_floats(ref_list, cmp_list):
-	# Lists should be the same length
-	if len(ref_list) != len(cmp_list): return False
-
-	# Absolute tolerance is 0.01% of the maximum value
-	# in the reference data
-	abstol = 1e-4 * max([abs(x) for x in ref_list])
-
-	# Numbers should be equal with a tolerance of 0.2%
-	# or the absolute tolerance, whichever is greater.
-	for (r, c) in zip(ref_list, cmp_list):
-		diff = abs(r - c)
-		if not (diff <= abstol or diff < 0.002 * abs(r)):
-			print r, c
-			# Floats mismatch
-			return False
-
-	# All numbers are within tolerance
-	return True
-
-size_err = "Output data size does not match expected size\n"
-recon_err = "Reconstructed image does not match the expected image\n"
-
-def compare_array(count):
-	# The file contains an array of real values followed by an
-	# array of imaginary values.  Compare them both.
-	return fc.Compare(bfc.many_float(2*count),
-		equal=compare_floats,
-		message=recon_err)
-
-comparison = fc.Then(
-	fc.Bind(fc.Compare(bfc.uint32, message=size_err), compare_array),
-	fc.Compare(bfc.eof))
-
-fc.default_main(comparison)
diff --git a/hpvm/test/parboil/benchmarks/nodeSwap/Makefile b/hpvm/test/parboil/benchmarks/nodeSwap/Makefile
deleted file mode 100644
index 67fea898b6..0000000000
--- a/hpvm/test/parboil/benchmarks/nodeSwap/Makefile
+++ /dev/null
@@ -1,39 +0,0 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil
-APP = nodeSwap
-
-# Default compile visc
-ifeq ($(VERSION),)
-  VERSION = opencl
-endif
-
-# Default use small test case
-ifeq ($(TEST),)
-  TEST = small
-endif
-
-ifeq ($(PLATFORM),)
-PLATFORM=default
-endif
-
-ifeq ($(ROWM),1)
-  APP_CFLAGS=-DROWM
-  APP_CXXFLAGS=-DROWM
-  APP_ISPCFLAGS=-DROWM
-endif
-
-BIN = $(addsuffix -$(VERSION), $(APP))
-
-SRCDIR = src/$(VERSION)
-BUILDDIR = build/$(VERSION)_$(PLATFORM)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
-
-MATRIX1 = $(DATASET_DIR)/$(TEST)/input/matrix1.txt
-MATRIX2 = $(DATASET_DIR)/$(TEST)/input/matrix2.txt
-REF_OUTPUT = $(DATASET_DIR)/$(TEST)/output/matrix3.txt
-RUNDIR = run/$(VERSION)/$(TEST)
-OUTPUT = $(RUNDIR)/matrix3.txt
-
-ARGS = -i $(MATRIX1),$(MATRIX2) -o $(OUTPUT)
-TOOL = diff
-
-include $(PARBOIL_ROOT)/common/mk/Makefile
diff --git a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/Makefile b/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/Makefile
deleted file mode 100644
index 0773bdabea..0000000000
--- a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=main.o io.o #compute_gold.o
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS+=-ffast-math -O3
-APP_CXXFLAGS+=-ffast-math -O3
-KERNEL_OBJS=kernel_offline.nvptx.s
diff --git a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/io.cc b/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/io.cc
deleted file mode 100644
index 0459837223..0000000000
--- a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/io.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include<fstream>
-#include<iostream>
-#include<vector>
-
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
-
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
-
-	char* buffer;
-
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
-
-	return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
-
-  float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-
-}
diff --git a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/kernel.cl b/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/kernel.cl
deleted file mode 100644
index 570943930d..0000000000
--- a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/kernel.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * Kernel of dense matrix-matrix multiplication kernel.
- */
-
-__kernel void matAdd( __global float *A, __global float *B, __global float* C, int dim )
-{
-    int i = get_group_id(0);
-    int j = get_local_id(0);
-
-    C[i*dim+j] = A[i*dim+j] + B[i*dim+j];
-}
diff --git a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/kernel_offline.cl b/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/kernel_offline.cl
deleted file mode 100644
index 700f15f8ae..0000000000
--- a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/kernel_offline.cl
+++ /dev/null
@@ -1,32 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * Kernel of dense matrix-matrix multiplication kernel.
- */
-
-__kernel void matAdd( __global float *A, __global float *B, __global float* C, int m, int n )
-{
-/*#ifdef ROWM*/
-    /*int i = get_group_id(0);*/
-    /*int j = get_local_id(0);*/
-/*#else*/
-    /*int i = get_local_id(0);*/
-    /*int j = get_group_id(0);*/
-/*#endif*/
-    /*C[i*n+j] = A[i*n+j] + B[i*n+j];*/
-#ifdef ROWM
-    int i = get_global_id(0);
-    for(int j=0; j < n; j++)
-      C[i*n+j] = A[i*n+j] + B[i*n+j];
-#else
-    int j = get_global_id(0);
-    for(int i=0; i < m; i++)
-      C[i*n+j] = A[i*n+j] + B[i*n+j];
-#endif
-}
diff --git a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/main.cc b/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/main.cc
deleted file mode 100644
index 23d2950c2a..0000000000
--- a/hpvm/test/parboil/benchmarks/nodeSwap/src/opencl/main.cc
+++ /dev/null
@@ -1,212 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <malloc.h>
-#include <vector>
-#include <iostream>
-#include <cassert>
-#include <CL/cl.h>
-#include <parboil.h>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
-
-// Parameters of tile sizes
-#define TILE_SZ 16
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
-
-void basicSgemm( int m, int n, cl_mem A, cl_mem B, cl_mem C, cl_kernel clKernel, cl_command_queue clCommandQueue )
-{
-  // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_SZ) || (n%TILE_SZ)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-      << "; n should be multiple of " << TILE_SZ << std::endl;
-  }
-
-//#ifdef ROWM
-  //size_t db = m;
-  //size_t dg = (m*n);
-//#else
-  //size_t db = n;
-  //size_t dg = (m*n);
-//#endif
-#ifdef ROWM
-  size_t dg = m;
-#else
-  size_t dg = n;
-#endif
-  cl_int clStatus;
-  //std::cout << "Block dim = " << db << ", Group dim = " << dg/db << "\n";
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&m);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&n);
-  CHECK_ERROR("clSetKernelArg")
-
-  //clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dg,&db,0,NULL,NULL);
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dg,NULL,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueNDRangeKernel")
-
-  clStatus = clFinish(clCommandQueue); 
-  CHECK_ERROR("clFinish")
-}
-
-int main (int argc, char *argv[]) {
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  size_t A_sz, B_sz, C_sz;
-  int matArow, matAcol;
-  int matBrow, matBcol;
-  std::vector<float> matA, matB;
-
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
-     in column-major layout*/
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      exit(-1);
-    }
-
-  /* Read in data */
-  // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
-  // load B^T
-  readColMajorMatrixFile(params->inpFiles[1],
-      matBrow, matBcol, matB);
-
-  assert(matArow == matBrow && matAcol == matBcol && "Dimensions of two input matrices should match");
-  pb_InitializeTimerSet(&timers);
-
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-  cl_int clStatus;
-  cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-  CHECK_ERROR("clGetPlatformIDs")
-
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-  CHECK_ERROR("clCreateContextFromType")
-   
-  cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-  CHECK_ERROR("clGetDeviceIDs")
-
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-  CHECK_ERROR("clCreateCommandQueue")
-
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-
-  // const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")};
-  // cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  cl_kernel clKernel;
-  cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary("build/opencl_default/kernel_offline.nvptx.s", "matAdd", &clContext, &clDevice, &clProgram, &clKernel);
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
-
-  //char clOptions[50];
-  //sprintf(clOptions,"");
-
-  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  //CHECK_ERROR("clBuildProgram")
-
-  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  //CHECK_ERROR("clCreateKernel")
-
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
-  B_sz = matBrow*matBcol*sizeof(float);
-
-  // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
-
-  // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer")
-
-  // Copy A and B^T into device memory
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matB.front(),0,NULL,NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
-
-  // Use standard sgemm interface
-  basicSgemm(matArow, matAcol, dA, dB, dC, clKernel, clCommandQueue);
-
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
-
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
-  clStatus = clReleaseKernel(clKernel);
-  clStatus = clReleaseProgram(clProgram);
-  clStatus = clReleaseMemObject(dA);
-  clStatus = clReleaseMemObject(dB);
-  clStatus = clReleaseMemObject(dC);
-  clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext); 
- 
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  
-  if (params->outFile) {
-   
-    /* Write C to file */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
-  }
-
-
-  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-  pb_FreeParameters(params);
-
-  //free((void*)clSource[0]);
-
- 
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/nodeSwap/tools/compare-output b/hpvm/test/parboil/benchmarks/nodeSwap/tools/compare-output
deleted file mode 100755
index 7ed53e5d98..0000000000
--- a/hpvm/test/parboil/benchmarks/nodeSwap/tools/compare-output
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python
-
-# (c) Copyright 2010 The Board of Trustees of the University of Illinois.
-
-import sys
-sys.path.insert(0, '../../common/python')
-
-import itertools
-
-import filecompare as fc
-import textfilecompare as tfc
-
-def compare_floats(ref_list, cmp_list):
-
-	# Lists should be the same length
-	if len(ref_list) != len(cmp_list):
-	  print "Different in length"
-	  print "ref=" + str(len(ref_list)) +" "+ str(ref_list[-2])
-	  print "cmp=" + str(len(cmp_list)) +" "+ str(cmp_list[-1])
-	  return False
-
-
-	# Numbers should be equal with a tolerance of 1%
-	# or 0.01, whichever is greater.
-	for (r, c) in zip(ref_list, cmp_list):
-		diff = abs(r - c)
-		if not (diff < 0.01 or diff < 0.01 * abs(r)):
-			# Floats mismatch
-			return False
-
-	# All numbers are within tolerance
-	return True
-
-err = "Computed values do not match the expected values\n"
-
-comparison = fc.Then(
-	fc.Compare(tfc.floats, equal=compare_floats, message=err),
-	fc.Compare(tfc.eof)
-	)
-
-fc.default_main(comparison)
diff --git a/hpvm/test/parboil/benchmarks/sad/DESCRIPTION b/hpvm/test/parboil/benchmarks/sad/DESCRIPTION
deleted file mode 100644
index 6c235e0f49..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/DESCRIPTION
+++ /dev/null
@@ -1,3 +0,0 @@
-A "sum of absolute differences" benchmark.  This benchmark is based on the full-pixel motion estimation algorithm found in the JM reference H.264 video encoder.  Motion estimation searches for blocks in one image that approximately match blocks in another image.  This benchmark computes SADs for pairs of blocks, where an SAD is one metric for how closely two images match.
-
-There are three kernels.  One kernel computes SADs for 4-by-4 blocks.  The next kernel consumes the first kernel's results to compute SADs for larger blocks, up to 8-by-8.  The last kernel computes SADs for blocks up to 16-by-16.  Each kernel uses the previous kernel's output.
diff --git a/hpvm/test/parboil/benchmarks/sad/Makefile b/hpvm/test/parboil/benchmarks/sad/Makefile
deleted file mode 100644
index 7967afcff3..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/Makefile
+++ /dev/null
@@ -1,37 +0,0 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil
-APP = sad
-
-# Default compile visc
-ifeq ($(VERSION),)
-  VERSION = visc
-endif
-
-# Default use small test case
-ifeq ($(TEST),)
-  TEST = default
-endif
-
-BIN = $(addsuffix -$(VERSION), $(APP))
-
-SRCDIR = src/$(VERSION)
-BUILDDIR = build/$(VERSION)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
-
-ifeq ($(TEST),default)
-  INPUT1 = $(DATASET_DIR)/default/input/reference.bin
-  INPUT2 = $(DATASET_DIR)/default/input/frame.bin
-  REF_OUTPUT = $(DATASET_DIR)/default/output/out.bin
-  RUNDIR = run/$(VERSION)/default
-  OUTPUT = $(RUNDIR)/out.bin
-else
-  INPUT1 = $(DATASET_DIR)/large/input/reference.bin
-  INPUT2 = $(DATASET_DIR)/large/input/frame.bin
-  REF_OUTPUT = $(DATASET_DIR)/large/output/out.bin
-  RUNDIR = run/$(VERSION)/large
-  OUTPUT = $(RUNDIR)/out.bin
-endif
-
-ARGS = -i $(INPUT1),$(INPUT2) -o $(OUTPUT)
-TOOL = tools/compare-output
-
-include $(PARBOIL_ROOT)/common/mk/Makefile
diff --git a/hpvm/test/parboil/benchmarks/sad/src/base/Makefile b/hpvm/test/parboil/benchmarks/sad/src/base/Makefile
deleted file mode 100644
index 9ec8fcbd32..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/base/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=c
-SRCDIR_OBJS=file.o image.o sad_cpu.o main.o 
diff --git a/hpvm/test/parboil/benchmarks/sad/src/base/file.c b/hpvm/test/parboil/benchmarks/sad/src/base/file.c
deleted file mode 100644
index 5187c7f7cc..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/base/file.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include "file.h"
-
-unsigned short
-read16u(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-short
-read16i(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-void
-write32u(FILE *f, unsigned int i)
-{
-  putc(i, f);
-  putc(i >> 8, f);
-  putc(i >> 16, f);
-  putc(i >> 24, f);
-}
-
-void
-write16u(FILE *f, unsigned short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
-
-void
-write16i(FILE *f, short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/base/file.h b/hpvm/test/parboil/benchmarks/sad/src/base/file.h
deleted file mode 100644
index 5d783e9134..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/base/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-unsigned short read16u(FILE *f);
-short read16i(FILE *f);
-
-void write32u(FILE *f, unsigned int i);
-void write16u(FILE *f, unsigned short h);
-void write16i(FILE *f, short h);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/base/image.c b/hpvm/test/parboil/benchmarks/sad/src/base/image.c
deleted file mode 100644
index d7ed0fcce3..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/base/image.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "file.h"
-#include "image.h"
-
-struct image_i16 *
-load_image(char *filename)
-{
-  FILE *infile;
-  short *data;
-  int w;
-  int h;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read image contents */
-  data = (short *)malloc(w * h * sizeof(short));
-  fread(data, sizeof(short), w * h, infile);
-
-  fclose(infile);
-
-  /* Create the return data structure */
-  {
-    struct image_i16 *ret =
-      (struct image_i16 *)malloc(sizeof(struct image_i16));
-    ret->width = w;
-    ret->height = h;
-    ret->data = data;
-    return ret;
-  }
-}
-
-void
-free_image(struct image_i16 *img)
-{
-  free(img->data);
-  free(img);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/base/image.h b/hpvm/test/parboil/benchmarks/sad/src/base/image.h
deleted file mode 100644
index 27fc3e0b35..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/base/image.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-struct image_i16
-{
-  int width;
-  int height;
-  short *data;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct image_i16 * load_image(char *filename);
-void free_image(struct image_i16 *);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/base/main.c b/hpvm/test/parboil/benchmarks/sad/src/base/main.c
deleted file mode 100644
index 667b892c48..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/base/main.c
+++ /dev/null
@@ -1,318 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include <inttypes.h>
-#include <parboil.h>
-
-#include "sad.h"
-#include "file.h"
-#include "image.h"
-
-static unsigned short *
-load_sads(char *filename);
-static void
-write_sads(char *filename,
-	   int image_width_macroblocks,
-	   int image_height_macroblocks,
-	   unsigned short *sads);
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads);
-
-/* FILE I/O */
-
-unsigned short *
-load_sads(char *filename)
-{
-  FILE *infile;
-  unsigned short *sads;
-  int w;
-  int h;
-  int sads_per_block;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions (measured in macroblocks) */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read SAD values.  Only interested in the 4x4 SAD values, which are
-   * at the end of the file. */
-  sads_per_block = MAX_POS_PADDED * (w * h);
-  fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR);
-
-  sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short));
-  fread(sads, sizeof(unsigned short), sads_per_block * 16, infile);
-  fclose(infile);
-
-  return sads;
-}
-
-/* Compare the reference SADs to the expected SADs.
- */
-void
-check_sads(unsigned short *sads_reference,
-	   unsigned short *sads_computed,
-	   int image_size_macroblocks)
-{
-  int block;
-
-  /* Check the 4x4 SAD values.  These are in sads_reference.
-   * Ignore the data at the beginning of sads_computed. */
-  sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks;
-
-  for (block = 0; block < image_size_macroblocks; block++)
-    {
-      int subblock;
-
-      for (subblock = 0; subblock < 16; subblock++)
-	{
-	  int sad_index;
-
-	  for (sad_index = 0; sad_index < MAX_POS; sad_index++)
-	    {
-	      int index =
-		(block * 16 + subblock) * MAX_POS_PADDED + sad_index;
-
-	      if (sads_reference[index] != sads_computed[index])
-		{
-#if 0
-		  /* Print exactly where the mismatch was seen */
-		  printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]);
-#else
-		  goto mismatch;
-#endif
-		}
-	    }
-	}
-    }
-
-  printf("Success.\n");
-  return;
-
- mismatch:
-  printf("Computed SADs do not match expected values.\n");
-}
-
-/* Extract the SAD data for a particular block type for a particular
- * macroblock from the array of SADs of that block type. */
-static inline void
-write_subblocks(FILE *outfile, unsigned short *subblock_array, int macroblock,
-		int count)
-{
-  int block;
-  int pos;
-
-  for (block = 0; block < count; block++)
-    {
-      unsigned short *vec = subblock_array +
-	(block + macroblock * count) * MAX_POS_PADDED;
-
-      /* Write all SADs for this sub-block */
-      for (pos = 0; pos < MAX_POS; pos++)
-	write16u(outfile, *vec++);
-    }
-}
-
-/* Write some SAD data to a file for output checking.
- *
- * All SAD values for six rows of macroblocks are written.
- * The six rows consist of the top two, middle two, and bottom two image rows.
- */
-void
-write_sads(char *filename,
-	   int mb_width,
-	   int mb_height,
-	   unsigned short *sads)
-{
-  FILE *outfile = fopen(filename, "w");
-  int mbs = mb_width * mb_height;
-  int row_indir;
-  int row_indices[6] = {0, 1,
-			mb_height / 2 - 1, mb_height / 2,
-			mb_height - 2, mb_height - 1};
-
-  if (outfile == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the number of output macroblocks */
-  write32u(outfile, mb_width * 6);
-
-  /* Write zeros */
-  write32u(outfile, 0);
-
-  /* Each row */
-  for (row_indir = 0; row_indir < 6; row_indir++)
-    {
-      int row = row_indices[row_indir];
-
-      /* Each block in row */
-      int block;
-      for (block = mb_width * row; block < mb_width * (row + 1); block++)
-	{
-	  int blocktype;
-
-	  /* Write SADs for all sub-block types */
-	  for (blocktype = 1; blocktype <= 7; blocktype++)
-	    write_subblocks(outfile,
-			    sads + SAD_TYPE_IX(blocktype, mbs),
-			    block,
-			    SAD_TYPE_CT(blocktype));
-	}
-    }
-
-  fclose(outfile);
-}
-
-/* FILE I/O for debugging */
-
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads)
-{
-  FILE *f = fopen(filename, "w");
-  int n;
-
-  write16u(f, width);
-  write16u(f, height);
-  for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) {
-    write16u(f, sads[n]);
-  }
-  fclose(f);
-}
-
-static void
-print_test_sad_vector(unsigned short *base, int macroblock, int count)
-{
-  int n;
-  int searchpos = 17*33+17;
-  for (n = 0; n < count; n++)
-    printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]);
-}
-
-static void
-print_test_sads(unsigned short *sads_computed,
-		int mbs)
-{
-  int macroblock = 0;
-  int blocktype;
-
-  for (blocktype = 1; blocktype <= 7; blocktype++)
-    {
-      printf("%d:", blocktype);
-      print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs),
-			    macroblock, SAD_TYPE_CT(blocktype));
-      puts("\n");
-    }
-}
-
-/* MAIN */
-
-int
-main(int argc, char **argv)
-{
-  struct image_i16 *ref_image;
-  struct image_i16 *cur_image;
-  unsigned short *sads_computed; /* SADs generated by the program */
-
-  int image_size_bytes;
-  int image_size_macroblocks;
-  int image_width_macroblocks;
-  int image_height_macroblocks;
-
-  struct pb_TimerSet timers;
-  struct pb_Parameters *params;
-
-  pb_InitializeTimerSet(&timers);
-  params = pb_ReadParameters(&argc, argv);
-
-  if (pb_Parameters_CountInputs(params) != 2)
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
-
-  /* Read input files */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  ref_image = load_image(params->inpFiles[0]);
-  cur_image = load_image(params->inpFiles[1]);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if ((ref_image->width != cur_image->width) ||
-      (ref_image->height != cur_image->height))
-    {
-      fprintf(stderr, "Input images must be the same size\n");
-      exit(-1);
-    }
-  if ((ref_image->width % 16) || (ref_image->height % 16))
-    {
-      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
-      exit(-1);
-    }
-
-  /* Compute parameters, allocate memory */
-  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
-  image_width_macroblocks = ref_image->width / 16;
-  image_height_macroblocks = ref_image->height / 16;
-  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;
-  
-  sads_computed = (unsigned short *)
-    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));
-
-  /* Run the kernel code */
-  sad4_cpu(sads_computed,
-	   (unsigned short *)cur_image->data,
-	   (unsigned short *)ref_image->data,
-	   ref_image->width / 16, ref_image->height / 16);
-  larger_sads(sads_computed, image_size_macroblocks);
-
-  /* Print output */
-  if (params->outFile)
-    {
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      write_sads(params->outFile,
-		 image_width_macroblocks,
-		 image_height_macroblocks,
-		 sads_computed);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-#if 0  /* Debugging */
-  print_test_sads(sads_computed, image_size_macroblocks);
-  write_sads_directly("sad-debug.bin",
-		      ref_image->width / 16, ref_image->height / 16,
-		      sads_computed);
-#endif
-
-  /* Free memory */
-  free(sads_computed);
-  free_image(ref_image);
-  free_image(cur_image);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/base/sad.h b/hpvm/test/parboil/benchmarks/sad/src/base/sad.h
deleted file mode 100644
index bfd8017fce..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/base/sad.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Search offsets within 16 pixels of (0,0) */
-#define SEARCH_RANGE 16
-
-/* The total search area is 33 pixels square */
-#define SEARCH_DIMENSION (2*SEARCH_RANGE+1)
-
-/* The total number of search positions is 33^2 */
-#define MAX_POS 1089
-
-/* This is padded to a multiple of 4 when allocating memory */
-#define MAX_POS_PADDED 1092
-
-/* VBSME block indices in the SAD array for different 
- * block sizes.  The index is computed from the
- * image size in macroblocks.  Block sizes are (height, width):
- *  1: 16 by 16 pixels, one block per macroblock
- *  2: 8  by 16 pixels, 2  blocks per macroblock
- *  3: 16 by 8  pixels, 2  blocks per macroblock
- *  4: 8  by 8  pixels, 4  blocks per macroblock
- *  5: 4  by 8  pixels, 8  blocks per macroblock
- *  6: 8  by 4  pixels, 8  blocks per macroblock
- *  7: 4  by 4  pixels, 16 blocks per macroblock
- */
-#define SAD_TYPE_1_IX(image_size) 0
-#define SAD_TYPE_2_IX(image_size) ((image_size)*MAX_POS_PADDED)
-#define SAD_TYPE_3_IX(image_size) ((image_size)*(3*MAX_POS_PADDED))
-#define SAD_TYPE_4_IX(image_size) ((image_size)*(5*MAX_POS_PADDED))
-#define SAD_TYPE_5_IX(image_size) ((image_size)*(9*MAX_POS_PADDED))
-#define SAD_TYPE_6_IX(image_size) ((image_size)*(17*MAX_POS_PADDED))
-#define SAD_TYPE_7_IX(image_size) ((image_size)*(25*MAX_POS_PADDED))
-
-#define SAD_TYPE_IX(n, image_size) \
-  ((n == 1) ? SAD_TYPE_1_IX(image_size) : \
-   ((n == 2) ? SAD_TYPE_2_IX(image_size) : \
-    ((n == 3) ? SAD_TYPE_3_IX(image_size) : \
-     ((n == 4) ? SAD_TYPE_4_IX(image_size) : \
-      ((n == 5) ? SAD_TYPE_5_IX(image_size) : \
-       ((n == 6) ? SAD_TYPE_6_IX(image_size) : \
-        (SAD_TYPE_7_IX(image_size) \
-	 )))))))
-
-#define SAD_TYPE_1_CT 1
-#define SAD_TYPE_2_CT 2
-#define SAD_TYPE_3_CT 2
-#define SAD_TYPE_4_CT 4
-#define SAD_TYPE_5_CT 8
-#define SAD_TYPE_6_CT 8
-#define SAD_TYPE_7_CT 16
-
-#define SAD_TYPE_CT(n) \
-  ((n == 1) ? SAD_TYPE_1_CT : \
-   ((n == 2) ? SAD_TYPE_2_CT : \
-    ((n == 3) ? SAD_TYPE_3_CT : \
-     ((n == 4) ? SAD_TYPE_4_CT : \
-      ((n == 5) ? SAD_TYPE_5_CT : \
-       ((n == 6) ? SAD_TYPE_6_CT : \
-        (SAD_TYPE_7_CT \
-	 )))))))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void sad4_cpu(unsigned short *blk_sad,
-	      unsigned short *frame,
-	      unsigned short *ref,
-	      int mb_width,
-	      int mb_height);
-
-void larger_sads(unsigned short *sads,
-		 int mbs);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/base/sad_cpu.c b/hpvm/test/parboil/benchmarks/sad/src/base/sad_cpu.c
deleted file mode 100644
index 48ee9f3454..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/base/sad_cpu.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* The slower but more readable SAD code */
-
-#include <stdlib.h>
-#include "sad.h"
-
-static void sad4_one_macroblock(unsigned short *blk_sad,
-				unsigned short *frame,
-				unsigned short *ref,
-				int frame_y,
-				int frame_x,
-				int mb_width,
-				int mb_height);
-
-void sad4_cpu(unsigned short *blk_sad,
-	      unsigned short *frame,
-	      unsigned short *ref,
-	      int mb_width,
-	      int mb_height)
-{
-  int mb_x, mb_y, bk_x, bk_y;
-  unsigned int frame_yoff;
-
-  /* Go to the starting offset in blk_sad */
-  blk_sad += SAD_TYPE_7_IX(mb_width * mb_height);
-
-  /* For each block */
-  for (mb_y = 0, frame_yoff = 0;
-       mb_y < mb_height;
-       mb_y++, frame_yoff += 256 * mb_width)
-    {
-      for (mb_x = 0; mb_x < mb_width; mb_x++)
-	{
-	  sad4_one_macroblock
-	    (blk_sad + (mb_y * mb_width + mb_x) * (SAD_TYPE_7_CT * MAX_POS_PADDED),
-	     frame + frame_yoff + mb_x * 16,
-	     ref,
-	     mb_y * 16,
-	     mb_x * 16,
-	     mb_width,
-	     mb_height);
-	}
-    }
-}
-
-void
-sad4_one_macroblock(unsigned short *macroblock_sad,
-		    unsigned short *frame,
-		    unsigned short *ref,
-		    int frame_y,
-		    int frame_x,
-		    int mb_width,
-		    int mb_height)
-{
-  int pos_x, pos_y;
-  int width = mb_width * 16;
-  int height = mb_height * 16;
-  int pos;			/* search position */
-
-  /* Each search position */
-  pos = 0;
-  for (pos_y = -SEARCH_RANGE; pos_y <= SEARCH_RANGE; pos_y++) {
-    for (pos_x = -SEARCH_RANGE; pos_x <= SEARCH_RANGE; pos_x++, pos++) {
-      int blky, blkx;
-
-      /* Each 4x4 block in the macroblock */
-      for (blky = 0; blky < 4; blky++) {
-	for (blkx = 0; blkx < 4; blkx++) {
-	  int y, x;
-	  unsigned short sad = 0;
-
-	  /* Each pixel */
-	  for (y = 0; y < 4; y++) {
-	    for (x = 0; x < 4; x++) {
-	      int ref_x, ref_y;
-	      unsigned int a, b;
-
-	      /* Get reference pixel coordinate,
-	       * clipped to image boundary */
-	      ref_x = frame_x + pos_x + (blkx*4) + x;
-	      if (ref_x < 0) ref_x = 0;
-	      if (ref_x >= width) ref_x = width - 1;
-
-	      ref_y = frame_y + pos_y + (blky*4) + y;
-	      if (ref_y < 0) ref_y = 0;
-	      if (ref_y >= height) ref_y = height - 1;
-
-	      b = ref[ref_y * width + ref_x];
-	      a = frame[(blky * 4 + y) * width + (blkx * 4 + x)];
-
-	      sad += abs(a - b);
-	    }
-	  }
-
-	  /* Save the SAD */
-	  macroblock_sad[MAX_POS_PADDED*(4*blky+blkx) + pos] = sad;
-	}
-      }
-    }
-  }
-}
-
-void larger_sads(unsigned short *sads, int mbs)
-{
-  int macroblock;
-  int block_x, block_y;
-  unsigned short *x, *y;	/* inputs to vector addition */
-  unsigned short *z;		/* output of vector addition */
-  int count;
-
-  for (macroblock = 0; macroblock < mbs; macroblock++)
-    {
-      /* Block type 6 */
-      for (block_y = 0; block_y < 2; block_y++)
-	for (block_x = 0; block_x < 4; block_x++)
-	  {
-	    x = sads + SAD_TYPE_7_IX(mbs) +
-	      macroblock * SAD_TYPE_7_CT * MAX_POS_PADDED +
-	      (8 * block_y + block_x) * MAX_POS_PADDED;
-	    y = x + 4 * MAX_POS_PADDED;
-	    z = sads + SAD_TYPE_6_IX(mbs) +
-	      macroblock * SAD_TYPE_6_CT * MAX_POS_PADDED +
-	      (4 * block_y + block_x) * MAX_POS_PADDED;
-
-	    for (count = 0; count < MAX_POS; count++) *z++ = *x++ + *y++;
-	  }
-
-      /* Block type 5 */
-      for (block_y = 0; block_y < 4; block_y++)
-	for (block_x = 0; block_x < 2; block_x++)
-	  {
-	    x = sads + SAD_TYPE_7_IX(mbs) +
-	      macroblock * SAD_TYPE_7_CT * MAX_POS_PADDED +
-	      (4 * block_y + 2 * block_x) * MAX_POS_PADDED;
-	    y = x + MAX_POS_PADDED;
-	    z = sads + SAD_TYPE_5_IX(mbs) +
-	      macroblock * SAD_TYPE_6_CT * MAX_POS_PADDED +
-	      (2 * block_y + block_x) * MAX_POS_PADDED;
-
-	    for (count = 0; count < MAX_POS; count++) *z++ = *x++ + *y++;
-	  }
-
-      /* Block type 4 */
-      for (block_y = 0; block_y < 2; block_y++)
-	for (block_x = 0; block_x < 2; block_x++)
-	  {
-	    x = sads + SAD_TYPE_5_IX(mbs) +
-	      macroblock * SAD_TYPE_5_CT * MAX_POS_PADDED +
-	      (4 * block_y + block_x) * MAX_POS_PADDED;
-	    y = x + 2 * MAX_POS_PADDED;
-	    z = sads + SAD_TYPE_4_IX(mbs) +
-	      macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED +
-	      (2 * block_y + block_x) * MAX_POS_PADDED;
-	    
-	    for (count = 0; count < MAX_POS; count++) *z++ = *x++ + *y++;
-	  }
-      
-      /* Block type 3 */
-      x = sads + SAD_TYPE_4_IX(mbs) +
-	macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED;
-      y = x + 2 * MAX_POS_PADDED;
-      z = sads + SAD_TYPE_3_IX(mbs) +
-	macroblock * SAD_TYPE_3_CT * MAX_POS_PADDED;
-      
-      for (count = 0; count < MAX_POS; count++) *z++ = *x++ + *y++;
-
-      x = sads + SAD_TYPE_4_IX(mbs) +
-	macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED +
-	MAX_POS_PADDED;
-      y = x + 2 * MAX_POS_PADDED;
-      z = sads + SAD_TYPE_3_IX(mbs) +
-	macroblock * SAD_TYPE_3_CT * MAX_POS_PADDED +
-	MAX_POS_PADDED;
-
-      for (count = 0; count < MAX_POS; count++) *z++ = *x++ + *y++;
-
-      /* Block type 2 */
-      x = sads + SAD_TYPE_4_IX(mbs) +
-	macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED;
-      y = x + MAX_POS_PADDED;
-      z = sads + SAD_TYPE_2_IX(mbs) +
-	macroblock * SAD_TYPE_2_CT * MAX_POS_PADDED;
-
-      for (count = 0; count < MAX_POS; count++) *z++ = *x++ + *y++;
-
-      x = sads + SAD_TYPE_4_IX(mbs) +
-	macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED +
-	2 * MAX_POS_PADDED;
-      y = x + MAX_POS_PADDED;
-      z = sads + SAD_TYPE_2_IX(mbs) +
-	macroblock * SAD_TYPE_2_CT * MAX_POS_PADDED +
-	MAX_POS_PADDED;
-
-      for (count = 0; count < MAX_POS; count++) *z++ = *x++ + *y++;
-
-      /* Block type 1 */
-      x = sads + SAD_TYPE_2_IX(mbs) +
-	macroblock * SAD_TYPE_2_CT * MAX_POS_PADDED;
-      y = x + MAX_POS_PADDED;
-      z = sads + SAD_TYPE_1_IX(mbs) +
-	macroblock * SAD_TYPE_1_CT * MAX_POS_PADDED;
-
-      for (count = 0; count < MAX_POS; count++) *z++ = *x++ + *y++;
-    }
-}
-
-
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cpu/Makefile b/hpvm/test/parboil/benchmarks/sad/src/cpu/Makefile
deleted file mode 100644
index 9ec8fcbd32..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cpu/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=c
-SRCDIR_OBJS=file.o image.o sad_cpu.o main.o 
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cpu/file.c b/hpvm/test/parboil/benchmarks/sad/src/cpu/file.c
deleted file mode 100644
index 5187c7f7cc..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cpu/file.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include "file.h"
-
-unsigned short
-read16u(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-short
-read16i(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-void
-write32u(FILE *f, unsigned int i)
-{
-  putc(i, f);
-  putc(i >> 8, f);
-  putc(i >> 16, f);
-  putc(i >> 24, f);
-}
-
-void
-write16u(FILE *f, unsigned short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
-
-void
-write16i(FILE *f, short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cpu/file.h b/hpvm/test/parboil/benchmarks/sad/src/cpu/file.h
deleted file mode 100644
index 5d783e9134..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cpu/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-unsigned short read16u(FILE *f);
-short read16i(FILE *f);
-
-void write32u(FILE *f, unsigned int i);
-void write16u(FILE *f, unsigned short h);
-void write16i(FILE *f, short h);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cpu/image.c b/hpvm/test/parboil/benchmarks/sad/src/cpu/image.c
deleted file mode 100644
index d7ed0fcce3..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cpu/image.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "file.h"
-#include "image.h"
-
-struct image_i16 *
-load_image(char *filename)
-{
-  FILE *infile;
-  short *data;
-  int w;
-  int h;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read image contents */
-  data = (short *)malloc(w * h * sizeof(short));
-  fread(data, sizeof(short), w * h, infile);
-
-  fclose(infile);
-
-  /* Create the return data structure */
-  {
-    struct image_i16 *ret =
-      (struct image_i16 *)malloc(sizeof(struct image_i16));
-    ret->width = w;
-    ret->height = h;
-    ret->data = data;
-    return ret;
-  }
-}
-
-void
-free_image(struct image_i16 *img)
-{
-  free(img->data);
-  free(img);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cpu/image.h b/hpvm/test/parboil/benchmarks/sad/src/cpu/image.h
deleted file mode 100644
index 27fc3e0b35..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cpu/image.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-struct image_i16
-{
-  int width;
-  int height;
-  short *data;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct image_i16 * load_image(char *filename);
-void free_image(struct image_i16 *);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cpu/main.c b/hpvm/test/parboil/benchmarks/sad/src/cpu/main.c
deleted file mode 100644
index ad2d6685e0..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cpu/main.c
+++ /dev/null
@@ -1,318 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include <inttypes.h>
-#include <parboil.h>
-
-#include "sad.h"
-#include "file.h"
-#include "image.h"
-
-static unsigned short *
-load_sads(char *filename);
-static void
-write_sads(char *filename,
-	   int image_width_macroblocks,
-	   int image_height_macroblocks,
-	   unsigned short *sads);
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads);
-
-/* FILE I/O */
-
-unsigned short *
-load_sads(char *filename)
-{
-  FILE *infile;
-  unsigned short *sads;
-  int w;
-  int h;
-  int sads_per_block;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions (measured in macroblocks) */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read SAD values.  Only interested in the 4x4 SAD values, which are
-   * at the end of the file. */
-  sads_per_block = MAX_POS_PADDED * (w * h);
-  fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR);
-
-  sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short));
-  fread(sads, sizeof(unsigned short), sads_per_block * 16, infile);
-  fclose(infile);
-
-  return sads;
-}
-
-/* Compare the reference SADs to the expected SADs.
- */
-void
-check_sads(unsigned short *sads_reference,
-	   unsigned short *sads_computed,
-	   int image_size_macroblocks)
-{
-  int block;
-
-  /* Check the 4x4 SAD values.  These are in sads_reference.
-   * Ignore the data at the beginning of sads_computed. */
-  sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks;
-
-  for (block = 0; block < image_size_macroblocks; block++)
-    {
-      int subblock;
-
-      for (subblock = 0; subblock < 16; subblock++)
-	{
-	  int sad_index;
-
-	  for (sad_index = 0; sad_index < MAX_POS; sad_index++)
-	    {
-	      int index =
-		(block * 16 + subblock) * MAX_POS_PADDED + sad_index;
-
-	      if (sads_reference[index] != sads_computed[index])
-		{
-#if 0
-		  /* Print exactly where the mismatch was seen */
-		  printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]);
-#else
-		  goto mismatch;
-#endif
-		}
-	    }
-	}
-    }
-
-  printf("Success.\n");
-  return;
-
- mismatch:
-  printf("Computed SADs do not match expected values.\n");
-}
-
-/* Extract the SAD data for a particular block type for a particular
- * macroblock from the array of SADs of that block type. */
-static inline void
-write_subblocks(FILE *outfile, unsigned short *subblock_array, int macroblock,
-		int count)
-{
-  int block;
-  int pos;
-
-  for (block = 0; block < count; block++)
-    {
-      unsigned short *vec = subblock_array +
-	(block + macroblock * count) * MAX_POS_PADDED;
-
-      /* Write all SADs for this sub-block */
-      for (pos = 0; pos < MAX_POS; pos++)
-	write16u(outfile, *vec++);
-    }
-}
-
-/* Write some SAD data to a file for output checking.
- *
- * All SAD values for six rows of macroblocks are written.
- * The six rows consist of the top two, middle two, and bottom two image rows.
- */
-void
-write_sads(char *filename,
-	   int mb_width,
-	   int mb_height,
-	   unsigned short *sads)
-{
-  FILE *outfile = fopen(filename, "w");
-  int mbs = mb_width * mb_height;
-  int row_indir;
-  int row_indices[6] = {0, 1,
-			mb_height / 2 - 1, mb_height / 2,
-			mb_height - 2, mb_height - 1};
-
-  if (outfile == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the number of output macroblocks */
-  write32u(outfile, mb_width * 6);
-
-  /* Write zeros */
-  write32u(outfile, 0);
-
-  /* Each row */
-  for (row_indir = 0; row_indir < 6; row_indir++)
-    {
-      int row = row_indices[row_indir];
-
-      /* Each block in row */
-      int block;
-      for (block = mb_width * row; block < mb_width * (row + 1); block++)
-	{
-	  int blocktype;
-
-	  /* Write SADs for all sub-block types */
-	  for (blocktype = 1; blocktype <= 7; blocktype++)
-	    write_subblocks(outfile,
-			    sads + SAD_TYPE_IX(blocktype, mbs),
-			    block,
-			    SAD_TYPE_CT(blocktype));
-	}
-    }
-
-  fclose(outfile);
-}
-
-/* FILE I/O for debugging */
-
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads)
-{
-  FILE *f = fopen(filename, "w");
-  int n;
-
-  write16u(f, width);
-  write16u(f, height);
-  for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) {
-    write16u(f, sads[n]);
-  }
-  fclose(f);
-}
-
-static void
-print_test_sad_vector(unsigned short *base, int macroblock, int count)
-{
-  int n;
-  int searchpos = 17*33+17;
-  for (n = 0; n < count; n++)
-    printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]);
-}
-
-static void
-print_test_sads(unsigned short *sads_computed,
-		int mbs)
-{
-  int macroblock = 5;
-  int blocktype;
-
-  for (blocktype = 1; blocktype <= 7; blocktype++)
-    {
-      printf("%d:", blocktype);
-      print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs),
-			    macroblock, SAD_TYPE_CT(blocktype));
-      puts("\n");
-    }
-}
-
-/* MAIN */
-
-int
-main(int argc, char **argv)
-{
-  struct image_i16 *ref_image;
-  struct image_i16 *cur_image;
-  unsigned short *sads_computed; /* SADs generated by the program */
-
-  int image_size_bytes;
-  int image_size_macroblocks;
-  int image_width_macroblocks;
-  int image_height_macroblocks;
-
-  struct pb_TimerSet timers;
-  struct pb_Parameters *params;
-
-  pb_InitializeTimerSet(&timers);
-  params = pb_ReadParameters(&argc, argv);
-
-  if (pb_Parameters_CountInputs(params) != 2)
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
-
-  /* Read input files */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  ref_image = load_image(params->inpFiles[0]);
-  cur_image = load_image(params->inpFiles[1]);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if ((ref_image->width != cur_image->width) ||
-      (ref_image->height != cur_image->height))
-    {
-      fprintf(stderr, "Input images must be the same size\n");
-      exit(-1);
-    }
-  if ((ref_image->width % 16) || (ref_image->height % 16))
-    {
-      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
-      exit(-1);
-    }
-
-  /* Compute parameters, allocate memory */
-  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
-  image_width_macroblocks = ref_image->width / 16;
-  image_height_macroblocks = ref_image->height / 16;
-  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;
-  
-  sads_computed = (unsigned short *)
-    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));
-
-  /* Run the kernel code */
-  sad4_cpu(sads_computed,
-	   (unsigned short *)cur_image->data,
-	   (unsigned short *)ref_image->data,
-	   ref_image->width / 16, ref_image->height / 16);
-  larger_sads(sads_computed, image_size_macroblocks);
-
-  /* Print output */
-  if (params->outFile)
-    {
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      write_sads(params->outFile,
-		 image_width_macroblocks,
-		 image_height_macroblocks,
-		 sads_computed);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-#if 0  /* Debugging */
-  print_test_sads(sads_computed, image_size_macroblocks);
-  write_sads_directly("sad-debug.bin",
-		      ref_image->width / 16, ref_image->height / 16,
-		      sads_computed);
-#endif
-
-  /* Free memory */
-  free(sads_computed);
-  free_image(ref_image);
-  free_image(cur_image);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cpu/sad.h b/hpvm/test/parboil/benchmarks/sad/src/cpu/sad.h
deleted file mode 100644
index bfd8017fce..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cpu/sad.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Search offsets within 16 pixels of (0,0) */
-#define SEARCH_RANGE 16
-
-/* The total search area is 33 pixels square */
-#define SEARCH_DIMENSION (2*SEARCH_RANGE+1)
-
-/* The total number of search positions is 33^2 */
-#define MAX_POS 1089
-
-/* This is padded to a multiple of 4 when allocating memory */
-#define MAX_POS_PADDED 1092
-
-/* VBSME block indices in the SAD array for different 
- * block sizes.  The index is computed from the
- * image size in macroblocks.  Block sizes are (height, width):
- *  1: 16 by 16 pixels, one block per macroblock
- *  2: 8  by 16 pixels, 2  blocks per macroblock
- *  3: 16 by 8  pixels, 2  blocks per macroblock
- *  4: 8  by 8  pixels, 4  blocks per macroblock
- *  5: 4  by 8  pixels, 8  blocks per macroblock
- *  6: 8  by 4  pixels, 8  blocks per macroblock
- *  7: 4  by 4  pixels, 16 blocks per macroblock
- */
-#define SAD_TYPE_1_IX(image_size) 0
-#define SAD_TYPE_2_IX(image_size) ((image_size)*MAX_POS_PADDED)
-#define SAD_TYPE_3_IX(image_size) ((image_size)*(3*MAX_POS_PADDED))
-#define SAD_TYPE_4_IX(image_size) ((image_size)*(5*MAX_POS_PADDED))
-#define SAD_TYPE_5_IX(image_size) ((image_size)*(9*MAX_POS_PADDED))
-#define SAD_TYPE_6_IX(image_size) ((image_size)*(17*MAX_POS_PADDED))
-#define SAD_TYPE_7_IX(image_size) ((image_size)*(25*MAX_POS_PADDED))
-
-#define SAD_TYPE_IX(n, image_size) \
-  ((n == 1) ? SAD_TYPE_1_IX(image_size) : \
-   ((n == 2) ? SAD_TYPE_2_IX(image_size) : \
-    ((n == 3) ? SAD_TYPE_3_IX(image_size) : \
-     ((n == 4) ? SAD_TYPE_4_IX(image_size) : \
-      ((n == 5) ? SAD_TYPE_5_IX(image_size) : \
-       ((n == 6) ? SAD_TYPE_6_IX(image_size) : \
-        (SAD_TYPE_7_IX(image_size) \
-	 )))))))
-
-#define SAD_TYPE_1_CT 1
-#define SAD_TYPE_2_CT 2
-#define SAD_TYPE_3_CT 2
-#define SAD_TYPE_4_CT 4
-#define SAD_TYPE_5_CT 8
-#define SAD_TYPE_6_CT 8
-#define SAD_TYPE_7_CT 16
-
-#define SAD_TYPE_CT(n) \
-  ((n == 1) ? SAD_TYPE_1_CT : \
-   ((n == 2) ? SAD_TYPE_2_CT : \
-    ((n == 3) ? SAD_TYPE_3_CT : \
-     ((n == 4) ? SAD_TYPE_4_CT : \
-      ((n == 5) ? SAD_TYPE_5_CT : \
-       ((n == 6) ? SAD_TYPE_6_CT : \
-        (SAD_TYPE_7_CT \
-	 )))))))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void sad4_cpu(unsigned short *blk_sad,
-	      unsigned short *frame,
-	      unsigned short *ref,
-	      int mb_width,
-	      int mb_height);
-
-void larger_sads(unsigned short *sads,
-		 int mbs);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cpu/sad_cpu.c b/hpvm/test/parboil/benchmarks/sad/src/cpu/sad_cpu.c
deleted file mode 100644
index a73900ee5f..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cpu/sad_cpu.c
+++ /dev/null
@@ -1,294 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* The optimized CPU-only version of the code, for performance measurement. */
-
-#include <stdlib.h>
-#include "sad.h"
-
-static short line[16];
-
-static void sad4_one_macroblock(unsigned short *blk_sad,
-				unsigned short *frame,
-				unsigned short *ref,
-				int org_y,
-				int org_x,
-				int mb_width,
-				int mb_height);
-
-static short *
-create_padded_row (short *ref, int y, int x, int height, int width);
-
-void sad4_cpu(unsigned short *blk_sad,
-	      unsigned short *frame,
-	      unsigned short *ref,
-	      int mb_width,
-	      int mb_height)
-{
-  int mb_x, mb_y, bk_x, bk_y;
-  unsigned int frame_yoff;
-
-  /* Go to the starting offset in blk_sad */
-  blk_sad += SAD_TYPE_7_IX(mb_width * mb_height);
-
-  /* For each block */
-  for (mb_y = 0, frame_yoff = 0;
-       mb_y < mb_height;
-       mb_y++, frame_yoff += 256 * mb_width)
-    {
-      for (mb_x = 0; mb_x < mb_width; mb_x++)
-	{
-	  sad4_one_macroblock
-	    (blk_sad + (mb_y * mb_width + mb_x) * (SAD_TYPE_7_CT * MAX_POS_PADDED),
-	     frame + frame_yoff + mb_x * 16,
-	     ref,
-	     mb_y * 16,
-	     mb_x * 16,
-	     mb_width,
-	     mb_height);
-	}
-    }
-}
-
-void
-sad4_one_macroblock(unsigned short *macroblock_sad,
-		    unsigned short *frame,
-		    unsigned short *ref,
-		    int frame_y,
-		    int frame_x,
-		    int mb_width,
-		    int mb_height)
-{
-  unsigned short frame_mb[256];	/* current macroblock in frame */
-  int pos_x, pos_y;
-  int blky, pixy;
-  int width = mb_width * 16;
-  int max_width = width - 17;
-  int height = mb_height * 16;
-  int max_height = height - 17;
-  int range_partly_outside;
-  int pos;			/* search position */
-
-  /* Make a local copy of frame */
-  {
-    int x, y;
-    for (y = 0; y < 16; y++)
-      for (x = 0; x < 16; x++)
-	frame_mb[16*y+x] = frame[width * y + x];
-  }
-
-  if ((frame_x >= SEARCH_RANGE) && (frame_x <= width - SEARCH_RANGE - 17) &&
-      (frame_y >= SEARCH_RANGE) && (frame_y <= height - SEARCH_RANGE - 17))
-    range_partly_outside = 0;
-  else
-    range_partly_outside = 1;
-
-  /* Each search position */
-  pos = 0;
-  for (pos_y = -SEARCH_RANGE; pos_y <= SEARCH_RANGE; pos_y++) {
-    for (pos_x = -SEARCH_RANGE; pos_x <= SEARCH_RANGE; pos_x++, pos++) {
-      int y;
-      int abs_x, abs_y;
-      int do_bounds_check;
-      short *curptr = frame_mb;
-      short *refptr;
-      unsigned short *sad_line = macroblock_sad;
-
-      abs_y = frame_y + pos_y;
-      abs_x = frame_x + pos_x;
-
-      do_bounds_check = range_partly_outside &&
-	!(abs_y >= 0 && abs_y <= max_height &&
-	  abs_x >= 0 && abs_x <= max_width);
-
-      for (blky = 0; blky < 4; blky++)
-	{
-	  int sad0 = 0, sad1 = 0, sad2 = 0, sad3 = 0;
-
-	  for (y = 0; y < 4; y++)
-	    {
-	      refptr = do_bounds_check
-		? create_padded_row(ref, abs_y, abs_x, height, width)
-		: (short *)(ref + abs_y * width + abs_x);
-
-	      abs_y++;
-
-	      sad0 += abs (*refptr++ - *curptr++);
-	      sad0 += abs (*refptr++ - *curptr++);
-	      sad0 += abs (*refptr++ - *curptr++);
-	      sad0 += abs (*refptr++ - *curptr++);
-	      sad1 += abs (*refptr++ - *curptr++);
-	      sad1 += abs (*refptr++ - *curptr++);
-	      sad1 += abs (*refptr++ - *curptr++);
-	      sad1 += abs (*refptr++ - *curptr++);
-	      sad2 += abs (*refptr++ - *curptr++);
-	      sad2 += abs (*refptr++ - *curptr++);
-	      sad2 += abs (*refptr++ - *curptr++);
-	      sad2 += abs (*refptr++ - *curptr++);
-	      sad3 += abs (*refptr++ - *curptr++);
-	      sad3 += abs (*refptr++ - *curptr++);
-	      sad3 += abs (*refptr++ - *curptr++);
-	      sad3 += abs (*refptr++ - *curptr++);
-	    }
-
-	  sad_line[pos] = sad0;
-	  sad_line[MAX_POS_PADDED + pos] = sad1;
-	  sad_line[MAX_POS_PADDED*2 + pos] = sad2;
-	  sad_line[MAX_POS_PADDED*3 + pos] = sad3;
-
-	  sad_line += MAX_POS_PADDED*4;
-	}
-    }
-  }
-}
-
-/* Return a row of 16 pixels starting at offset (x, y).  The row may lie
- * partly outside the image, in which case an appropriate row will be
- * constructed in 'line' and returned.  Otherwise, a reference to the
- * image is returned. */
-static short *
-create_padded_row (short *ref, int y, int x, int height, int width)
-{
-  int i, maxx;
-  short *row;
-
-  if (y < 0) y = 0;
-  else if (y >= height) y = height - 1;
-
-  row = ref + y * width;
-
-  if ((x >= 0) && (x <= width - 16)) return row + x;
-
-  i = 0;
-  /* Pad left edge of image */
-  for (; (x < 0) && (i < 16); x++, i++)
-    line[i] = row[0];
-
-  /* Copy row from image */
-  for (; (x < width) && (i < 16); x++, i++)
-    line[i] = row[x];
-
-  /* Pad right edge of image */
-  for (; i < 16; x++, i++)
-    line[i] = row[width - 1];
-
-  return line;
-}
-
-void larger_sads(unsigned short *sads, int mbs)
-{
-  int macroblock;
-  int block_x, block_y;
-  unsigned short *x, *y;	/* inputs to vector addition */
-  unsigned short *z;		/* output of vector addition */
-  int count;
-
-#if 0
-#define ADD_VECTORS() \
-  for (count = 0; count < MAX_POS; count++) *z++ = *x++ + *y++
-#else
-#define ADD_VECTORS() \
-  for (count = 0; count < (MAX_POS+1)/2; count++,z += 2, x += 2, y += 2) \
-    *(unsigned int *)z = *(unsigned int *)x + *(unsigned int *)y
-#endif
-
-  for (macroblock = 0; macroblock < mbs; macroblock++)
-    {
-      /* Block type 6 */
-      for (block_y = 0; block_y < 2; block_y++)
-	for (block_x = 0; block_x < 4; block_x++)
-	  {
-	    x = sads + SAD_TYPE_7_IX(mbs) +
-	      macroblock * SAD_TYPE_7_CT * MAX_POS_PADDED +
-	      (8 * block_y + block_x) * MAX_POS_PADDED;
-	    y = x + 4 * MAX_POS_PADDED;
-	    z = sads + SAD_TYPE_6_IX(mbs) +
-	      macroblock * SAD_TYPE_6_CT * MAX_POS_PADDED +
-	      (4 * block_y + block_x) * MAX_POS_PADDED;
-
-	    ADD_VECTORS();
-	  }
-
-      /* Block type 5 */
-      for (block_y = 0; block_y < 4; block_y++)
-	for (block_x = 0; block_x < 2; block_x++)
-	  {
-	    x = sads + SAD_TYPE_7_IX(mbs) +
-	      macroblock * SAD_TYPE_7_CT * MAX_POS_PADDED +
-	      (4 * block_y + 2 * block_x) * MAX_POS_PADDED;
-	    y = x + MAX_POS_PADDED;
-	    z = sads + SAD_TYPE_5_IX(mbs) +
-	      macroblock * SAD_TYPE_6_CT * MAX_POS_PADDED +
-	      (2 * block_y + block_x) * MAX_POS_PADDED;
-
-	    ADD_VECTORS();
-	  }
-
-      /* Block type 4 */
-      for (block_y = 0; block_y < 2; block_y++)
-	for (block_x = 0; block_x < 2; block_x++)
-	  {
-	    x = sads + SAD_TYPE_5_IX(mbs) +
-	      macroblock * SAD_TYPE_5_CT * MAX_POS_PADDED +
-	      (4 * block_y + block_x) * MAX_POS_PADDED;
-	    y = x + 2 * MAX_POS_PADDED;
-	    z = sads + SAD_TYPE_4_IX(mbs) +
-	      macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED +
-	      (2 * block_y + block_x) * MAX_POS_PADDED;
-	    
-	    ADD_VECTORS();
-	  }
-      
-      /* Block type 3 */
-      x = sads + SAD_TYPE_4_IX(mbs) +
-	macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED;
-      y = x + 2 * MAX_POS_PADDED;
-      z = sads + SAD_TYPE_3_IX(mbs) +
-	macroblock * SAD_TYPE_3_CT * MAX_POS_PADDED;
-      
-      ADD_VECTORS();
-
-      x = sads + SAD_TYPE_4_IX(mbs) +
-	macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED +
-	MAX_POS_PADDED;
-      y = x + 2 * MAX_POS_PADDED;
-      z = sads + SAD_TYPE_3_IX(mbs) +
-	macroblock * SAD_TYPE_3_CT * MAX_POS_PADDED +
-	MAX_POS_PADDED;
-
-      ADD_VECTORS();
-
-      /* Block type 2 */
-      x = sads + SAD_TYPE_4_IX(mbs) +
-	macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED;
-      y = x + MAX_POS_PADDED;
-      z = sads + SAD_TYPE_2_IX(mbs) +
-	macroblock * SAD_TYPE_2_CT * MAX_POS_PADDED;
-
-      ADD_VECTORS();
-
-      x = sads + SAD_TYPE_4_IX(mbs) +
-	macroblock * SAD_TYPE_4_CT * MAX_POS_PADDED +
-	2 * MAX_POS_PADDED;
-      y = x + MAX_POS_PADDED;
-      z = sads + SAD_TYPE_2_IX(mbs) +
-	macroblock * SAD_TYPE_2_CT * MAX_POS_PADDED +
-	MAX_POS_PADDED;
-
-      ADD_VECTORS();
-
-      /* Block type 1 */
-      x = sads + SAD_TYPE_2_IX(mbs) +
-	macroblock * SAD_TYPE_2_CT * MAX_POS_PADDED;
-      y = x + MAX_POS_PADDED;
-      z = sads + SAD_TYPE_1_IX(mbs) +
-	macroblock * SAD_TYPE_1_CT * MAX_POS_PADDED;
-
-      ADD_VECTORS();
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/Makefile b/hpvm/test/parboil/benchmarks/sad/src/cuda/Makefile
deleted file mode 100644
index 230deb28ec..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=cuda
-SRCDIR_OBJS=file.o image.o sad4.o largerBlocks.o main.o
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/file.c b/hpvm/test/parboil/benchmarks/sad/src/cuda/file.c
deleted file mode 100644
index 5187c7f7cc..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/file.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include "file.h"
-
-unsigned short
-read16u(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-short
-read16i(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-void
-write32u(FILE *f, unsigned int i)
-{
-  putc(i, f);
-  putc(i >> 8, f);
-  putc(i >> 16, f);
-  putc(i >> 24, f);
-}
-
-void
-write16u(FILE *f, unsigned short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
-
-void
-write16i(FILE *f, short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/file.h b/hpvm/test/parboil/benchmarks/sad/src/cuda/file.h
deleted file mode 100644
index 5d783e9134..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-unsigned short read16u(FILE *f);
-short read16i(FILE *f);
-
-void write32u(FILE *f, unsigned int i);
-void write16u(FILE *f, unsigned short h);
-void write16i(FILE *f, short h);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/image.c b/hpvm/test/parboil/benchmarks/sad/src/cuda/image.c
deleted file mode 100644
index d7ed0fcce3..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/image.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "file.h"
-#include "image.h"
-
-struct image_i16 *
-load_image(char *filename)
-{
-  FILE *infile;
-  short *data;
-  int w;
-  int h;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read image contents */
-  data = (short *)malloc(w * h * sizeof(short));
-  fread(data, sizeof(short), w * h, infile);
-
-  fclose(infile);
-
-  /* Create the return data structure */
-  {
-    struct image_i16 *ret =
-      (struct image_i16 *)malloc(sizeof(struct image_i16));
-    ret->width = w;
-    ret->height = h;
-    ret->data = data;
-    return ret;
-  }
-}
-
-void
-free_image(struct image_i16 *img)
-{
-  free(img->data);
-  free(img);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/image.h b/hpvm/test/parboil/benchmarks/sad/src/cuda/image.h
deleted file mode 100644
index 27fc3e0b35..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/image.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-struct image_i16
-{
-  int width;
-  int height;
-  short *data;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct image_i16 * load_image(char *filename);
-void free_image(struct image_i16 *);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/largerBlocks.cu b/hpvm/test/parboil/benchmarks/sad/src/cuda/largerBlocks.cu
deleted file mode 100644
index 288cf7dcb7..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/largerBlocks.cu
+++ /dev/null
@@ -1,125 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include "sad.h"
-#include "largerBlocks.h"
-
-typedef struct {
-  unsigned short x;
-  unsigned short y;
-} __align__(4) uhvec;
-
-typedef unsigned int uint;
-
-__global__ void larger_sad_calc_8(unsigned short *blk_sad,
-				  int mb_width,
-				  int mb_height)
-{
-  int tx = threadIdx.y & 1;
-  int ty = threadIdx.y >> 1;
-
-  /* Macroblock and sub-block coordinates */
-  int mb_x = blockIdx.x;
-  int mb_y = blockIdx.y;
-
-  /* Number of macroblocks in a frame */
-  int macroblocks = __mul24(mb_width, mb_height);
-  int macroblock_index = (__mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  int search_pos;
-
-  unsigned short *bi;
-  unsigned short *bo_6, *bo_5, *bo_4;
-
-  bi = blk_sad
-    + (__mul24(macroblocks, 25) + (ty * 8 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 16;
-
-  // Block type 6: 4x8
-  bo_6 = blk_sad
-    + ((macroblocks << 4) + macroblocks + (ty * 4 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 8;
-
-  if (ty < 100) // always true, but improves register allocation
-    {
-      // Block type 5: 8x4
-      bo_5 = blk_sad
-	+ ((macroblocks << 3) + macroblocks + (ty * 4 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 8;
-
-      // Block type 4: 8x8
-      bo_4 = blk_sad
-	+ ((macroblocks << 2) + macroblocks + (ty * 2 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 4;
-    }
-
-  for (search_pos = threadIdx.x; search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-      /* Each uint is actually two 2-byte integers packed together.
-       * Only addition is used and there is no chance of integer overflow
-       * so this can be done to reduce computation time. */
-      uint i00 = ((uint *)bi)[search_pos];
-      uint i01 = ((uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((uint *)bi)[search_pos + 4*MAX_POS_PADDED/2];
-      uint i11 = ((uint *)bi)[search_pos + 5*MAX_POS_PADDED/2];
-
-      ((uint *)bo_6)[search_pos]                  = i00 + i10;
-      ((uint *)bo_6)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((uint *)bo_5)[search_pos]                  = i00 + i01;
-      ((uint *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = i10 + i11;
-      ((uint *)bo_4)[search_pos]                  = (i00 + i01) + (i10 + i11);
-    }
-}
-
-__global__ void larger_sad_calc_16(unsigned short *blk_sad,
-				   int mb_width,
-				   int mb_height)
-{
-  /* Macroblock coordinates */
-  int mb_x = blockIdx.x;
-  int mb_y = blockIdx.y;
-
-  /* Number of macroblocks in a frame */
-  int macroblocks = __mul24(mb_width, mb_height) * MAX_POS_PADDED;
-  int macroblock_index = (__mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  int search_pos;
-
-  unsigned short *bi;
-  unsigned short *bo_3, *bo_2, *bo_1;
-
-  //bi = blk_sad + macroblocks * 5 + macroblock_index * 4;
-  bi = blk_sad + ((macroblocks + macroblock_index) << 2) + macroblocks;
-
-  // Block type 3: 8x16
-  //bo_3 = blk_sad + macroblocks * 3 + macroblock_index * 2;
-  bo_3 = blk_sad + ((macroblocks + macroblock_index) << 1) + macroblocks;
-
-  // Block type 5: 8x4
-  bo_2 = blk_sad + macroblocks + macroblock_index * 2;
-
-  // Block type 4: 8x8
-  bo_1 = blk_sad + macroblock_index;
-
-  for (search_pos = threadIdx.x; search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-      /* Each uint is actually two 2-byte integers packed together.
-       * Only addition is used and there is no chance of integer overflow
-       * so this can be done to reduce computation time. */
-      uint i00 = ((uint *)bi)[search_pos];
-      uint i01 = ((uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((uint *)bi)[search_pos + 2*MAX_POS_PADDED/2];
-      uint i11 = ((uint *)bi)[search_pos + 3*MAX_POS_PADDED/2];
-
-      ((uint *)bo_3)[search_pos]                  = i00 + i10;
-      ((uint *)bo_3)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((uint *)bo_2)[search_pos]                  = i00 + i01;
-      ((uint *)bo_2)[search_pos+MAX_POS_PADDED/2] = i10 + i11;
-      ((uint *)bo_1)[search_pos]                  = (i00 + i01) + (i10 + i11);
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/largerBlocks.h b/hpvm/test/parboil/benchmarks/sad/src/cuda/largerBlocks.h
deleted file mode 100644
index 13006271a8..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/largerBlocks.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-__global__ void larger_sad_calc_8(unsigned short*, int, int);
-__global__ void larger_sad_calc_16(unsigned short*, int, int);
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/main.cu b/hpvm/test/parboil/benchmarks/sad/src/cuda/main.cu
deleted file mode 100644
index 601f904578..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/main.cu
+++ /dev/null
@@ -1,406 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include <inttypes.h>
-#include <parboil.h>
-#include <cuda.h>
-
-#include "sad.h"
-#include "sad4.h"
-#include "largerBlocks.h"
-#include "file.h"
-#include "image.h"
-
-#define CUDA_ERRCK \
-  {cudaError_t err = cudaGetLastError(); \
-    if (err) fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); \
-  }
-
-static unsigned short *
-load_sads(char *filename);
-static void
-write_sads(char *filename,
-	   int image_width_macroblocks,
-	   int image_height_macroblocks,
-	   unsigned short *sads);
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads);
-
-/* FILE I/O */
-
-unsigned short *
-load_sads(char *filename)
-{
-  FILE *infile;
-  unsigned short *sads;
-  int w;
-  int h;
-  int sads_per_block;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions (measured in macroblocks) */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read SAD values.  Only interested in the 4x4 SAD values, which are
-   * at the end of the file. */
-  sads_per_block = MAX_POS_PADDED * (w * h);
-  fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR);
-
-  sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short));
-  fread(sads, sizeof(unsigned short), sads_per_block * 16, infile);
-  fclose(infile);
-
-  return sads;
-}
-
-/* Compare the reference SADs to the expected SADs.
- */
-void
-check_sads(unsigned short *sads_reference,
-	   unsigned short *sads_computed,
-	   int image_size_macroblocks)
-{
-  int block;
-
-  /* Check the 4x4 SAD values.  These are in sads_reference.
-   * Ignore the data at the beginning of sads_computed. */
-  sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks;
-
-  for (block = 0; block < image_size_macroblocks; block++)
-    {
-      int subblock;
-
-      for (subblock = 0; subblock < 16; subblock++)
-	{
-	  int sad_index;
-
-	  for (sad_index = 0; sad_index < MAX_POS; sad_index++)
-	    {
-	      int index =
-		(block * 16 + subblock) * MAX_POS_PADDED + sad_index;
-
-	      if (sads_reference[index] != sads_computed[index])
-		{
-#if 0
-		  /* Print exactly where the mismatch was seen */
-		  printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]);
-#else
-		  goto mismatch;
-#endif
-		}
-	    }
-	}
-    }
-
-  printf("Success.\n");
-  return;
-
- mismatch:
-  printf("Computed SADs do not match expected values.\n");
-}
-
-/* Extract the SAD data for a particular block type for a particular
- * macroblock from the array of SADs of that block type. */
-static inline void
-write_subblocks(FILE *outfile, unsigned short *subblock_array, int macroblock,
-		int count)
-{
-  int block;
-  int pos;
-
-  for (block = 0; block < count; block++)
-    {
-      unsigned short *vec = subblock_array +
-	(block + macroblock * count) * MAX_POS_PADDED;
-
-      /* Write all SADs for this sub-block */
-      for (pos = 0; pos < MAX_POS; pos++)
-	write16u(outfile, *vec++);
-    }
-}
-
-/* Write some SAD data to a file for output checking.
- *
- * All SAD values for six rows of macroblocks are written.
- * The six rows consist of the top two, middle two, and bottom two image rows.
- */
-void
-write_sads(char *filename,
-	   int mb_width,
-	   int mb_height,
-	   unsigned short *sads)
-{
-  FILE *outfile = fopen(filename, "w");
-  int mbs = mb_width * mb_height;
-  int row_indir;
-  int row_indices[6] = {0, 1,
-			mb_height / 2 - 1, mb_height / 2,
-			mb_height - 2, mb_height - 1};
-
-  if (outfile == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the number of output macroblocks */
-  write32u(outfile, mb_width * 6);
-
-  /* Write zeros */
-  write32u(outfile, 0);
-
-  /* Each row */
-  for (row_indir = 0; row_indir < 6; row_indir++)
-    {
-      int row = row_indices[row_indir];
-
-      /* Each block in row */
-      int block;
-      for (block = mb_width * row; block < mb_width * (row + 1); block++)
-	{
-	  int blocktype;
-
-	  /* Write SADs for all sub-block types */
-	  for (blocktype = 1; blocktype <= 7; blocktype++)
-	    write_subblocks(outfile,
-			    sads + SAD_TYPE_IX(blocktype, mbs),
-			    block,
-			    SAD_TYPE_CT(blocktype));
-	}
-    }
-
-  fclose(outfile);
-}
-
-/* FILE I/O for debugging */
-
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads)
-{
-  FILE *f = fopen(filename, "w");
-  int n;
-
-  write16u(f, width);
-  write16u(f, height);
-  for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) {
-    write16u(f, sads[n]);
-  }
-  fclose(f);
-}
-
-static void
-print_test_sad_vector(unsigned short *base, int macroblock, int count)
-{
-  int n;
-  int searchpos = 17*33+17;
-  for (n = 0; n < count; n++)
-    printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]);
-}
-
-static void
-print_test_sads(unsigned short *sads_computed,
-		int mbs)
-{
-  int macroblock = 5;
-  int blocktype;
-
-  for (blocktype = 1; blocktype <= 7; blocktype++)
-    {
-      printf("%d:", blocktype);
-      print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs),
-			    macroblock, SAD_TYPE_CT(blocktype));
-      puts("\n");
-    }
-}
-
-/* MAIN */
-
-int
-main(int argc, char **argv)
-{
-  struct image_i16 *ref_image;
-  struct image_i16 *cur_image;
-  unsigned short *sads_computed; /* SADs generated by the program */
-
-  int image_size_bytes;
-  int image_width_macroblocks, image_height_macroblocks;
-  int image_size_macroblocks;
-
-  struct pb_TimerSet timers;
-  struct pb_Parameters *params;
-
-  pb_InitializeTimerSet(&timers);
-  params = pb_ReadParameters(&argc, argv);
-
-  if (pb_Parameters_CountInputs(params) != 2)
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
-
-  /* Read input files */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  ref_image = load_image(params->inpFiles[0]);
-  cur_image = load_image(params->inpFiles[1]);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if ((ref_image->width != cur_image->width) ||
-      (ref_image->height != cur_image->height))
-    {
-      fprintf(stderr, "Input images must be the same size\n");
-      exit(-1);
-    }
-  if ((ref_image->width % 16) || (ref_image->height % 16))
-    {
-      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
-      exit(-1);
-    }
-
-  /* Compute parameters, allocate memory */
-  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
-  image_width_macroblocks = ref_image->width >> 4;
-  image_height_macroblocks = ref_image->height >> 4;
-  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;
-  
-  sads_computed = (unsigned short *)
-    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));
-
-  /* Run the kernel code */
-  {
-    struct cudaArray *ref_ary;  /* Reference image on the device */
-    short *d_cur_image;         /* Current image on the device */
-    unsigned short *d_sads;     /* SADs on the device */
-    dim3 macroblock_grid(image_width_macroblocks, image_height_macroblocks);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    cudaMalloc((void **)&d_cur_image, image_size_bytes);
-    CUDA_ERRCK
-    cudaMallocArray(&ref_ary, &get_ref().channelDesc,
-                    ref_image->width, ref_image->height);
-    CUDA_ERRCK
-
-    /* Transfer current image to device */
-    cudaMemcpy(d_cur_image, cur_image->data, image_size_bytes,
-               cudaMemcpyHostToDevice);
-    CUDA_ERRCK
-
-    /* Transfer reference image to device */
-    cudaMemcpy2DToArray(ref_ary,
-                        0, 0,
-                        ref_image->data,
-                        ref_image->width * sizeof(unsigned short),
-                        ref_image->width * sizeof(unsigned short),
-                        ref_image->height,
-                        cudaMemcpyHostToDevice);
-    CUDA_ERRCK
-    cudaBindTextureToArray(get_ref(), ref_ary);
-    CUDA_ERRCK
-
-    /* Allocate SAD data on the device */
-    cudaMalloc((void **)&d_sads, 41 * MAX_POS_PADDED * image_size_macroblocks *
-	       sizeof(unsigned short));
-    CUDA_ERRCK
-    cudaMemset(d_sads, 0, 41 * MAX_POS_PADDED * image_size_macroblocks *
-	       sizeof(unsigned short));
-    CUDA_ERRCK
-
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    /* Run the 4x4 kernel */
-    mb_sad_calc<<<dim3(CEIL(ref_image->width / 4, THREADS_W),
-		       CEIL(ref_image->height / 4, THREADS_H)),
-      dim3(CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H),
-      SAD_LOC_SIZE_BYTES>>>
-      (d_sads,
-       (unsigned short *)d_cur_image,
-       image_width_macroblocks,
-       image_height_macroblocks);
-    CUDA_ERRCK
-
-    /* Run the larger-blocks kernels */
-    larger_sad_calc_8<<<macroblock_grid, dim3(32, 4)>>>
-      (d_sads,
-       image_width_macroblocks,
-       image_height_macroblocks);
-    CUDA_ERRCK
-
-    larger_sad_calc_16<<<macroblock_grid, dim3(32, 1)>>>
-      (d_sads,
-       image_width_macroblocks,
-       image_height_macroblocks);
-    CUDA_ERRCK
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    /* Transfer SAD data to the host */
-    cudaMemcpy(sads_computed,// + 25 * MAX_POS_PADDED * image_size_macroblocks,
-	       d_sads,// + 25 * MAX_POS_PADDED * image_size_macroblocks,
-	       41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short)
-,
-           cudaMemcpyDeviceToHost);
-    CUDA_ERRCK
-
-    /* Free GPU memory */
-    cudaFree(d_sads);
-    CUDA_ERRCK
-    cudaUnbindTexture(get_ref());
-    CUDA_ERRCK
-    cudaFreeArray(ref_ary);
-    CUDA_ERRCK
-    cudaFree(d_cur_image);
-    CUDA_ERRCK
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  }
-
-  /* Print output */
-  if (params->outFile)
-    {
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      write_sads(params->outFile,
-		 image_width_macroblocks,
-		 image_height_macroblocks,
-		 sads_computed);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-#if 0  /* Debugging */
-  print_test_sads(sads_computed, image_size_macroblocks);
-  write_sads_directly("sad-debug.bin",
-		      ref_image->width / 16, ref_image->height / 16,
-		      sads_computed);
-#endif
-
-  /* Free memory */
-  free(sads_computed);
-  free_image(ref_image);
-  free_image(cur_image);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/sad.h b/hpvm/test/parboil/benchmarks/sad/src/cuda/sad.h
deleted file mode 100644
index 3374fa0441..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/sad.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Search offsets within 16 pixels of (0,0) */
-#define SEARCH_RANGE 16
-
-/* The total search area is 33 pixels square */
-#define SEARCH_DIMENSION (2*SEARCH_RANGE+1)
-
-/* The total number of search positions is 33^2 */
-#define MAX_POS 1089
-
-/* This is padded to a multiple of 8 when allocating memory */
-#define MAX_POS_PADDED 1096
-
-/* VBSME block indices in the SAD array for different 
- * block sizes.  The index is computed from the
- * image size in macroblocks.  Block sizes are (height, width):
- *  1: 16 by 16 pixels, one block per macroblock
- *  2: 8  by 16 pixels, 2  blocks per macroblock
- *  3: 16 by 8  pixels, 2  blocks per macroblock
- *  4: 8  by 8  pixels, 4  blocks per macroblock
- *  5: 4  by 8  pixels, 8  blocks per macroblock
- *  6: 8  by 4  pixels, 8  blocks per macroblock
- *  7: 4  by 4  pixels, 16 blocks per macroblock
- */
-#define SAD_TYPE_1_IX(image_size) 0
-#define SAD_TYPE_2_IX(image_size) ((image_size)*MAX_POS_PADDED)
-#define SAD_TYPE_3_IX(image_size) ((image_size)*(3*MAX_POS_PADDED))
-#define SAD_TYPE_4_IX(image_size) ((image_size)*(5*MAX_POS_PADDED))
-#define SAD_TYPE_5_IX(image_size) ((image_size)*(9*MAX_POS_PADDED))
-#define SAD_TYPE_6_IX(image_size) ((image_size)*(17*MAX_POS_PADDED))
-#define SAD_TYPE_7_IX(image_size) ((image_size)*(25*MAX_POS_PADDED))
-
-#define SAD_TYPE_IX(n, image_size) \
-  ((n == 1) ? SAD_TYPE_1_IX(image_size) : \
-   ((n == 2) ? SAD_TYPE_2_IX(image_size) : \
-    ((n == 3) ? SAD_TYPE_3_IX(image_size) : \
-     ((n == 4) ? SAD_TYPE_4_IX(image_size) : \
-      ((n == 5) ? SAD_TYPE_5_IX(image_size) : \
-       ((n == 6) ? SAD_TYPE_6_IX(image_size) : \
-        (SAD_TYPE_7_IX(image_size) \
-	 )))))))
-
-#define SAD_TYPE_1_CT 1
-#define SAD_TYPE_2_CT 2
-#define SAD_TYPE_3_CT 2
-#define SAD_TYPE_4_CT 4
-#define SAD_TYPE_5_CT 8
-#define SAD_TYPE_6_CT 8
-#define SAD_TYPE_7_CT 16
-
-#define SAD_TYPE_CT(n) \
-  ((n == 1) ? SAD_TYPE_1_CT : \
-   ((n == 2) ? SAD_TYPE_2_CT : \
-    ((n == 3) ? SAD_TYPE_3_CT : \
-     ((n == 4) ? SAD_TYPE_4_CT : \
-      ((n == 5) ? SAD_TYPE_5_CT : \
-       ((n == 6) ? SAD_TYPE_6_CT : \
-        (SAD_TYPE_7_CT \
-	 )))))))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void sad4_cpu(unsigned short *blk_sad,
-	      unsigned short *frame,
-	      unsigned short *ref,
-	      int mb_width,
-	      int mb_height);
-
-void larger_sads(unsigned short *sads,
-		 int mbs);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/sad4.cu b/hpvm/test/parboil/benchmarks/sad/src/cuda/sad4.cu
deleted file mode 100644
index a4d245609f..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/sad4.cu
+++ /dev/null
@@ -1,223 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include "sad.h"
-#include "sad4.h"
-
-/* Macros to access temporary frame storage in shared memory */
-#define FRAME_GET(n, x, y) \
-  (frame_loc[((n) << 4) + ((y) << 2) + (x)])
-#define FRAME_PUT_1(n, x, value) \
-  (frame_loc[((n) << 4) + (x)] = value)
-
-/* Macros to access temporary SAD storage in shared memory */
-#define SAD_LOC_GET(blocknum, pos) \
-  (sad_loc[(blocknum) * MAX_POS_PADDED + (pos)])
-#define SAD_LOC_PUT(blocknum, pos, value) \
-  (sad_loc[(blocknum) * MAX_POS_PADDED + (pos)] = (value))
-
-/* When reading from this array, we use an "index" rather than a
-   search position.  Also, the number of array elements is divided by
-   four relative to SAD_LOC_GET() since this is an array of 8byte
-   data, while SAD_LOC_GET() sees an array of 2byte data. */
-#define SAD_LOC_8B_GET(blocknum, ix) \
-  (sad_loc_8b[(blocknum) * (MAX_POS_PADDED/4) + (ix)])
-
-/* The size of one row of sad_loc_8b.  This is the group of elements
- * holding SADs for all search positions for one 4x4 block. */
-#define SAD_LOC_8B_ROW_SIZE (MAX_POS_PADDED/4)
-
-/* The presence of this preprocessor variable controls which
- * of two means of computing the current search position is used. */
-#define SEARCHPOS_RECURRENCE
-
-/* A local copy of the current 4x4 block */
-__shared__ unsigned short frame_loc[THREADS_W * THREADS_H * 16];
-
-/* The part of the reference image that is in the search range */
-texture<unsigned short, 2, cudaReadModeElementType> ref;
-
-/* The local SAD array on the device.  This is an array of short ints.  It is
- * interpreted as an array of 8-byte data for global data transfers. */
-extern __shared__ unsigned short sad_loc[];
-extern __shared__ vec8b sad_loc_8b[];
-
-/* The compute kernel. */
-/* The macros THREADS_W and THREADS_H specify the width and height of the
- * area to be processed by one thread, measured in 4-by-4 pixel blocks.
- * Larger numbers mean more computation per thread block.
- *
- * The macro POS_PER_THREAD specifies the number of search positions for which
- * an SAD is computed.  A larger value indicates more computation per thread,
- * and fewer threads per thread block.  It must be a multiple of 3 and also
- * must be at most 33 because the loop to copy from shared memory uses
- * 32 threads per 4-by-4 pixel block.
- *
- */
-__global__ void mb_sad_calc(unsigned short *blk_sad,
-			    unsigned short *frame,
-			    int mb_width,
-			    int mb_height)
-{
-  int txy_tmp = threadIdx.x / CEIL(MAX_POS, POS_PER_THREAD);
-  int ty = txy_tmp / THREADS_W;
-  int tx = txy_tmp - __umul24(ty, THREADS_W);
-  int bx = blockIdx.x;
-  int by = blockIdx.y;
-
-  /* Macroblock and sub-block coordinates */
-  int mb_x = (tx + __umul24(bx, THREADS_W)) >> 2;
-  int mb_y = (ty + __umul24(by, THREADS_H)) >> 2;
-  int block_x = (tx + __umul24(bx, THREADS_W)) & 0x03;
-  int block_y = (ty + __umul24(by, THREADS_H)) & 0x03;
-
-  /* Block-copy data into shared memory.
-   * Threads are grouped into sets of 16, leaving some threads idle. */
-  if ((threadIdx.x >> 4) < (THREADS_W * THREADS_H))
-  {
-    int ty = (threadIdx.x >> 4) / THREADS_W;
-    int tx = (threadIdx.x >> 4) - __umul24(ty, THREADS_W);
-    int tgroup = threadIdx.x & 15;
-
-    /* Width of the image in pixels */
-    int img_width = mb_width*16;
-
-    /* Pixel offset of the origin of the current 4x4 block */
-    int frame_x = (tx + __umul24(bx, THREADS_W)) << 2;
-    int frame_y = (ty + __umul24(by, THREADS_H)) << 2;
-
-    /* Origin in the current frame for this 4x4 block */
-    int cur_o = frame_y * img_width + frame_x;
-
-    /* If this is an invalid 4x4 block, do nothing */
-    if (((frame_x >> 4) < mb_width) && ((frame_y >> 4) < mb_height))
-      {
-	/* Copy one pixel into 'frame' */
-	FRAME_PUT_1(__umul24(ty, THREADS_W) + tx, tgroup,
-		    frame[cur_o + (tgroup >> 2) * img_width + (tgroup & 3)]);
-      }
-  }
-
-  __syncthreads();
-
-  /* If this thread is assigned to an invalid 4x4 block, do nothing */
-  if ((mb_x < mb_width) && (mb_y < mb_height))
-    {
-      /* Pixel offset of the origin of the current 4x4 block */
-      int frame_x = ((mb_x << 2) + block_x) << 2;
-      int frame_y = ((mb_y << 2) + block_y) << 2;
-
-      /* Origin of the search area for this 4x4 block */
-      int ref_x = frame_x - SEARCH_RANGE;
-      int ref_y = frame_y - SEARCH_RANGE;
-
-      /* Origin in the current frame for this 4x4 block */
-      int cur_o = ty * THREADS_W + tx;
-
-      int search_pos;
-      int search_pos_base =
-	(threadIdx.x % CEIL(MAX_POS, POS_PER_THREAD)) * POS_PER_THREAD;
-      int search_pos_end = search_pos_base + POS_PER_THREAD;
-
-      int sotmp = search_pos_base / SEARCH_DIMENSION;
-      int local_search_off_x = search_pos_base - TIMES_DIM_POS(sotmp);
-      int search_off_y = ref_y + sotmp;
-
-      /* Don't go past bounds */
-      if (search_pos_end > MAX_POS)
-	search_pos_end = MAX_POS;
-
-      /* For each search position, within the range allocated to this thread */
-      for (search_pos = search_pos_base;
-	   search_pos < search_pos_end;
-	   search_pos += 3) {
-	/* It is also beneficial to fuse (jam) the enclosed loops if this loop
-	 * is unrolled. */
-	unsigned short sad1 = 0, sad2 = 0, sad3 = 0;
-	int search_off_x = ref_x + local_search_off_x;
-
-	/* 4x4 SAD computation */
-	for(int y=0; y<4; y++) {
-	  int t;
-	  t = tex2D(ref, search_off_x, search_off_y + y);
-	  sad1 += abs(t - FRAME_GET(cur_o, 0, y));
-
-	  t = tex2D(ref, search_off_x + 1, search_off_y + y);
-	  sad1 += abs(t - FRAME_GET(cur_o, 1, y));
-	  sad2 += abs(t - FRAME_GET(cur_o, 0, y));
-
-	  t = tex2D(ref, search_off_x + 2, search_off_y + y);
-	  sad1 += abs(t - FRAME_GET(cur_o, 2, y));
-	  sad2 += abs(t - FRAME_GET(cur_o, 1, y));
-	  sad3 += abs(t - FRAME_GET(cur_o, 0, y));
-
-	  t = tex2D(ref, search_off_x + 3, search_off_y + y);
-	  sad1 += abs(t - FRAME_GET(cur_o, 3, y));
-	  sad2 += abs(t - FRAME_GET(cur_o, 2, y));
-	  sad3 += abs(t - FRAME_GET(cur_o, 1, y));
-
-	  t = tex2D(ref, search_off_x + 4, search_off_y + y);
-	  sad2 += abs(t - FRAME_GET(cur_o, 3, y));
-	  sad3 += abs(t - FRAME_GET(cur_o, 2, y));
-
-	  t = tex2D(ref, search_off_x + 5, search_off_y + y);
-	  sad3 += abs(t - FRAME_GET(cur_o, 3, y));
-	}
-
-	/* Save this value into the local SAD array */
-	SAD_LOC_PUT(__umul24(ty, THREADS_W) + tx, search_pos, sad1);
-	SAD_LOC_PUT(__umul24(ty, THREADS_W) + tx, search_pos+1, sad2);
-	SAD_LOC_PUT(__umul24(ty, THREADS_W) + tx, search_pos+2, sad3);
-
-	local_search_off_x += 3;
-	if (local_search_off_x >= SEARCH_DIMENSION)
-	  {
-	    local_search_off_x -= SEARCH_DIMENSION;
-	    search_off_y++;
-	  }
-      }
-    }
-
-  __syncthreads();
-
-  /* Block-copy data into global memory.
-   * Threads are grouped into sets of 32, leaving some threads idle. */
-  if ((threadIdx.x >> 5) < (THREADS_W * THREADS_H))
-  {
-    int tgroup = threadIdx.x & 31;
-    int ty = (threadIdx.x >> 5) / THREADS_W;
-    int tx = (threadIdx.x >> 5) - __umul24(ty, THREADS_W);
-    int index;
-
-    /* Macroblock and sub-block coordinates */
-    int mb_x = (tx + __umul24(bx, THREADS_W)) >> 2;
-    int mb_y = (ty + __umul24(by, THREADS_H)) >> 2;
-    int block_x = (tx + __umul24(bx, THREADS_W)) & 0x03;
-    int block_y = (ty + __umul24(by, THREADS_H)) & 0x03;
-
-    if ((mb_x < mb_width) && (mb_y < mb_height))
-      {
-	/* All SADs from this thread are stored in a contiguous chunk
-	 * of memory starting at this offset */
-	blk_sad += (__umul24(__umul24(mb_width, mb_height), 25) +
-		    (__umul24(mb_y, mb_width) + mb_x) * 16 +
-		    (4 * block_y + block_x)) *
-	  MAX_POS_PADDED;
-
-	/* Block copy, 32 threads at a time */
-	for (index = tgroup; index < SAD_LOC_8B_ROW_SIZE; index += 32)
-	  ((vec8b *)blk_sad)[index] 
-	    = SAD_LOC_8B_GET(__umul24(ty, THREADS_W) + tx, index);
-      }
-  }
-}
-
-texture<unsigned short, 2, cudaReadModeElementType> &get_ref(void)
-{
-  return ref;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda/sad4.h b/hpvm/test/parboil/benchmarks/sad/src/cuda/sad4.h
deleted file mode 100644
index a2f5c18327..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda/sad4.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Integer ceiling division.  This computes ceil(x / y) */
-#define CEIL(x,y) (((x) + ((y) - 1)) / (y))
-
-/* Fast multiplication by 33 */
-#define TIMES_DIM_POS(x) (((x) << 5) + (x))
-
-/* Amount of dynamically allocated local storage
- * measured in bytes, 2-byte words, and 8-byte words */
-#define SAD_LOC_SIZE_ELEMS (THREADS_W * THREADS_H * MAX_POS_PADDED)
-#define SAD_LOC_SIZE_BYTES (SAD_LOC_SIZE_ELEMS * sizeof(unsigned short))
-#define SAD_LOC_SIZE_8B (SAD_LOC_SIZE_BYTES / sizeof(vec8b))
-
-/* The search position index space is distributed across threads
- * and across time. */
-/* This many search positions are calculated by each thread.
- * Note: the optimized kernel requires that this number is
- * divisible by 3. */
-#define POS_PER_THREAD 18
-
-/* The width and height (in number of 4x4 blocks) of a tile from the
- * current frame that is computed in a single thread block. */
-#define THREADS_W 1
-#define THREADS_H 1
-
-// #define TIMES_THREADS_W(x) (((x) << 1) + (x))
-#define TIMES_THREADS_W(x) ((x) * THREADS_W)
-
-/* This structure is used for vector load/store operations. */
-struct vec8b {
-  int fst;
-  int snd;
-} __align__(8);
-
-typedef struct vec8b vec8b;
-
-/* 4-by-4 SAD computation on the device. */
-__global__ void mb_sad_calc(unsigned short*,
-			    unsigned short*,
-			    int, int);
-
-/* A function to get a reference to the "ref" texture, because sharing
- * of textures between files isn't really supported. */
-texture<unsigned short, 2, cudaReadModeElementType> &get_ref(void);
-
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/Makefile b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/Makefile
deleted file mode 100644
index 230deb28ec..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=cuda
-SRCDIR_OBJS=file.o image.o sad4.o largerBlocks.o main.o
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/file.c b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/file.c
deleted file mode 100644
index 5187c7f7cc..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/file.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include "file.h"
-
-unsigned short
-read16u(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-short
-read16i(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-void
-write32u(FILE *f, unsigned int i)
-{
-  putc(i, f);
-  putc(i >> 8, f);
-  putc(i >> 16, f);
-  putc(i >> 24, f);
-}
-
-void
-write16u(FILE *f, unsigned short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
-
-void
-write16i(FILE *f, short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/file.h b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/file.h
deleted file mode 100644
index 5d783e9134..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-unsigned short read16u(FILE *f);
-short read16i(FILE *f);
-
-void write32u(FILE *f, unsigned int i);
-void write16u(FILE *f, unsigned short h);
-void write16i(FILE *f, short h);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/image.c b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/image.c
deleted file mode 100644
index d7ed0fcce3..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/image.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "file.h"
-#include "image.h"
-
-struct image_i16 *
-load_image(char *filename)
-{
-  FILE *infile;
-  short *data;
-  int w;
-  int h;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read image contents */
-  data = (short *)malloc(w * h * sizeof(short));
-  fread(data, sizeof(short), w * h, infile);
-
-  fclose(infile);
-
-  /* Create the return data structure */
-  {
-    struct image_i16 *ret =
-      (struct image_i16 *)malloc(sizeof(struct image_i16));
-    ret->width = w;
-    ret->height = h;
-    ret->data = data;
-    return ret;
-  }
-}
-
-void
-free_image(struct image_i16 *img)
-{
-  free(img->data);
-  free(img);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/image.h b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/image.h
deleted file mode 100644
index 27fc3e0b35..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/image.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-struct image_i16
-{
-  int width;
-  int height;
-  short *data;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct image_i16 * load_image(char *filename);
-void free_image(struct image_i16 *);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/largerBlocks.cu b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/largerBlocks.cu
deleted file mode 100644
index 940e107580..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/largerBlocks.cu
+++ /dev/null
@@ -1,139 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include "sad.h"
-#include "largerBlocks.h"
-#include <stdio.h>
-
-
-typedef struct {
-  unsigned short x;
-  unsigned short y;
-} __align__(4) uhvec;
-
-typedef unsigned int uint;
-
-__global__ void larger_sad_calc_8(unsigned short *blk_sad,
-				  int mb_width,
-				  int mb_height)
-{
-  int tx = threadIdx.y & 1;
-  int ty = threadIdx.y >> 1;
-
-  /* Macroblock and sub-block coordinates */
-  int mb_x = blockIdx.x;
-  int mb_y = blockIdx.y;
-
-  /* Number of macroblocks in a frame */
-  int macroblocks = __mul24(mb_width, mb_height);
-  int macroblock_index = (__mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  int search_pos;
-
-  unsigned short *bi;
-  unsigned short *bo_6, *bo_5, *bo_4;
-
-  bi = blk_sad
-    + (__mul24(macroblocks, 25) + (ty * 8 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 16;
-
-  // Block type 6: 4x8
-  bo_6 = blk_sad
-    + ((macroblocks << 4) + macroblocks + (ty * 4 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 8;
-
-  if (ty < 100) // always true, but improves register allocation
-    {
-      // Block type 5: 8x4
-      bo_5 = blk_sad
-	+ ((macroblocks << 3) + macroblocks + (ty * 4 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 8;
-
-      // Block type 4: 8x8
-      bo_4 = blk_sad
-	+ ((macroblocks << 2) + macroblocks + (ty * 2 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 4;
-    }
-
-  for (search_pos = threadIdx.x; search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-      /* Each uint is actually two 2-byte integers packed together.
-       * Only addition is used and there is no chance of integer overflow
-       * so this can be done to reduce computation time. */
-      uint i00 = ((uint *)bi)[search_pos];
-      uint i01 = ((uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((uint *)bi)[search_pos + 4*MAX_POS_PADDED/2];
-      uint i11 = ((uint *)bi)[search_pos + 5*MAX_POS_PADDED/2];
-
-      ((uint *)bo_6)[search_pos]                  = i00 + i10;
-      ((uint *)bo_6)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((uint *)bo_5)[search_pos]                  = i00 + i01;
-      ((uint *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = i10 + i11;
-      ((uint *)bo_4)[search_pos]                  = (i00 + i01) + (i10 + i11);
-    }
-}
-
-__global__ void larger_sad_calc_16(unsigned short *blk_sad,
-				   int mb_width,
-				   int mb_height)
-{
-  /* Macroblock coordinates */
-  int mb_x = blockIdx.x;
-  int mb_y = blockIdx.y;
-
-  /* Number of macroblocks in a frame */
-  int macroblocks = __mul24(mb_width, mb_height) * MAX_POS_PADDED;
-  int macroblock_index = (__mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  int search_pos;
-
-  unsigned short *bi;
-  unsigned short *bo_3, *bo_2, *bo_1;
-
-  //bi = blk_sad + macroblocks * 5 + macroblock_index * 4;
-  bi = blk_sad + ((macroblocks + macroblock_index) << 2) + macroblocks;
-
-  // Block type 3: 8x16
-  //bo_3 = blk_sad + macroblocks * 3 + macroblock_index * 2;
-  bo_3 = blk_sad + ((macroblocks + macroblock_index) << 1) + macroblocks;
-
-  // Block type 5: 8x4
-  bo_2 = blk_sad + macroblocks + macroblock_index * 2;
-
-  // Block type 4: 8x8
-  bo_1 = blk_sad + macroblock_index;
-
-  for (search_pos = threadIdx.x; search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-      /* Each uint is actually two 2-byte integers packed together.
-       * Only addition is used and there is no chance of integer overflow
-       * so this can be done to reduce computation time. */
-      uint i00 = ((uint *)bi)[search_pos];
-      uint i01 = ((uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((uint *)bi)[search_pos + 2*MAX_POS_PADDED/2];
-      uint i11 = ((uint *)bi)[search_pos + 3*MAX_POS_PADDED/2];
-      
-      ((uint *)bo_3)[search_pos]                  = i00 + i10;
-      ((uint *)bo_3)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((uint *)bo_2)[search_pos]                  = i00 + i01;
-      ((uint *)bo_2)[search_pos+MAX_POS_PADDED/2] = i10 + i11;
-      ((uint *)bo_1)[search_pos]                  = (i00 + i01) + (i10 + i11);
- /*
-      ushort2 s00 = { bi[search_pos*2], bi[search_pos*2+1] };
-      ushort2 s01 = { bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1] };
-      ushort2 s10 = { bi[(search_pos + 2*MAX_POS_PADDED/2)*2], bi[(search_pos + 2*MAX_POS_PADDED/2)*2+1] };
-      ushort2 s11 = { bi[(search_pos + 3*MAX_POS_PADDED/2)*2], bi[(search_pos + 3*MAX_POS_PADDED/2)*2+1] };
-
-      ((ushort2 *)bo_3)[search_pos]                  = make_ushort2(s00.x + s10.x, s00.y + s10.y);
-      ((ushort2 *)bo_3)[search_pos+MAX_POS_PADDED/2] = make_ushort2(s01.x + s11.x, s01.y + s11.y);
-      ((ushort2 *)bo_2)[search_pos]                  = make_ushort2(s00.x + s01.x, s00.y + s01.y);
-      ((ushort2 *)bo_2)[search_pos+MAX_POS_PADDED/2] = make_ushort2(s10.x + s11.x, s10.y + s11.y);
-      ((ushort2 *)bo_1)[search_pos]                  = make_ushort2((s00.x + s01.x)+(s10.x + s11.x), (s00.y + s01.y)+(s10.y + s11.y));
-      */
-    }
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/largerBlocks.h b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/largerBlocks.h
deleted file mode 100644
index 13006271a8..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/largerBlocks.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-__global__ void larger_sad_calc_8(unsigned short*, int, int);
-__global__ void larger_sad_calc_16(unsigned short*, int, int);
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/main.cu b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/main.cu
deleted file mode 100644
index 57e54e47e7..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/main.cu
+++ /dev/null
@@ -1,406 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include <inttypes.h>
-#include <parboil.h>
-#include <cuda.h>
-
-#include "sad.h"
-#include "sad4.h"
-#include "largerBlocks.h"
-#include "file.h"
-#include "image.h"
-
-#define CUDA_ERRCK \
-  {cudaError_t err = cudaGetLastError(); \
-    if (err) fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); \
-  }
-
-static unsigned short *
-load_sads(char *filename);
-static void
-write_sads(char *filename,
-	   int image_width_macroblocks,
-	   int image_height_macroblocks,
-	   unsigned short *sads);
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads);
-
-/* FILE I/O */
-
-unsigned short *
-load_sads(char *filename)
-{
-  FILE *infile;
-  unsigned short *sads;
-  int w;
-  int h;
-  int sads_per_block;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions (measured in macroblocks) */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read SAD values.  Only interested in the 4x4 SAD values, which are
-   * at the end of the file. */
-  sads_per_block = MAX_POS_PADDED * (w * h);
-  fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR);
-
-  sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short));
-  fread(sads, sizeof(unsigned short), sads_per_block * 16, infile);
-  fclose(infile);
-
-  return sads;
-}
-
-/* Compare the reference SADs to the expected SADs.
- */
-void
-check_sads(unsigned short *sads_reference,
-	   unsigned short *sads_computed,
-	   int image_size_macroblocks)
-{
-  int block;
-
-  /* Check the 4x4 SAD values.  These are in sads_reference.
-   * Ignore the data at the beginning of sads_computed. */
-  sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks;
-
-  for (block = 0; block < image_size_macroblocks; block++)
-    {
-      int subblock;
-
-      for (subblock = 0; subblock < 16; subblock++)
-	{
-	  int sad_index;
-
-	  for (sad_index = 0; sad_index < MAX_POS; sad_index++)
-	    {
-	      int index =
-		(block * 16 + subblock) * MAX_POS_PADDED + sad_index;
-
-	      if (sads_reference[index] != sads_computed[index])
-		{
-#if 0
-		  /* Print exactly where the mismatch was seen */
-		  printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]);
-#else
-		  goto mismatch;
-#endif
-		}
-	    }
-	}
-    }
-
-  printf("Success.\n");
-  return;
-
- mismatch:
-  printf("Computed SADs do not match expected values.\n");
-}
-
-/* Extract the SAD data for a particular block type for a particular
- * macroblock from the array of SADs of that block type. */
-static inline void
-write_subblocks(FILE *outfile, unsigned short *subblock_array, int macroblock,
-		int count)
-{
-  int block;
-  int pos;
-
-  for (block = 0; block < count; block++)
-    {
-      unsigned short *vec = subblock_array +
-	(block + macroblock * count) * MAX_POS_PADDED;
-
-      /* Write all SADs for this sub-block */
-      for (pos = 0; pos < MAX_POS; pos++)
-	write16u(outfile, *vec++);
-    }
-}
-
-/* Write some SAD data to a file for output checking.
- *
- * All SAD values for six rows of macroblocks are written.
- * The six rows consist of the top two, middle two, and bottom two image rows.
- */
-void
-write_sads(char *filename,
-	   int mb_width,
-	   int mb_height,
-	   unsigned short *sads)
-{
-  FILE *outfile = fopen(filename, "w");
-  int mbs = mb_width * mb_height;
-  int row_indir;
-  int row_indices[6] = {0, 1,
-			mb_height / 2 - 1, mb_height / 2,
-			mb_height - 2, mb_height - 1};
-
-  if (outfile == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the number of output macroblocks */
-  write32u(outfile, mb_width * 6);
-
-  /* Write zeros */
-  write32u(outfile, 0);
-
-  /* Each row */
-  for (row_indir = 0; row_indir < 6; row_indir++)
-    {
-      int row = row_indices[row_indir];
-
-      /* Each block in row */
-      int block;
-      for (block = mb_width * row; block < mb_width * (row + 1); block++)
-	{
-	  int blocktype;
-
-	  /* Write SADs for all sub-block types */
-	  for (blocktype = 1; blocktype <= 7; blocktype++)
-	    write_subblocks(outfile,
-			    sads + SAD_TYPE_IX(blocktype, mbs),
-			    block,
-			    SAD_TYPE_CT(blocktype));
-	}
-    }
-
-  fclose(outfile);
-}
-
-/* FILE I/O for debugging */
-
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads)
-{
-  FILE *f = fopen(filename, "w");
-  int n;
-
-  write16u(f, width);
-  write16u(f, height);
-  for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) {
-    write16u(f, sads[n]);
-  }
-  fclose(f);
-}
-
-static void
-print_test_sad_vector(unsigned short *base, int macroblock, int count)
-{
-  int n;
-  int searchpos = 17*33+17;
-  for (n = 0; n < count; n++)
-    printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]);
-}
-
-static void
-print_test_sads(unsigned short *sads_computed,
-		int mbs)
-{
-  int macroblock = 5;
-  int blocktype;
-
-  for (blocktype = 1; blocktype <= 7; blocktype++)
-    {
-      printf("%d:", blocktype);
-      print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs),
-			    macroblock, SAD_TYPE_CT(blocktype));
-      puts("\n");
-    }
-}
-
-/* MAIN */
-
-int
-main(int argc, char **argv)
-{
-  struct image_i16 *ref_image;
-  struct image_i16 *cur_image;
-  unsigned short *sads_computed; /* SADs generated by the program */
-
-  int image_size_bytes;
-  int image_width_macroblocks, image_height_macroblocks;
-  int image_size_macroblocks;
-
-  struct pb_TimerSet timers;
-  struct pb_Parameters *params;
-
-  pb_InitializeTimerSet(&timers);
-  params = pb_ReadParameters(&argc, argv);
-
-  if (pb_Parameters_CountInputs(params) != 2)
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
-
-  /* Read input files */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  ref_image = load_image(params->inpFiles[0]);
-  cur_image = load_image(params->inpFiles[1]);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if ((ref_image->width != cur_image->width) ||
-      (ref_image->height != cur_image->height))
-    {
-      fprintf(stderr, "Input images must be the same size\n");
-      exit(-1);
-    }
-  if ((ref_image->width % 16) || (ref_image->height % 16))
-    {
-      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
-      exit(-1);
-    }
-
-  /* Compute parameters, allocate memory */
-  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
-  image_width_macroblocks = ref_image->width >> 4;
-  image_height_macroblocks = ref_image->height >> 4;
-  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;
-  
-  sads_computed = (unsigned short *)
-    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));
-
-  /* Run the kernel code */
-  {
-    struct cudaArray *ref_ary;  /* Reference image on the device */
-    short *d_cur_image;         /* Current image on the device */
-    unsigned short *d_sads;     /* SADs on the device */
-    dim3 macroblock_grid(image_width_macroblocks, image_height_macroblocks);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    cudaMalloc((void **)&d_cur_image, image_size_bytes);
-    CUDA_ERRCK
-    cudaMallocArray(&ref_ary, &get_ref().channelDesc,
-                    ref_image->width, ref_image->height);
-    CUDA_ERRCK
-
-    /* Transfer current image to device */
-    cudaMemcpy(d_cur_image, cur_image->data, image_size_bytes,
-               cudaMemcpyHostToDevice);
-    CUDA_ERRCK
-
-    /* Transfer reference image to device */
-    cudaMemcpy2DToArray(ref_ary,
-                        0, 0,
-                        ref_image->data,
-                        ref_image->width * sizeof(unsigned short),
-                        ref_image->width * sizeof(unsigned short),
-                        ref_image->height,
-                        cudaMemcpyHostToDevice);
-    CUDA_ERRCK
-    cudaBindTextureToArray(get_ref(), ref_ary);
-    CUDA_ERRCK
-
-    /* Allocate SAD data on the device */
-    cudaMalloc((void **)&d_sads, 41 * MAX_POS_PADDED * image_size_macroblocks *
-	       sizeof(unsigned short));
-    CUDA_ERRCK
-    cudaMemset(d_sads, 0, 41 * MAX_POS_PADDED * image_size_macroblocks *
-	       sizeof(unsigned short));
-    CUDA_ERRCK
-
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    // Run the 4x4 kernel
-    mb_sad_calc<<<dim3(CEIL(ref_image->width / 4, THREADS_W),
-		       CEIL(ref_image->height / 4, THREADS_H)),
-      dim3(CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H),
-      SAD_LOC_SIZE_BYTES>>>
-      (d_sads,
-       (unsigned short *)d_cur_image,
-       image_width_macroblocks,
-       image_height_macroblocks);
-    CUDA_ERRCK
-
-    // Run the larger-blocks kernels
-    larger_sad_calc_8<<<macroblock_grid, dim3(32, 4)>>>
-      (d_sads,
-       image_width_macroblocks,
-       image_height_macroblocks);
-    CUDA_ERRCK
-    
-    larger_sad_calc_16<<<macroblock_grid, dim3(32, 1)>>>
-      (d_sads,
-       image_width_macroblocks,
-       image_height_macroblocks);
-    CUDA_ERRCK
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    /* Transfer SAD data to the host */
-    cudaMemcpy(sads_computed,// + 25 * MAX_POS_PADDED * image_size_macroblocks,
-	       d_sads,// + 25 * MAX_POS_PADDED * image_size_macroblocks,
-	       41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short)
-,
-           cudaMemcpyDeviceToHost);
-    CUDA_ERRCK
-
-    /* Free GPU memory */
-    cudaFree(d_sads);
-    CUDA_ERRCK
-    cudaUnbindTexture(get_ref());
-    CUDA_ERRCK
-    cudaFreeArray(ref_ary);
-    CUDA_ERRCK
-    cudaFree(d_cur_image);
-    CUDA_ERRCK
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  }
-
-  /* Print output */
-  if (params->outFile)
-    {
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      write_sads(params->outFile,
-		 image_width_macroblocks,
-		 image_height_macroblocks,
-		 sads_computed);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-#if 0  /* Debugging */
-  print_test_sads(sads_computed, image_size_macroblocks);
-  write_sads_directly("sad-debug.bin",
-		      ref_image->width / 16, ref_image->height / 16,
-		      sads_computed);
-#endif
-
-  /* Free memory */
-  free(sads_computed);
-  free_image(ref_image);
-  free_image(cur_image);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad.h b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad.h
deleted file mode 100644
index 3374fa0441..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Search offsets within 16 pixels of (0,0) */
-#define SEARCH_RANGE 16
-
-/* The total search area is 33 pixels square */
-#define SEARCH_DIMENSION (2*SEARCH_RANGE+1)
-
-/* The total number of search positions is 33^2 */
-#define MAX_POS 1089
-
-/* This is padded to a multiple of 8 when allocating memory */
-#define MAX_POS_PADDED 1096
-
-/* VBSME block indices in the SAD array for different 
- * block sizes.  The index is computed from the
- * image size in macroblocks.  Block sizes are (height, width):
- *  1: 16 by 16 pixels, one block per macroblock
- *  2: 8  by 16 pixels, 2  blocks per macroblock
- *  3: 16 by 8  pixels, 2  blocks per macroblock
- *  4: 8  by 8  pixels, 4  blocks per macroblock
- *  5: 4  by 8  pixels, 8  blocks per macroblock
- *  6: 8  by 4  pixels, 8  blocks per macroblock
- *  7: 4  by 4  pixels, 16 blocks per macroblock
- */
-#define SAD_TYPE_1_IX(image_size) 0
-#define SAD_TYPE_2_IX(image_size) ((image_size)*MAX_POS_PADDED)
-#define SAD_TYPE_3_IX(image_size) ((image_size)*(3*MAX_POS_PADDED))
-#define SAD_TYPE_4_IX(image_size) ((image_size)*(5*MAX_POS_PADDED))
-#define SAD_TYPE_5_IX(image_size) ((image_size)*(9*MAX_POS_PADDED))
-#define SAD_TYPE_6_IX(image_size) ((image_size)*(17*MAX_POS_PADDED))
-#define SAD_TYPE_7_IX(image_size) ((image_size)*(25*MAX_POS_PADDED))
-
-#define SAD_TYPE_IX(n, image_size) \
-  ((n == 1) ? SAD_TYPE_1_IX(image_size) : \
-   ((n == 2) ? SAD_TYPE_2_IX(image_size) : \
-    ((n == 3) ? SAD_TYPE_3_IX(image_size) : \
-     ((n == 4) ? SAD_TYPE_4_IX(image_size) : \
-      ((n == 5) ? SAD_TYPE_5_IX(image_size) : \
-       ((n == 6) ? SAD_TYPE_6_IX(image_size) : \
-        (SAD_TYPE_7_IX(image_size) \
-	 )))))))
-
-#define SAD_TYPE_1_CT 1
-#define SAD_TYPE_2_CT 2
-#define SAD_TYPE_3_CT 2
-#define SAD_TYPE_4_CT 4
-#define SAD_TYPE_5_CT 8
-#define SAD_TYPE_6_CT 8
-#define SAD_TYPE_7_CT 16
-
-#define SAD_TYPE_CT(n) \
-  ((n == 1) ? SAD_TYPE_1_CT : \
-   ((n == 2) ? SAD_TYPE_2_CT : \
-    ((n == 3) ? SAD_TYPE_3_CT : \
-     ((n == 4) ? SAD_TYPE_4_CT : \
-      ((n == 5) ? SAD_TYPE_5_CT : \
-       ((n == 6) ? SAD_TYPE_6_CT : \
-        (SAD_TYPE_7_CT \
-	 )))))))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void sad4_cpu(unsigned short *blk_sad,
-	      unsigned short *frame,
-	      unsigned short *ref,
-	      int mb_width,
-	      int mb_height);
-
-void larger_sads(unsigned short *sads,
-		 int mbs);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad4.cu b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad4.cu
deleted file mode 100644
index 37cc0ef0b5..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad4.cu
+++ /dev/null
@@ -1,99 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include "sad.h"
-#include "sad4.h"
-
-/* The part of the reference image that is in the search range */
-texture<unsigned short, 2, cudaReadModeElementType> ref;
-
-/* The compute kernel. */
-/* The macros THREADS_W and THREADS_H specify the width and height of the
- * area to be processed by one thread, measured in 4-by-4 pixel blocks.
- * Larger numbers mean more computation per thread block.
- *
- * The macro POS_PER_THREAD specifies the number of search positions for which
- * an SAD is computed.  A larger value indicates more computation per thread,
- * and fewer threads per thread block.  It must be a multiple of 3 and also
- * must be at most 33 because the loop to copy from shared memory uses
- * 32 threads per 4-by-4 pixel block.
- *
- */
-__global__ void mb_sad_calc(unsigned short *blk_sad,
-                            unsigned short *frame,
-                            int mb_width,
-                            int mb_height)
-{
-  int tx = (threadIdx.x / CEIL(MAX_POS, POS_PER_THREAD)) % THREADS_W;
-  int ty = (threadIdx.x / CEIL(MAX_POS, POS_PER_THREAD)) / THREADS_W;
-  int bx = blockIdx.x;
-  int by = blockIdx.y;
-  int img_width = mb_width*16;
-
-  /* Macroblock and sub-block coordinates */
-  int mb_x = (tx + bx * THREADS_W) >> 2;
-  int mb_y = (ty + by * THREADS_H) >> 2;
-  int block_x = (tx + bx * THREADS_W) & 0x03;
-  int block_y = (ty + by * THREADS_H) & 0x03;
-
-  /* If this thread is assigned to an invalid 4x4 block, do nothing */
-  if ((mb_x < mb_width) && (mb_y < mb_height))
-    {
-      /* Pixel offset of the origin of the current 4x4 block */
-      int frame_x = ((mb_x << 2) + block_x) << 2;
-      int frame_y = ((mb_y << 2) + block_y) << 2;
-
-      /* Origin of the search area for this 4x4 block */
-      int ref_x = frame_x - SEARCH_RANGE;
-      int ref_y = frame_y - SEARCH_RANGE;
-
-      /* Origin in the current frame for this 4x4 block */
-      int cur_o = frame_y * img_width + frame_x;
-
-      int search_pos;
-      int search_pos_base =
-        (threadIdx.x % CEIL(MAX_POS, POS_PER_THREAD)) * POS_PER_THREAD;
-      int search_pos_end = search_pos_base + POS_PER_THREAD;
-
-      /* All SADs from this thread are stored in a contiguous chunk
-       * of memory starting at this offset */
-      blk_sad += mb_width * mb_height * MAX_POS_PADDED * (9 + 16) +
-        (mb_y * mb_width + mb_x) * MAX_POS_PADDED * 16 +
-        (4 * block_y + block_x) * MAX_POS_PADDED;
-
-      /* Don't go past bounds */
-      if (search_pos_end > MAX_POS)
-        search_pos_end = MAX_POS;
-
-      /* For each search position, within the range allocated to this thread */
-      for (search_pos = search_pos_base;
-           search_pos < search_pos_end;
-           search_pos++) {
-        unsigned short sad4x4 = 0;
-        int search_off_x = ref_x + (search_pos % SEARCH_DIMENSION);
-        int search_off_y = ref_y + (search_pos / SEARCH_DIMENSION);
-
-        /* 4x4 SAD computation */
-        for(int y=0; y<4; y++) {
-          for (int x=0; x<4; x++) {
-            sad4x4 +=
-              abs(tex2D(ref, search_off_x + x, search_off_y + y) -
-                  frame[cur_o + y * img_width + x]);
-          }
-        }
-
-        /* Save this value into the local SAD array */
-        blk_sad[search_pos] = sad4x4;
-      }
-    }
-}
-
-texture<unsigned short, 2, cudaReadModeElementType> &get_ref(void)
-{
-  return ref;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad4.h b/hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad4.h
deleted file mode 100644
index a2f5c18327..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/cuda_base/sad4.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Integer ceiling division.  This computes ceil(x / y) */
-#define CEIL(x,y) (((x) + ((y) - 1)) / (y))
-
-/* Fast multiplication by 33 */
-#define TIMES_DIM_POS(x) (((x) << 5) + (x))
-
-/* Amount of dynamically allocated local storage
- * measured in bytes, 2-byte words, and 8-byte words */
-#define SAD_LOC_SIZE_ELEMS (THREADS_W * THREADS_H * MAX_POS_PADDED)
-#define SAD_LOC_SIZE_BYTES (SAD_LOC_SIZE_ELEMS * sizeof(unsigned short))
-#define SAD_LOC_SIZE_8B (SAD_LOC_SIZE_BYTES / sizeof(vec8b))
-
-/* The search position index space is distributed across threads
- * and across time. */
-/* This many search positions are calculated by each thread.
- * Note: the optimized kernel requires that this number is
- * divisible by 3. */
-#define POS_PER_THREAD 18
-
-/* The width and height (in number of 4x4 blocks) of a tile from the
- * current frame that is computed in a single thread block. */
-#define THREADS_W 1
-#define THREADS_H 1
-
-// #define TIMES_THREADS_W(x) (((x) << 1) + (x))
-#define TIMES_THREADS_W(x) ((x) * THREADS_W)
-
-/* This structure is used for vector load/store operations. */
-struct vec8b {
-  int fst;
-  int snd;
-} __align__(8);
-
-typedef struct vec8b vec8b;
-
-/* 4-by-4 SAD computation on the device. */
-__global__ void mb_sad_calc(unsigned short*,
-			    unsigned short*,
-			    int, int);
-
-/* A function to get a reference to the "ref" texture, because sharing
- * of textures between files isn't really supported. */
-texture<unsigned short, 2, cudaReadModeElementType> &get_ref(void);
-
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/Makefile b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/Makefile
deleted file mode 100644
index fbd81c4965..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=file.o image.o OpenCL_common.o main.o
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/OpenCL_common.cpp
deleted file mode 100644
index bf1a8b7fb5..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/OpenCL_common.cpp
+++ /dev/null
@@ -1,296 +0,0 @@
-
-
-#include "OpenCL_common.h"
-#include <string.h>
-
-// -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
-  cl_uint numEntries = 16;
-  cl_platform_id clPlatforms[numEntries];
-  cl_uint numPlatforms;
-  
-  cl_device_id clDevices[numEntries];
-  cl_uint numDevices;
-
-  OCL_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
-  //fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
-  bool needDevice = true;
-  
-  for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
-
-    cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    //fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
-      cl_device_id clDevice = clDevices[id];
-      cl_device_type clDeviceType;
-
-      bool canSatisfy = true;
-      
-      if (reqDeviceType != NULL) {
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
-        if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
-          if (*reqDeviceType != clDeviceType) {
-            canSatisfy = false;
-          }
-        }
-      }
-
-      va_list paramList;
-      va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
-        cl_bool clInfoBool;
-        size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
-        if (clInfoBool != true) {
-          canSatisfy = false;
-        }
-      }
-      
-      va_end(paramList);
-      if (canSatisfy) {
-        *device = clDevice;
-        *platform = clPlatform;
-        needDevice = false;
-        if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
-          *reqDeviceType = clDeviceType;
-        }
-      }
-    } // End checking all devices for a platform
-  } // End checking all platforms
-
-  int retVal = -1;
-  if (needDevice) {
-    retVal = -1;
-  } else {
-    retVal = 0;
-  }
-  
-  return retVal;
-
-}
-
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	cl_uint maxMemAlloc = 0;
-	
-	OCL_ERRCK_RETVAL ( clGetDeviceInfo(	device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-
-	
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/OpenCL_common.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/OpenCL_common.h
deleted file mode 100644
index c51800532d..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/OpenCL_common.h
+++ /dev/null
@@ -1,22 +0,0 @@
-
-#ifndef __OPENCL_COMMON_H_
-#define __OPENCL_COMMON_H_
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <CL/cl.h>
-
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/file.c b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/file.c
deleted file mode 100644
index 5187c7f7cc..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/file.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include "file.h"
-
-unsigned short
-read16u(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-short
-read16i(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-void
-write32u(FILE *f, unsigned int i)
-{
-  putc(i, f);
-  putc(i >> 8, f);
-  putc(i >> 16, f);
-  putc(i >> 24, f);
-}
-
-void
-write16u(FILE *f, unsigned short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
-
-void
-write16i(FILE *f, short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/file.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/file.h
deleted file mode 100644
index 5d783e9134..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-unsigned short read16u(FILE *f);
-short read16i(FILE *f);
-
-void write32u(FILE *f, unsigned int i);
-void write16u(FILE *f, unsigned short h);
-void write16i(FILE *f, short h);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/image.c b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/image.c
deleted file mode 100644
index d7ed0fcce3..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/image.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "file.h"
-#include "image.h"
-
-struct image_i16 *
-load_image(char *filename)
-{
-  FILE *infile;
-  short *data;
-  int w;
-  int h;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read image contents */
-  data = (short *)malloc(w * h * sizeof(short));
-  fread(data, sizeof(short), w * h, infile);
-
-  fclose(infile);
-
-  /* Create the return data structure */
-  {
-    struct image_i16 *ret =
-      (struct image_i16 *)malloc(sizeof(struct image_i16));
-    ret->width = w;
-    ret->height = h;
-    ret->data = data;
-    return ret;
-  }
-}
-
-void
-free_image(struct image_i16 *img)
-{
-  free(img->data);
-  free(img);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/image.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/image.h
deleted file mode 100644
index 27fc3e0b35..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/image.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-struct image_i16
-{
-  int width;
-  int height;
-  short *data;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct image_i16 * load_image(char *filename);
-void free_image(struct image_i16 *);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/main.cpp b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/main.cpp
deleted file mode 100644
index 4948026b7b..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/main.cpp
+++ /dev/null
@@ -1,519 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <inttypes.h>
-#include <parboil.h>
-#include <CL/cl.h>
-
-#include "sad.h"
-#include "sad_kernel.h"
-#include "file.h"
-#include "image.h"
-#include "OpenCL_common.h"
-
-static unsigned short *
-load_sads(char *filename);
-static void
-write_sads(char *filename,
-	   int image_width_macroblocks,
-	   int image_height_macroblocks,
-	   unsigned short *sads);
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads);
-
-/* FILE I/O */
-
-unsigned short *
-load_sads(char *filename)
-{
-  FILE *infile;
-  unsigned short *sads;
-  int w;
-  int h;
-  int sads_per_block;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions (measured in macroblocks) */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read SAD values.  Only interested in the 4x4 SAD values, which are
-   * at the end of the file. */
-  sads_per_block = MAX_POS_PADDED * (w * h);
-  fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR);
-
-  sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short));
-  fread(sads, sizeof(unsigned short), sads_per_block * 16, infile);
-  fclose(infile);
-
-  return sads;
-}
-
-/* Compare the reference SADs to the expected SADs.
- */
-void
-check_sads(unsigned short *sads_reference,
-	   unsigned short *sads_computed,
-	   int image_size_macroblocks)
-{
-  int block;
-
-  /* Check the 4x4 SAD values.  These are in sads_reference.
-   * Ignore the data at the beginning of sads_computed. */
-  sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks;
-
-  for (block = 0; block < image_size_macroblocks; block++)
-    {
-      int subblock;
-
-      for (subblock = 0; subblock < 16; subblock++)
-	{
-	  int sad_index;
-
-	  for (sad_index = 0; sad_index < MAX_POS; sad_index++)
-	    {
-	      int index =
-		(block * 16 + subblock) * MAX_POS_PADDED + sad_index;
-
-	      if (sads_reference[index] != sads_computed[index])
-		{
-#if 0
-		  /* Print exactly where the mismatch was seen */
-		  printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]);
-#else
-		  goto mismatch;
-#endif
-		}
-	    }
-	}
-    }
-
-  printf("Success.\n");
-  return;
-
- mismatch:
-  printf("Computed SADs do not match expected values.\n");
-}
-
-/* Extract the SAD data for a particular block type for a particular
- * macroblock from the array of SADs of that block type. */
-static inline void
-write_subblocks(FILE *outfile, unsigned short *subblock_array, int macroblock,
-		int count)
-{
-  int block;
-  int pos;
-
-  for (block = 0; block < count; block++)
-    {
-      unsigned short *vec = subblock_array +
-	(block + macroblock * count) * MAX_POS_PADDED;
-
-      /* Write all SADs for this sub-block */
-      for (pos = 0; pos < MAX_POS; pos++)
-	write16u(outfile, *vec++);
-    }
-}
-
-/* Write some SAD data to a file for output checking.
- *
- * All SAD values for six rows of macroblocks are written.
- * The six rows consist of the top two, middle two, and bottom two image rows.
- */
-void
-write_sads(char *filename,
-	   int mb_width,
-	   int mb_height,
-	   unsigned short *sads)
-{
-  FILE *outfile = fopen(filename, "w");
-  int mbs = mb_width * mb_height;
-  int row_indir;
-  int row_indices[6] = {0, 1,
-			mb_height / 2 - 1, mb_height / 2,
-			mb_height - 2, mb_height - 1};
-
-  if (outfile == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the number of output macroblocks */
-  write32u(outfile, mb_width * 6);
-
-  /* Write zeros */
-  write32u(outfile, 0);
-
-  /* Each row */
-  for (row_indir = 0; row_indir < 6; row_indir++)
-    {
-      int row = row_indices[row_indir];
-
-      /* Each block in row */
-      int block;
-      for (block = mb_width * row; block < mb_width * (row + 1); block++)
-	{
-	  int blocktype;
-
-	  /* Write SADs for all sub-block types */
-	  for (blocktype = 1; blocktype <= 7; blocktype++)
-	    write_subblocks(outfile,
-			    sads + SAD_TYPE_IX(blocktype, mbs),
-			    block,
-			    SAD_TYPE_CT(blocktype));
-	}
-    }
-
-  fclose(outfile);
-}
-
-/* FILE I/O for debugging */
-
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads)
-{
-  FILE *f = fopen(filename, "w");
-  int n;
-
-  write16u(f, width);
-  write16u(f, height);
-  for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) {
-    write16u(f, sads[n]);
-  }
-  fclose(f);
-}
-
-static void
-print_test_sad_vector(unsigned short *base, int macroblock, int count)
-{
-  int n;
-  int searchpos = 17*33+17;
-  for (n = 0; n < count; n++)
-    printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]);
-}
-
-static void
-print_test_sads(unsigned short *sads_computed,
-		int mbs)
-{
-  int macroblock = 5;
-  int blocktype;
-
-  for (blocktype = 1; blocktype <= 7; blocktype++)
-    {
-      printf("%d:", blocktype);
-      print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs),
-			    macroblock, SAD_TYPE_CT(blocktype));
-      puts("\n");
-    }
-}
-
-/* MAIN */
-
-int
-main(int argc, char **argv)
-{
-  struct image_i16 *ref_image;
-  struct image_i16 *cur_image;
-  unsigned short *sads_computed; /* SADs generated by the program */
-
-  int image_size_bytes;
-  int image_width_macroblocks, image_height_macroblocks;
-  int image_size_macroblocks;
-
-  struct pb_TimerSet timers;
-  struct pb_Parameters *params;
-  
-  char oclOverhead[]= "OpenCL Overhead";
-
-  pb_InitializeTimerSet(&timers);
-  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-  
-  params = pb_ReadParameters(&argc, argv);
-
-  if (pb_Parameters_CountInputs(params) != 2)
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
-
-  /* Read input files */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  ref_image = load_image(params->inpFiles[0]);
-  cur_image = load_image(params->inpFiles[1]);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if ((ref_image->width != cur_image->width) ||
-      (ref_image->height != cur_image->height))
-    {
-      fprintf(stderr, "Input images must be the same size\n");
-      exit(-1);
-    }
-  if ((ref_image->width % 16) || (ref_image->height % 16))
-    {
-      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
-      exit(-1);
-    }
-
-  /* Compute parameters, allocate memory */
-  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
-  image_width_macroblocks = ref_image->width >> 4;
-  image_height_macroblocks = ref_image->height >> 4;
-  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;
-  
-  sads_computed = (unsigned short *)
-    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));
-
-  // Run the kernel code
-  // ************************************************************************
-  
-  	cl_int ciErrNum;
-  	cl_platform_id clPlatform;
-  	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-
-	cl_kernel mb_sad_calc;
-	cl_kernel larger_sad_calc_8;
-	cl_kernel larger_sad_calc_16;
-	
-	cl_mem imgRef;		/* Reference image on the device */
-	cl_mem d_cur_image;	/* Current image on the device */
-	cl_mem d_sads;		/* SADs on the device */
-
-    // x : image_width_macroblocks
-    // y : image_height_macroblocks
-
-    // Set up OpenCL Context/Platform/Device/CommandQueue
-    int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 1, CL_DEVICE_IMAGE_SUPPORT);
-    if (deviceFound < 0) {
-      fprintf(stderr, "\nError:  No adequate device was found\n\n");
-      exit(1);
-    }
-
-    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0};
-    clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum);
-  	OCL_ERRCK_VAR(ciErrNum);
-  	
-  	clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
-  	OCL_ERRCK_VAR(ciErrNum);
-  	
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-    pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-  	
-  	// Read Source Code File
-  	size_t program_length;
-    const char* source_path = "src/opencl_base/sad_kernel.cl";
-    char* source = oclLoadProgSource(source_path, "", &program_length);
-    if(!source) {
-        fprintf(stderr, "Could not load program source\n"); exit(1);
-    }
-  	
-  	cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-  	OCL_ERRCK_VAR(ciErrNum);
-  	
-  	free(source);
-    
-    // JIT Compilation Options
-    char compileOptions[1024];
-    //                -cl-nv-verbose
-    sprintf(compileOptions, "\
-                -D MAX_POS=%u -D CEIL_POS=%u\
-                -D POS_PER_THREAD=%u -D MAX_POS_PADDED=%u\
-                -D THREADS_W=%u -D THREADS_H=%u\
-                -D SEARCH_RANGE=%u -D SEARCH_DIMENSION=%u\
-                \0",
-                MAX_POS, CEIL(MAX_POS, POS_PER_THREAD),
-                POS_PER_THREAD,   MAX_POS_PADDED,
-                THREADS_W,   THREADS_H,
-                SEARCH_RANGE, SEARCH_DIMENSION
-            ); 
-    
-    OCL_ERRCK_RETVAL( clBuildProgram(clProgram, 1, &clDevice, compileOptions, NULL, NULL) );
- 	
-   /*	
-   char *build_log;
-       size_t ret_val_size;
-       OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) );
-       build_log = (char *)malloc(ret_val_size+1);
-       OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) );
-
-       // Null terminate (original writer wasn't sure)
-       build_log[ret_val_size] = '\0';
-
-       fprintf(stderr, "%s\n", build_log );
-   */  
-
-    mb_sad_calc = clCreateKernel(clProgram, "mb_sad_calc", &ciErrNum);
-   	OCL_ERRCK_VAR(ciErrNum);    
-   	larger_sad_calc_8 = clCreateKernel(clProgram, "larger_sad_calc_8", &ciErrNum);
-   	OCL_ERRCK_VAR(ciErrNum);
-   	larger_sad_calc_16 = clCreateKernel(clProgram, "larger_sad_calc_16", &ciErrNum);
-   	OCL_ERRCK_VAR(ciErrNum);
-
-    size_t wgSize;
-    size_t comp_wgSize[3];
-    cl_ulong localMemSize;
-    size_t prefwgSizeMult;
-    cl_ulong privateMemSize;
- 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY); 
-    
-
-    cl_image_format img_format;
-    img_format.image_channel_order = CL_R;
-    img_format.image_channel_data_type = CL_UNSIGNED_INT16;
-
-    /* Transfer reference image to device */
-	imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format, 
-                                      ref_image->width /** sizeof(unsigned short)*/, // width
-                                      ref_image->height, // height
-                                      ref_image->width * sizeof(unsigned short), // row_pitch
-                                      ref_image->data, &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);                                      
-	
-    /* Allocate SAD data on the device */
-
-    unsigned short *tmpZero = (unsigned short *)calloc(41 * MAX_POS_PADDED * image_size_macroblocks, sizeof(unsigned short));
-    
-    size_t max_alloc_size = 0;
-    clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, 
-                    sizeof(max_alloc_size), &max_alloc_size, NULL);
-    if (max_alloc_size < (41 * MAX_POS_PADDED * 
-            image_size_macroblocks * sizeof(unsigned short))) {
-      fprintf(stderr, "Can't allocate sad buffer: max alloc size is %dMB\n",
-              (int) (max_alloc_size >> 20));
-      exit(-1);
-    }
-    
-    d_sads = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), tmpZero, &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-    free(tmpZero);
-    
-    d_cur_image = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_size_bytes, cur_image->data, &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-
-	/* Set Kernel Parameters */	
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 0, sizeof(cl_mem), (void *)&d_sads) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 0, sizeof(cl_mem), (void *)&d_sads) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 0, sizeof(cl_mem), (void *)&d_sads) );
-	
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 1, sizeof(cl_mem), (void *)&d_cur_image) );
-	
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 2, sizeof(int), &image_width_macroblocks) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 1, sizeof(int), &image_width_macroblocks) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 1, sizeof(int), &image_width_macroblocks) );
-	
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 3, sizeof(int), &image_height_macroblocks) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 2, sizeof(int), &image_height_macroblocks) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 2, sizeof(int), &image_height_macroblocks) );
-	
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 4, sizeof(cl_mem), (void *)&imgRef) );	
-	
-	size_t mb_sad_calc_localWorkSize[2] = {
-	    CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H,
-	    1 };
-	size_t mb_sad_calc_globalWorkSize[2] = {
-        mb_sad_calc_localWorkSize[0] * CEIL(ref_image->width / 4, THREADS_W),
-	    mb_sad_calc_localWorkSize[1] * CEIL(ref_image->height / 4, THREADS_H) };
-	
-	size_t larger_sad_calc_8_localWorkSize[2] = {32,4};
-	size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32, 
-	  image_height_macroblocks * 4};
-	
-	size_t larger_sad_calc_16_localWorkSize[2] = {32, 1};
-	size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32,
-	  image_height_macroblocks * 1};
-	
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-	
-    /* Run the 4x4 kernel */	
-	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, mb_sad_calc, 2, 0, mb_sad_calc_globalWorkSize, mb_sad_calc_localWorkSize, 0, 0, 0) );
-		
-	/* Run the larger-blocks kernels */
-	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_8, 2, 0, larger_sad_calc_8_globalWorkSize, larger_sad_calc_8_localWorkSize, 0, 0, 0) );
-		
-	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_16, 2, 0, larger_sad_calc_16_globalWorkSize, larger_sad_calc_16_localWorkSize, 0, 0, 0) );
-
-    OCL_ERRCK_RETVAL( clFinish(clCommandQueue) );
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    /* Transfer SAD data to the host */    
-    OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_sads, CL_TRUE, 
-        0, 
-        41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), 
-        sads_computed, 0, NULL, NULL) );
-
-    /* Free GPU memory */
-    OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_8) );
-    OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_16) );
-    OCL_ERRCK_RETVAL( clReleaseProgram(clProgram) );
-
-    OCL_ERRCK_RETVAL( clReleaseMemObject(d_sads) );
-    OCL_ERRCK_RETVAL( clReleaseMemObject(imgRef) );
-    OCL_ERRCK_RETVAL( clReleaseMemObject(d_cur_image) );
-
-    OCL_ERRCK_RETVAL( clFinish(clCommandQueue) );
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    
-  // ************************************************************************
-  // End GPU Code
-
-  /* Print output */
-  if (params->outFile)
-    {
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      write_sads(params->outFile,
-		 image_width_macroblocks,
-		 image_height_macroblocks,
-		 sads_computed);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-#if 0  /* Debugging */
-  print_test_sads(sads_computed, image_size_macroblocks);
-  write_sads_directly("sad-debug.bin",
-		      ref_image->width / 16, ref_image->height / 16,
-		      sads_computed);
-#endif
-
-  /* Free memory */
-  free(sads_computed);
-  free_image(ref_image);
-  free_image(cur_image);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-  
-  OCL_ERRCK_RETVAL( clReleaseCommandQueue(clCommandQueue) );
-  OCL_ERRCK_RETVAL( clReleaseContext(clContext) );
-  
-  pb_DestroyTimerSet(&timers);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad.h
deleted file mode 100644
index 3374fa0441..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Search offsets within 16 pixels of (0,0) */
-#define SEARCH_RANGE 16
-
-/* The total search area is 33 pixels square */
-#define SEARCH_DIMENSION (2*SEARCH_RANGE+1)
-
-/* The total number of search positions is 33^2 */
-#define MAX_POS 1089
-
-/* This is padded to a multiple of 8 when allocating memory */
-#define MAX_POS_PADDED 1096
-
-/* VBSME block indices in the SAD array for different 
- * block sizes.  The index is computed from the
- * image size in macroblocks.  Block sizes are (height, width):
- *  1: 16 by 16 pixels, one block per macroblock
- *  2: 8  by 16 pixels, 2  blocks per macroblock
- *  3: 16 by 8  pixels, 2  blocks per macroblock
- *  4: 8  by 8  pixels, 4  blocks per macroblock
- *  5: 4  by 8  pixels, 8  blocks per macroblock
- *  6: 8  by 4  pixels, 8  blocks per macroblock
- *  7: 4  by 4  pixels, 16 blocks per macroblock
- */
-#define SAD_TYPE_1_IX(image_size) 0
-#define SAD_TYPE_2_IX(image_size) ((image_size)*MAX_POS_PADDED)
-#define SAD_TYPE_3_IX(image_size) ((image_size)*(3*MAX_POS_PADDED))
-#define SAD_TYPE_4_IX(image_size) ((image_size)*(5*MAX_POS_PADDED))
-#define SAD_TYPE_5_IX(image_size) ((image_size)*(9*MAX_POS_PADDED))
-#define SAD_TYPE_6_IX(image_size) ((image_size)*(17*MAX_POS_PADDED))
-#define SAD_TYPE_7_IX(image_size) ((image_size)*(25*MAX_POS_PADDED))
-
-#define SAD_TYPE_IX(n, image_size) \
-  ((n == 1) ? SAD_TYPE_1_IX(image_size) : \
-   ((n == 2) ? SAD_TYPE_2_IX(image_size) : \
-    ((n == 3) ? SAD_TYPE_3_IX(image_size) : \
-     ((n == 4) ? SAD_TYPE_4_IX(image_size) : \
-      ((n == 5) ? SAD_TYPE_5_IX(image_size) : \
-       ((n == 6) ? SAD_TYPE_6_IX(image_size) : \
-        (SAD_TYPE_7_IX(image_size) \
-	 )))))))
-
-#define SAD_TYPE_1_CT 1
-#define SAD_TYPE_2_CT 2
-#define SAD_TYPE_3_CT 2
-#define SAD_TYPE_4_CT 4
-#define SAD_TYPE_5_CT 8
-#define SAD_TYPE_6_CT 8
-#define SAD_TYPE_7_CT 16
-
-#define SAD_TYPE_CT(n) \
-  ((n == 1) ? SAD_TYPE_1_CT : \
-   ((n == 2) ? SAD_TYPE_2_CT : \
-    ((n == 3) ? SAD_TYPE_3_CT : \
-     ((n == 4) ? SAD_TYPE_4_CT : \
-      ((n == 5) ? SAD_TYPE_5_CT : \
-       ((n == 6) ? SAD_TYPE_6_CT : \
-        (SAD_TYPE_7_CT \
-	 )))))))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void sad4_cpu(unsigned short *blk_sad,
-	      unsigned short *frame,
-	      unsigned short *ref,
-	      int mb_width,
-	      int mb_height);
-
-void larger_sads(unsigned short *sads,
-		 int mbs);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad_kernel.cl b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad_kernel.cl
deleted file mode 100644
index fe27242798..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad_kernel.cl
+++ /dev/null
@@ -1,333 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* The compute kernel. */
-/* The macros THREADS_W and THREADS_H specify the width and height of the
- * area to be processed by one thread, measured in 4-by-4 pixel blocks.
- * Larger numbers mean more computation per thread block.
- *
- * The macro POS_PER_THREAD specifies the number of search positions for which
- * an SAD is computed.  A larger value indicates more computation per thread,
- * and fewer threads per thread block.  It must be a multiple of 3 and also
- * must be at most 33 because the loop to copy from shared memory uses
- * 32 threads per 4-by-4 pixel block.
- *
- */
- 
-// AMD OpenCL fails UINT_CUDA_V
-#if 1
-  #define SHORT2_V 1
-  #define UINT_CUDA_V 0
-#else
-  #define SHORT2_V 0
-  #define UINT_CUDA_V 1
-#endif
-
-// Either works
-#if 0
-  #define VEC_LOAD 1
-  #define CONSTR_LOAD 0
-#else
-  #define VEC_LOAD 0
-  #define CONSTR_LOAD 1
-#endif
-
-// CAST_STORE is only method that works for all implementations of OpenCL tested
-#if 0
-  #define VEC_STORE 1
-  #define CAST_STORE 0
-  #define SCALAR_STORE 0
-#elif 1
-  #define VEC_STORE 0
-  #define CAST_STORE 1
-  #define SCALAR_STORE 0
-#else
-  #define VEC_STORE 0
-  #define CAST_STORE 0
-  #define SCALAR_STORE 1
-#endif
-
-__kernel void mb_sad_calc(__global unsigned short *blk_sad,
-                            __global unsigned short *frame,
-                            int mb_width,
-                            int mb_height,
-                            __read_only image2d_t img_ref)
-{   
-	const sampler_t texSampler =
-	CLK_NORMALIZED_COORDS_FALSE |
-	CLK_ADDRESS_CLAMP_TO_EDGE |
-	CLK_FILTER_NEAREST;
-
-
-  int tx = (get_local_id(0) / CEIL_POS) % THREADS_W;
-  int ty = (get_local_id(0) / CEIL_POS) / THREADS_W;
-  int bx = get_group_id(0);
-  int by = get_group_id(1);
-  int img_width = mb_width*16;
-
-  // Macroblock and sub-block coordinates
-  int mb_x = (tx + bx * THREADS_W) >> 2;
-  int mb_y = (ty + by * THREADS_H) >> 2;
-  int block_x = (tx + bx * THREADS_W) & 0x03;
-  int block_y = (ty + by * THREADS_H) & 0x03;
-
-  // If this thread is assigned to an invalid 4x4 block, do nothing 
-  if ((mb_x < mb_width) && (mb_y < mb_height))
-    {
-      // Pixel offset of the origin of the current 4x4 block
-      int frame_x = ((mb_x << 2) + block_x) << 2;
-      int frame_y = ((mb_y << 2) + block_y) << 2;
-
-      // Origin of the search area for this 4x4 block
-      int ref_x = frame_x - SEARCH_RANGE;
-      int ref_y = frame_y - SEARCH_RANGE;
-
-      // Origin in the current frame for this 4x4 block
-      int cur_o = frame_y * img_width + frame_x;
-
-      int search_pos;
-      int search_pos_base =
-        (get_local_id(0) % CEIL_POS) * POS_PER_THREAD;
-      int search_pos_end = search_pos_base + POS_PER_THREAD;
-
-      // All SADs from this thread are stored in a contiguous chunk
-      // of memory starting at this offset
-      blk_sad += mb_width * mb_height * MAX_POS_PADDED * (9 + 16) +
-        (mb_y * mb_width + mb_x) * MAX_POS_PADDED * 16 +
-        (4 * block_y + block_x) * MAX_POS_PADDED;
-
-      // Don't go past bounds
-      if (search_pos_end > MAX_POS)
-        search_pos_end = MAX_POS;
-
-      // For each search position, within the range allocated to this thread
-      for (search_pos = search_pos_base;
-           search_pos < search_pos_end;
-           search_pos++) {
-        unsigned short sad4x4 = 0;
-        int search_off_x = ref_x + (search_pos % SEARCH_DIMENSION);
-        int search_off_y = ref_y + (search_pos / SEARCH_DIMENSION);
-
-        // 4x4 SAD computation
-        for(int y=0; y<4; y++) {
-          for (int x=0; x<4; x++) {
-          
-          // ([unsigned] short)read_imageui or
-          //                   read_imagei  is required for correct calculation.
-          // Though read_imagei() is shorter, its results are undefined by specification since the input
-          // is an unsigned type, CL_UNSIGNED_INT16
-          
-            sad4x4 += abs((unsigned short)((read_imageui(img_ref, texSampler, (int2)(search_off_x + x, search_off_y + y) )).x) -
-                  frame[cur_o + y * img_width + x]);
-                  
-          }
-        }
-
-        // Save this value into the local SAD array 
-        blk_sad[search_pos] = sad4x4;
-      }
-    }
-
-}
-
-
-//typedef unsigned int uint;
-
-__kernel void larger_sad_calc_8(__global unsigned short *blk_sad,
-				  int mb_width,
-				  int mb_height)
-{
-  int tx = get_local_id(1) & 1;
-  int ty = get_local_id(1) >> 1;
-
-  // Macroblock and sub-block coordinates
-  int mb_x = get_group_id(0);
-  int mb_y = get_group_id(1);
-
-  // Number of macroblocks in a frame
-  int macroblocks = mul24(mb_width, mb_height);
-  int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  __global unsigned short *bi;
-  __global unsigned short *bo_6, *bo_5, *bo_4;
-
-
-  bi = blk_sad    
-    + (mul24(macroblocks, 25) + (ty * 8 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 16;
-
-  // Block type 6: 4x8
-  bo_6 = blk_sad
-    + ((macroblocks << 4) + macroblocks + (ty * 4 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 8;
-
-  if (ty < 100) // always true, but improves register allocation
-    {
-      // Block type 5: 8x4
-      bo_5 = blk_sad
-	+ ((macroblocks << 3) + macroblocks + (ty * 4 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 8;
-
-      // Block type 4: 8x8
-      bo_4 = blk_sad
-	+ ((macroblocks << 2) + macroblocks + (ty * 2 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 4;
-    }
-
-  for (int search_pos = get_local_id(0); search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-#if SHORT2_V
-  #if VEC_LOAD
-      ushort2 s00 = vload2(search_pos,                    bi);
-      ushort2 s01 = vload2(search_pos+  MAX_POS_PADDED/2, bi);
-      ushort2 s10 = vload2(search_pos+4*MAX_POS_PADDED/2, bi);
-      ushort2 s11 = vload2(search_pos+5*MAX_POS_PADDED/2, bi);
-  #else
-      ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]);
-      ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]);
-      ushort2 s10 = (ushort2) (bi[(search_pos + 4*MAX_POS_PADDED/2)*2], bi[(search_pos + 4*MAX_POS_PADDED/2)*2+1]);
-      ushort2 s11 = (ushort2) (bi[(search_pos + 5*MAX_POS_PADDED/2)*2], bi[(search_pos + 5*MAX_POS_PADDED/2)*2+1]);
-  #endif
-
-  #if VEC_STORE
-      ushort2 s0010 = s00 + s10;
-      ushort2 s0111 = s01 + s11;
-      ushort2 s0001 = s00 + s01;
-      ushort2 s1011 = s10 + s11;
-      ushort2 s00011011 = s0001 + s1011;
-      
-      vstore2(s0010, search_pos, bo_6);
-      vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_6);
-      vstore2(s0001, search_pos, bo_5);
-      vstore2(s1011, search_pos+2*MAX_POS_PADDED/2, bo_5);
-      vstore2(s00011011, search_pos, bo_4);
-  #elif CAST_STORE
-      ((__global ushort2 *)bo_6)[search_pos]                  = s00 + s10;
-      ((__global ushort2 *)bo_6)[search_pos+MAX_POS_PADDED/2] = s01 + s11;
-      ((__global ushort2 *)bo_5)[search_pos]                  = s00 + s01;
-      ((__global ushort2 *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = s10 + s11;
-      ((__global ushort2 *)bo_4)[search_pos]                  = (s00 + s01) + (s10 + s11);
-  #else // SCALAR_STORE
-      bo_6[search_pos*2] = s00.x + s10.x;
-      bo_6[search_pos*2+1] = s00.y + s10.y;
-      bo_6[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x;
-      bo_6[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y;
-      bo_5[search_pos*2] = s00.x + s01.x;
-      bo_5[search_pos*2+1] = s00.y + s01.y;
-      bo_5[(search_pos+2*MAX_POS_PADDED/2)*2] = s10.x + s11.x;
-      bo_5[(search_pos+2*MAX_POS_PADDED/2)*2+1] = s10.y + s11.y;
-      bo_4[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x);
-      bo_4[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y);
-  #endif
-#else // UINT_CUDA_V
-      uint i00 = ((__global uint *)bi)[search_pos];
-      uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((__global uint *)bi)[search_pos + 4*MAX_POS_PADDED/2];
-      uint i11 = ((__global uint *)bi)[search_pos + 5*MAX_POS_PADDED/2];
-
-      ((__global uint *)bo_6)[search_pos]                  = i00 + i10;
-      ((__global uint *)bo_6)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((__global uint *)bo_5)[search_pos]                  = i00 + i01;
-      ((__global uint *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = i10 + i11;
-      ((__global uint *)bo_4)[search_pos]                  = (i00 + i01) + (i10 + i11);
-#endif
-    }
-    
-}
-
-
-
-__kernel void larger_sad_calc_16(__global unsigned short *blk_sad,
-				   int mb_width,
-				   int mb_height)
-{
-  // Macroblock coordinates 
-  int mb_x = get_group_id(0);
-  int mb_y = get_group_id(1);
-
-  // Number of macroblocks in a frame
-  int macroblocks = mul24(mb_width, mb_height) * MAX_POS_PADDED;
-  int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  __global unsigned short *bi;
-  __global unsigned short *bo_3, *bo_2, *bo_1;
-
-  //bi = blk_sad + macroblocks * 5 + macroblock_index * 4;
-  bi = blk_sad + ((macroblocks + macroblock_index) << 2) + macroblocks;
-
-  // Block type 3: 8x16
-  //bo_3 = blk_sad + macroblocks * 3 + macroblock_index * 2;
-  bo_3 = blk_sad + ((macroblocks + macroblock_index) << 1) + macroblocks;
-
-  // Block type 5: 8x4
-  bo_2 = blk_sad + macroblocks + macroblock_index * 2;
-
-  // Block type 4: 8x8
-  bo_1 = blk_sad + macroblock_index;
-
-  for (int search_pos = get_local_id(0); search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-#if SHORT2_V
-  #if VEC_LOAD
-      ushort2 s00 = vload2(search_pos,                    bi);
-      ushort2 s01 = vload2(search_pos+  MAX_POS_PADDED/2, bi);
-      ushort2 s10 = vload2(search_pos+2*MAX_POS_PADDED/2, bi);
-      ushort2 s11 = vload2(search_pos+3*MAX_POS_PADDED/2, bi);
-  #else
-      ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]);
-      ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]);
-      ushort2 s10 = (ushort2) (bi[(search_pos + 2*MAX_POS_PADDED/2)*2], bi[(search_pos + 2*MAX_POS_PADDED/2)*2+1]);
-      ushort2 s11 = (ushort2) (bi[(search_pos + 3*MAX_POS_PADDED/2)*2], bi[(search_pos + 3*MAX_POS_PADDED/2)*2+1]);
-  #endif
-
-  #if VEC_STORE
-      ushort2 s0010 = s00 + s10;
-      ushort2 s0111 = s01 + s11;
-      ushort2 s0001 = s00 + s01;
-      ushort2 s1011 = s10 + s11;
-      ushort2 s00011011 = s0001 + s1011;
-      
-      vstore2(s0010, search_pos, bo_3);
-      vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_3);
-      vstore2(s0001, search_pos, bo_2);
-      vstore2(s1011, search_pos+MAX_POS_PADDED/2, bo_2);
-      vstore2(s00011011, search_pos, bo_1);
-  #elif CAST_STORE
-      ((__global ushort2 *)bo_3)[search_pos]                  = s00 + s10;
-      ((__global ushort2 *)bo_3)[search_pos+MAX_POS_PADDED/2] = s01 + s11;
-      ((__global ushort2 *)bo_2)[search_pos]                  = s00 + s01;
-      ((__global ushort2 *)bo_2)[search_pos+MAX_POS_PADDED/2] = s10 + s11;
-      ((__global ushort2 *)bo_1)[search_pos]                  = (s00 + s01) + (s10 + s11);
-  #else // SCALAR_STORE
-      bo_3[search_pos*2] = s00.x + s10.x;
-      bo_3[search_pos*2+1] = s00.y + s10.y;
-      bo_3[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x;
-      bo_3[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y;
-      bo_2[search_pos*2] = s00.x + s01.x;
-      bo_2[search_pos*2+1] = s00.y + s01.y;
-      bo_2[(search_pos+MAX_POS_PADDED/2)*2] = s10.x + s11.x;
-      bo_2[(search_pos+MAX_POS_PADDED/2)*2+1] = s10.y + s11.y;
-      bo_1[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x);
-      bo_1[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y);
-  #endif
-#else // UINT_CUDA_V
-      uint i00 = ((__global uint *)bi)[search_pos];
-      uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((__global uint *)bi)[search_pos + 2*MAX_POS_PADDED/2];
-      uint i11 = ((__global uint *)bi)[search_pos + 3*MAX_POS_PADDED/2];
-
-      ((__global uint *)bo_3)[search_pos]                  = i00 + i10;
-      ((__global uint *)bo_3)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((__global uint *)bo_2)[search_pos]                  = i00 + i01;
-      ((__global uint *)bo_2)[search_pos+MAX_POS_PADDED/2] = i10 + i11;
-      ((__global uint *)bo_1)[search_pos]                  = (i00 + i01) + (i10 + i11);
-#endif
-    }
-}
-
-
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad_kernel.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad_kernel.h
deleted file mode 100644
index 4fbf23ef45..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_base/sad_kernel.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Integer ceiling division.  This computes ceil(x / y) */
-#define CEIL(x,y) (((x) + ((y) - 1)) / (y))
-
-/* Fast multiplication by 33 */
-#define TIMES_DIM_POS(x) (((x) << 5) + (x))
-
-/* Amount of dynamically allocated local storage
- * measured in bytes, 2-byte words, and 8-byte words */
-#define SAD_LOC_SIZE_ELEMS (THREADS_W * THREADS_H * MAX_POS_PADDED)
-#define SAD_LOC_SIZE_BYTES (SAD_LOC_SIZE_ELEMS * sizeof(unsigned short))
-#define SAD_LOC_SIZE_8B (SAD_LOC_SIZE_BYTES / sizeof(vec8b))
-
-/* The search position index space is distributed across threads
- * and across time. */
-/* This many search positions are calculated by each thread.
- * Note: the optimized kernel requires that this number is
- * divisible by 3. */
-#define POS_PER_THREAD 18
-
-/* The width and height (in number of 4x4 blocks) of a tile from the
- * current frame that is computed in a single thread block. */
-#define THREADS_W 1
-#define THREADS_H 1
-
-// #define TIMES_THREADS_W(x) (((x) << 1) + (x))
-#define TIMES_THREADS_W(x) ((x) * THREADS_W)
-
-/* This structure is used for vector load/store operations. */
-
-struct vec8b {
-  int fst;
-  int snd;
-} __attribute__ ((aligned(8)));
-
-
-
-/* 4-by-4 SAD computation on the device. */
-/*
-extern "C" __global__ void mb_sad_calc(unsigned short*,
-			    unsigned short*,
-			    int, int);
-*/
-/* A function to get a reference to the "ref" texture, because sharing
- * of textures between files isn't really supported. */
- /*
-texture<unsigned short, 2, cudaReadModeElementType> &get_ref(void);
-
-extern "C" __global__ void larger_sad_calc_8(unsigned short*, int, int);
-extern "C" __global__ void larger_sad_calc_16(unsigned short*, int, int);*/
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/Makefile b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/Makefile
deleted file mode 100644
index aeb963fe16..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=file.o image.o main.o main.o OpenCL_common.o
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/OpenCL_common.cpp
deleted file mode 100644
index 1056e0117e..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/OpenCL_common.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-
-
-#include "OpenCL_common.h"
-#include <string.h>
-
-// -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
-  cl_uint numEntries = 16;
-  cl_platform_id clPlatforms[numEntries];
-  cl_uint numPlatforms;
-  
-  cl_device_id clDevices[numEntries];
-  cl_uint numDevices;
-
-  OCL_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
-  //fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
-  bool needDevice = true;
-  
-  for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
-
-    cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    //fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
-      cl_device_id clDevice = clDevices[id];
-      cl_device_type clDeviceType;
-
-      bool canSatisfy = true;
-      
-      if (reqDeviceType != NULL) {
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
-        if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
-          if (*reqDeviceType != clDeviceType) {
-            canSatisfy = false;
-          }
-        }
-      }
-
-      va_list paramList;
-      va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
-        cl_bool clInfoBool;
-        size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
-        if (clInfoBool != true) {
-          canSatisfy = false;
-        }
-      }
-      
-      va_end(paramList);
-      if (canSatisfy) {
-        *device = clDevice;
-        *platform = clPlatform;
-        needDevice = false;
-        if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
-          *reqDeviceType = clDeviceType;
-        }
-      }
-    } // End checking all devices for a platform
-  } // End checking all platforms
-
-  int retVal = -1;
-  if (needDevice) {
-    retVal = -1;
-  } else {
-    retVal = 0;
-  }
-  
-  return retVal;
-
-}
-
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/OpenCL_common.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/OpenCL_common.h
deleted file mode 100644
index 9fc2696c0d..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/OpenCL_common.h
+++ /dev/null
@@ -1,21 +0,0 @@
-
-#ifndef __OPENCL_COMMON_H_
-#define __OPENCL_COMMON_H_
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <CL/cl.h>
-
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/file.c b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/file.c
deleted file mode 100644
index 5187c7f7cc..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/file.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include "file.h"
-
-unsigned short
-read16u(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-short
-read16i(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-void
-write32u(FILE *f, unsigned int i)
-{
-  putc(i, f);
-  putc(i >> 8, f);
-  putc(i >> 16, f);
-  putc(i >> 24, f);
-}
-
-void
-write16u(FILE *f, unsigned short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
-
-void
-write16i(FILE *f, short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/file.h
deleted file mode 100644
index 5d783e9134..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-unsigned short read16u(FILE *f);
-short read16i(FILE *f);
-
-void write32u(FILE *f, unsigned int i);
-void write16u(FILE *f, unsigned short h);
-void write16i(FILE *f, short h);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/image.c b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/image.c
deleted file mode 100644
index d7ed0fcce3..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/image.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "file.h"
-#include "image.h"
-
-struct image_i16 *
-load_image(char *filename)
-{
-  FILE *infile;
-  short *data;
-  int w;
-  int h;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read image contents */
-  data = (short *)malloc(w * h * sizeof(short));
-  fread(data, sizeof(short), w * h, infile);
-
-  fclose(infile);
-
-  /* Create the return data structure */
-  {
-    struct image_i16 *ret =
-      (struct image_i16 *)malloc(sizeof(struct image_i16));
-    ret->width = w;
-    ret->height = h;
-    ret->data = data;
-    return ret;
-  }
-}
-
-void
-free_image(struct image_i16 *img)
-{
-  free(img->data);
-  free(img);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/image.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/image.h
deleted file mode 100644
index 27fc3e0b35..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/image.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-struct image_i16
-{
-  int width;
-  int height;
-  short *data;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct image_i16 * load_image(char *filename);
-void free_image(struct image_i16 *);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/main.cpp b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/main.cpp
deleted file mode 100644
index 543b3d42d4..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/main.cpp
+++ /dev/null
@@ -1,517 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <inttypes.h>
-#include <parboil.h>
-#include <CL/cl.h>
-
-#include "sad.h"
-#include "sad_kernel.h"
-#include "file.h"
-#include "image.h"
-#include "OpenCL_common.h"
-
-static unsigned short *
-load_sads(char *filename);
-static void
-write_sads(char *filename,
-	   int image_width_macroblocks,
-	   int image_height_macroblocks,
-	   unsigned short *sads);
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads);
-
-/* FILE I/O */
-
-unsigned short *
-load_sads(char *filename)
-{
-  FILE *infile;
-  unsigned short *sads;
-  int w;
-  int h;
-  int sads_per_block;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions (measured in macroblocks) */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read SAD values.  Only interested in the 4x4 SAD values, which are
-   * at the end of the file. */
-  sads_per_block = MAX_POS_PADDED * (w * h);
-  fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR);
-
-  sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short));
-  fread(sads, sizeof(unsigned short), sads_per_block * 16, infile);
-  fclose(infile);
-
-  return sads;
-}
-
-/* Compare the reference SADs to the expected SADs.
- */
-void
-check_sads(unsigned short *sads_reference,
-	   unsigned short *sads_computed,
-	   int image_size_macroblocks)
-{
-  int block;
-
-  /* Check the 4x4 SAD values.  These are in sads_reference.
-   * Ignore the data at the beginning of sads_computed. */
-  sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks;
-
-  for (block = 0; block < image_size_macroblocks; block++)
-    {
-      int subblock;
-
-      for (subblock = 0; subblock < 16; subblock++)
-	{
-	  int sad_index;
-
-	  for (sad_index = 0; sad_index < MAX_POS; sad_index++)
-	    {
-	      int index =
-		(block * 16 + subblock) * MAX_POS_PADDED + sad_index;
-
-	      if (sads_reference[index] != sads_computed[index])
-		{
-#if 0
-		  /* Print exactly where the mismatch was seen */
-		  printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]);
-#else
-		  goto mismatch;
-#endif
-		}
-	    }
-	}
-    }
-
-  printf("Success.\n");
-  return;
-
- mismatch:
-  printf("Computed SADs do not match expected values.\n");
-}
-
-/* Extract the SAD data for a particular block type for a particular
- * macroblock from the array of SADs of that block type. */
-static inline void
-write_subblocks(FILE *outfile, unsigned short *subblock_array, int macroblock,
-		int count)
-{
-  int block;
-  int pos;
-
-  for (block = 0; block < count; block++)
-    {
-      unsigned short *vec = subblock_array +
-	(block + macroblock * count) * MAX_POS_PADDED;
-
-      /* Write all SADs for this sub-block */
-      for (pos = 0; pos < MAX_POS; pos++)
-	write16u(outfile, *vec++);
-    }
-}
-
-/* Write some SAD data to a file for output checking.
- *
- * All SAD values for six rows of macroblocks are written.
- * The six rows consist of the top two, middle two, and bottom two image rows.
- */
-void
-write_sads(char *filename,
-	   int mb_width,
-	   int mb_height,
-	   unsigned short *sads)
-{
-  FILE *outfile = fopen(filename, "w");
-  int mbs = mb_width * mb_height;
-  int row_indir;
-  int row_indices[6] = {0, 1,
-			mb_height / 2 - 1, mb_height / 2,
-			mb_height - 2, mb_height - 1};
-
-  if (outfile == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the number of output macroblocks */
-  write32u(outfile, mb_width * 6);
-
-  /* Write zeros */
-  write32u(outfile, 0);
-
-  /* Each row */
-  for (row_indir = 0; row_indir < 6; row_indir++)
-    {
-      int row = row_indices[row_indir];
-
-      /* Each block in row */
-      int block;
-      for (block = mb_width * row; block < mb_width * (row + 1); block++)
-	{
-	  int blocktype;
-
-	  /* Write SADs for all sub-block types */
-	  for (blocktype = 1; blocktype <= 7; blocktype++)
-	    write_subblocks(outfile,
-			    sads + SAD_TYPE_IX(blocktype, mbs),
-			    block,
-			    SAD_TYPE_CT(blocktype));
-	}
-    }
-
-  fclose(outfile);
-}
-
-/* FILE I/O for debugging */
-
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads)
-{
-  FILE *f = fopen(filename, "w");
-  int n;
-
-  write16u(f, width);
-  write16u(f, height);
-  for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) {
-    write16u(f, sads[n]);
-  }
-  fclose(f);
-}
-
-static void
-print_test_sad_vector(unsigned short *base, int macroblock, int count)
-{
-  int n;
-  int searchpos = 17*33+17;
-  for (n = 0; n < count; n++)
-    printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]);
-}
-
-static void
-print_test_sads(unsigned short *sads_computed,
-		int mbs)
-{
-  int macroblock = 5;
-  int blocktype;
-
-  for (blocktype = 1; blocktype <= 7; blocktype++)
-    {
-      printf("%d:", blocktype);
-      print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs),
-			    macroblock, SAD_TYPE_CT(blocktype));
-      puts("\n");
-    }
-}
-
-/* MAIN */
-
-int
-main(int argc, char **argv)
-{
-  struct image_i16 *ref_image;
-  struct image_i16 *cur_image;
-  unsigned short *sads_computed; /* SADs generated by the program */
-
-  int image_size_bytes;
-  int image_width_macroblocks, image_height_macroblocks;
-  int image_size_macroblocks;
-
-  struct pb_TimerSet timers;
-  struct pb_Parameters *params;
-
-  char oclOverhead[] = "OpenCL Overhead";
-
-  pb_InitializeTimerSet(&timers);
-  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-  
-  params = pb_ReadParameters(&argc, argv);
-
-  if (pb_Parameters_CountInputs(params) != 2)
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
-
-  /* Read input files */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  ref_image = load_image(params->inpFiles[0]);
-  cur_image = load_image(params->inpFiles[1]);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if ((ref_image->width != cur_image->width) ||
-      (ref_image->height != cur_image->height))
-    {
-      fprintf(stderr, "Input images must be the same size\n");
-      exit(-1);
-    }
-  if ((ref_image->width % 16) || (ref_image->height % 16))
-    {
-      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
-      exit(-1);
-    }
-
-  /* Compute parameters, allocate memory */
-  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
-  image_width_macroblocks = ref_image->width >> 4;
-  image_height_macroblocks = ref_image->height >> 4;
-  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;
-  
-  sads_computed = (unsigned short *)
-    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));
-
-  /* Run the kernel code */
-  {
-  	cl_int ciErrNum;
-  	cl_platform_id clPlatform;
-  	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-
-	cl_kernel mb_sad_calc;
-	cl_kernel larger_sad_calc_8;
-	cl_kernel larger_sad_calc_16;
-	
-	cl_mem imgRef;		/* Reference image on the device */
-	cl_mem d_cur_image;	/* Current image on the device */
-	cl_mem d_sads;		/* SADs on the device */
-
-    // x : image_width_macroblocks
-    // y : image_height_macroblocks
-
-    // Set up OpenCL Context/Platform/Device/CommandQueue
-    int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 1, CL_DEVICE_IMAGE_SUPPORT);
-    if (deviceFound < 0) {
-      fprintf(stderr, "\nError:  No adequate device was found\n\n");
-      exit(1);
-    }
-
-    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0};
-    clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum);
-  	OCL_ERRCK_VAR(ciErrNum);
-  	
-  	clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
-  	OCL_ERRCK_VAR(ciErrNum);
-  	
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-  	pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
-  	
-  	// Load Kernel Source Code from File
-  	size_t program_length;
-    const char* source_path = "src/opencl_nvidia/sad_kernel.cl";
-    // Dynamically allocate buffer for source
-    char* source = oclLoadProgSource(source_path, "", &program_length);
-    if(!source) {
-        fprintf(stderr, "Could not load program source\n"); exit(1);
-    }
-  	
-  	cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-  	OCL_ERRCK_VAR(ciErrNum);
-  	  	
-  	free(source);   // Done with file source code
-    
-    // JIT Compilation Options
-    char compileOptions[1024];
-    //                -cl-nv-verbose // Provides register info for NVIDIA devices
-    // Set all Macros referenced by kernels
-    sprintf(compileOptions, "\
-                -D SAD_LOC_SIZE_BYTES=%u\
-                -D MAX_POS=%u -D CEIL_POS=%u\
-                -D POS_PER_THREAD=%u -D MAX_POS_PADDED=%u\
-                -D THREADS_W=%u -D THREADS_H=%u\
-                -D SEARCH_RANGE=%u -D SEARCH_DIMENSION=%u\
-                \0",
-                SAD_LOC_SIZE_BYTES,
-                MAX_POS, CEIL(MAX_POS, POS_PER_THREAD),
-                POS_PER_THREAD,   MAX_POS_PADDED,
-                THREADS_W,   THREADS_H,
-                SEARCH_RANGE, SEARCH_DIMENSION
-            ); 
-    
-    OCL_ERRCK_RETVAL( clBuildProgram(clProgram, 1, &clDevice, compileOptions, NULL, NULL) );
-    
-    /*
-    char *build_log;
-    size_t ret_val_size;
-    OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) );
-    build_log = (char *)malloc(ret_val_size+1);
-    OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) );
-
-    // Null terminate (original writer wasn't sure)
-    build_log[ret_val_size] = '\0';
-
-    fprintf(stderr, "%s\n", build_log );
-    */	
-
-    mb_sad_calc = clCreateKernel(clProgram, "mb_sad_calc", &ciErrNum);
-   	OCL_ERRCK_VAR(ciErrNum);    
-   	larger_sad_calc_8 = clCreateKernel(clProgram, "larger_sad_calc_8", &ciErrNum);
-   	OCL_ERRCK_VAR(ciErrNum);
-   	larger_sad_calc_16 = clCreateKernel(clProgram, "larger_sad_calc_16", &ciErrNum);
-   	OCL_ERRCK_VAR(ciErrNum);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY); 
-    
-    cl_image_format img_format;
-    img_format.image_channel_order = CL_R;  // 1 Channel (Note: CL_A isn't correct)
-    img_format.image_channel_data_type = CL_UNSIGNED_INT16;
-
-    // **** Transfer reference image to device ****
-	imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format, 
-                                      ref_image->width, // width in pixels
-                                      ref_image->height, // height in pixels
-                                      ref_image->width * sizeof(unsigned short), // row_pitch in bytes
-                                      ref_image->data, &ciErrNum); // source data
-    OCL_ERRCK_VAR(ciErrNum);                                      
-	
-    
-
-    // OpenCL does not support memset() and requires a block of memory to copy 
-    unsigned short *tmpZero = (unsigned short *)calloc(41 * MAX_POS_PADDED * image_size_macroblocks, sizeof(unsigned short));
-    
-    // **** Allocate SAD data on the device ****
-    d_sads = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, 
-        41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), 
-        tmpZero, &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-    free(tmpZero);
-
-    // **** Transfer current image to device ****
-    d_cur_image = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-                                    image_size_bytes, cur_image->data, &ciErrNum);
-    OCL_ERRCK_VAR(ciErrNum);
-
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-
-	// **** Set Kernel Parameters ****
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 0, sizeof(cl_mem), (void *)&d_sads) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 0, sizeof(cl_mem), (void *)&d_sads) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 0, sizeof(cl_mem), (void *)&d_sads) );
-	
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 1, sizeof(cl_mem), (void *)&d_cur_image) );
-	
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 2, sizeof(int), &image_width_macroblocks) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 1, sizeof(int), &image_width_macroblocks) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 1, sizeof(int), &image_width_macroblocks) );
-	
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 3, sizeof(int), &image_height_macroblocks) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 2, sizeof(int), &image_height_macroblocks) );
-	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 2, sizeof(int), &image_height_macroblocks) );
-	
-	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 4, sizeof(cl_mem), (void *)&imgRef) );
-	
-	// **** Set Kernel Global and Local Worksizes ****
-	size_t mb_sad_calc_localWorkSize[2] = {
-	    CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H,
-	    1 };
-	size_t mb_sad_calc_globalWorkSize[2] = {
-        CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H *
-                CEIL(ref_image->width / 4, THREADS_W),
-	    CEIL(ref_image->height / 4, THREADS_H) };
-	
-	size_t larger_sad_calc_8_localWorkSize[2] = {32,4};
-	size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32, 
-	                                              image_height_macroblocks * 4};
-	
-	size_t larger_sad_calc_16_localWorkSize[2] = {32, 1};
-	size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32,
-	                                               image_height_macroblocks};
-	
-    //pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-	
-    // **** Run the 4x4 kernel ****
-	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, mb_sad_calc, 2, 0, 
-	                                    mb_sad_calc_globalWorkSize, 
-	                                    mb_sad_calc_localWorkSize, 0, 0, 0) );
-		
-	// **** Run the larger-blocks kernels ****
-	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_8, 2, 0,
-	                                    larger_sad_calc_8_globalWorkSize,
-	                                    larger_sad_calc_8_localWorkSize, 0, 0, 0) );
-		
-	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_16, 2, 0, 
-	                                    larger_sad_calc_16_globalWorkSize,
-	                                    larger_sad_calc_16_localWorkSize, 0, 0, 0) );	
-
-    //clFinish(clCommandQueue);   // Synchronize to make timing more like CUDA benchmarks
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    // **** Transfer SAD data to the host ****    
-    OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_sads, CL_TRUE, 0, 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), sads_computed, 0, NULL, NULL) );
-
-
-    /* Free GPU memory */   
-    OCL_ERRCK_RETVAL( clReleaseKernel(mb_sad_calc) );
-    OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_8) );
-    OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_16) );
-    OCL_ERRCK_RETVAL( clReleaseProgram(clProgram) );
-    OCL_ERRCK_RETVAL( clReleaseCommandQueue(clCommandQueue) );
-    OCL_ERRCK_RETVAL( clReleaseContext(clContext) );
-    OCL_ERRCK_RETVAL( clReleaseMemObject(d_sads) );
-    OCL_ERRCK_RETVAL( clReleaseMemObject(imgRef) );
-    OCL_ERRCK_RETVAL( clReleaseMemObject(d_cur_image) );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    
-  }
-
-  /* Print output */
-  if (params->outFile)
-    {
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      write_sads(params->outFile,
-		 image_width_macroblocks,
-		 image_height_macroblocks,
-		 sads_computed);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-#if 0  /* Debugging */
-  print_test_sads(sads_computed, image_size_macroblocks);
-  write_sads_directly("sad-debug.bin",
-		      ref_image->width / 16, ref_image->height / 16,
-		      sads_computed);
-#endif
-
-  /* Free memory */
-  free(sads_computed);
-  free_image(ref_image);
-  free_image(cur_image);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/main_debug.cpp b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/main_debug.cpp
deleted file mode 100644
index e1a560c6e7..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/main_debug.cpp
+++ /dev/null
@@ -1,695 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <inttypes.h>
-#include <parboil.h>
-#include <CL/cl.h>
-
-#include "sad.h"
-#include "sad_kernel.h"
-#include "file.h"
-#include "image.h"
-
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
-}
-  
-#define OCL_ERRCK(s) \
-  { if (s != CL_SUCCESS) fprintf(stderr, "OpenCL Error (Line %d): %s\n", __LINE__, oclErrorString(s)); }
-  
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
-
-
-static unsigned short *
-load_sads(char *filename);
-static void
-write_sads(char *filename,
-	   int image_size_macroblocks,
-	   unsigned short *sads);
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads);
-
-/* FILE I/O */
-
-unsigned short *
-load_sads(char *filename)
-{
-  FILE *infile;
-  unsigned short *sads;
-  int w;
-  int h;
-  int sads_per_block;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions (measured in macroblocks) */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read SAD values.  Only interested in the 4x4 SAD values, which are
-   * at the end of the file. */
-  sads_per_block = MAX_POS_PADDED * (w * h);
-  fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR);
-
-  sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short));
-  fread(sads, sizeof(unsigned short), sads_per_block * 16, infile);
-  fclose(infile);
-
-  return sads;
-}
-
-/* Compare the reference SADs to the expected SADs.
- */
-void
-check_sads(unsigned short *sads_reference,
-	   unsigned short *sads_computed,
-	   int image_size_macroblocks)
-{
-  int block;
-
-  /* Check the 4x4 SAD values.  These are in sads_reference.
-   * Ignore the data at the beginning of sads_computed. */
-  sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks;
-
-  for (block = 0; block < image_size_macroblocks; block++)
-    {
-      int subblock;
-
-      for (subblock = 0; subblock < 16; subblock++)
-	{
-	  int sad_index;
-
-	  for (sad_index = 0; sad_index < MAX_POS; sad_index++)
-	    {
-	      int index =
-		(block * 16 + subblock) * MAX_POS_PADDED + sad_index;
-
-	      if (sads_reference[index] != sads_computed[index])
-		{
-#if 0
-		  /* Print exactly where the mismatch was seen */
-		  printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]);
-#else
-		  goto mismatch;
-#endif
-		}
-	    }
-	}
-    }
-
-  printf("Success.\n");
-  return;
-
- mismatch:
-  printf("Computed SADs do not match expected values.\n");
-}
-
-/* Extract the SAD data for a particular block type for a particular
- * macroblock from the array of SADs of that block type. */
-static inline void
-write_subblocks(FILE *outfile, unsigned short *subblock_array, int macroblock,
-		int count)
-{
-  int block;
-  int pos;
-
-  for (block = 0; block < count; block++)
-    {
-      unsigned short *vec = subblock_array +
-	(block + macroblock * count) * MAX_POS_PADDED;
-
-      /* Write all SADs for this sub-block */
-      for (pos = 0; pos < MAX_POS; pos++)
-	write16u(outfile, *vec++);
-    }
-}
-
-void
-write_sads(char *filename,
-	   int image_size_macroblocks,
-	   unsigned short *sads)
-{
-  FILE *outfile = fopen(filename, "w");
-  int block;
-
-  if (outfile == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write size in macroblocks */
-  write32u(outfile, image_size_macroblocks);
-
-  /* Write zeros */
-  write32u(outfile, 0);
-
-  /* Each macroblock */
-  for (block = 0; block < image_size_macroblocks; block++)
-    {
-      int blocktype;
-
-      /* Write SADs for all sub-block types */
-      for (blocktype = 1; blocktype <= 7; blocktype++)
-	write_subblocks(outfile,
-			sads + SAD_TYPE_IX(blocktype, image_size_macroblocks),
-			block,
-			SAD_TYPE_CT(blocktype));
-    }
-
-  fclose(outfile);
-}
-
-/* FILE I/O for debugging */
-
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads)
-{
-  FILE *f = fopen(filename, "w");
-  int n;
-
-  write16u(f, width);
-  write16u(f, height);
-  for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) {
-    write16u(f, sads[n]);
-  }
-  fclose(f);
-}
-
-static void
-print_test_sad_vector(unsigned short *base, int macroblock, int count)
-{
-  int n;
-  int searchpos = 17*33+17;
-  for (n = 0; n < count; n++)
-    printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]);
-}
-
-static void
-print_test_sads(unsigned short *sads_computed,
-		int mbs)
-{
-  int macroblock = 5;
-  int blocktype;
-
-  for (blocktype = 1; blocktype <= 7; blocktype++)
-    {
-      printf("%d:", blocktype);
-      print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs),
-			    macroblock, SAD_TYPE_CT(blocktype));
-      puts("\n");
-    }
-}
-
-/* MAIN */
-
-int
-main(int argc, char **argv)
-{
-  struct image_i16 *ref_image;
-  struct image_i16 *cur_image;
-  unsigned short *sads_computed; /* SADs generated by the program */
-
-  int image_size_bytes;
-  int image_width_macroblocks, image_height_macroblocks;
-  int image_size_macroblocks;
-
-  struct pb_TimerSet timers;
-  struct pb_Parameters *params;
-
-  pb_InitializeTimerSet(&timers);
-  params = pb_ReadParameters(&argc, argv);
-
-  if (pb_Parameters_CountInputs(params) != 2)
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
-
-  /* Read input files */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  ref_image = load_image(params->inpFiles[0]);
-  cur_image = load_image(params->inpFiles[1]);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if ((ref_image->width != cur_image->width) ||
-      (ref_image->height != cur_image->height))
-    {
-      fprintf(stderr, "Input images must be the same size\n");
-      exit(-1);
-    }
-  if ((ref_image->width % 16) || (ref_image->height % 16))
-    {
-      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
-      exit(-1);
-    }
-
-  /* Compute parameters, allocate memory */
-  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
-  image_width_macroblocks = ref_image->width >> 4;
-  image_height_macroblocks = ref_image->height >> 4;
-  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;
-  
-  sads_computed = (unsigned short *)
-    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));
-
-  /* Run the kernel code */
-  {
-  	cl_int ciErrNum;
-  	cl_platform_id clPlatform;
-  	int deviceType = CL_DEVICE_TYPE_DEFAULT;
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-
-	cl_kernel mb_sad_calc;
-	cl_kernel larger_sad_calc_8;
-	cl_kernel larger_sad_calc_16;
-	
-	cl_mem imgRef;		/* Reference image on the device */
-	cl_mem d_cur_image;	/* Current image on the device */
-	cl_mem d_sads;		/* SADs on the device */
-
-    // x : image_width_macroblocks
-    // y : image_height_macroblocks
-
-/*
-	if (argc > 1) {
-		if (strcmp(argv[1],"-gpu") == 0) {
-			deviceType = CL_DEVICE_TYPE_GPU;
-		} else if (strcmp(argv[1],"-cpu")==0) {
-			deviceType = CL_DEVICE_TYPE_CPU;
-		}
-	}
-*/
-
-	
-    pb_SwitchToTimer(&timers, pb_TimerID_DRIVER);
-
-	ciErrNum = clGetPlatformIDs(1, &clPlatform, NULL); 
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clGetDeviceIDs(clPlatform, deviceType, 1, &clDevice, NULL);
-	OCL_ERRCK(ciErrNum);
-
-    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0};
-    clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum);
-  	OCL_ERRCK(ciErrNum);
-  	
-  	clCommandQueue = clCreateCommandQueue(clContext, clDevice, 0, &ciErrNum);
-  	OCL_ERRCK(ciErrNum);
-  	
-  	size_t program_length;
-    const char* source_path = "/home/twentz/parboil_nonAC/benchmarks/sad/src/opencl_nvidia/sad_kernel.cl";
-    //const char* source_path = "/afs/crhc.illinois.edu/user/wentz1/ECE598HK/parboil_nonAC/benchmarks/sad/src/opencl_nvidia/sad_kernel.cl";
-    char* source = oclLoadProgSource(source_path, "", &program_length);
-    if(!source) {
-        fprintf(stderr, "Could not load program source\n"); exit(1);
-    }
-  	
-  	cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
-  	OCL_ERRCK(ciErrNum);
-  	
-  	free(source);
-    
-    char compileOptions[1024];
-    //                -cl-nv-verbose
-    sprintf(compileOptions, "\
-                -D SAD_LOC_SIZE_BYTES=%u\
-                -D MAX_POS=%u -D CEIL_POS=%u\
-                -D POS_PER_THREAD=%u -D MAX_POS_PADDED=%u\
-                -D THREADS_W=%u -D THREADS_H=%u\
-                -D SEARCH_RANGE=%u -D SEARCH_DIMENSION=%u\
-                ",
-                SAD_LOC_SIZE_BYTES,
-                MAX_POS, CEIL(MAX_POS, POS_PER_THREAD),
-                POS_PER_THREAD,   MAX_POS_PADDED,
-                THREADS_W,   THREADS_H,
-                SEARCH_RANGE, SEARCH_DIMENSION
-            ); 
-    
-    ciErrNum = clBuildProgram(clProgram, 1, &clDevice, compileOptions, NULL, NULL);
-   	OCL_ERRCK(ciErrNum);
-   	
-   	/*
-   	   char *build_log;
-       size_t ret_val_size;
-       ciErrNum = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK(ciErrNum);
-       build_log = (char *)malloc(ret_val_size+1);
-       ciErrNum = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-       	OCL_ERRCK(ciErrNum);
-
-       // to be carefully, terminate with \0
-       // there's no information in the reference whether the string is 0 terminated or not
-       build_log[ret_val_size] = '\0';
-
-       fprintf(stderr, "%s\n", build_log );
-       */
-
-    mb_sad_calc = clCreateKernel(clProgram, "mb_sad_calc", &ciErrNum);
-   	OCL_ERRCK(ciErrNum);    
-   	larger_sad_calc_8 = clCreateKernel(clProgram, "larger_sad_calc_8", &ciErrNum);
-   	OCL_ERRCK(ciErrNum);
-   	larger_sad_calc_16 = clCreateKernel(clProgram, "larger_sad_calc_16", &ciErrNum);
-   	OCL_ERRCK(ciErrNum);
-
-    size_t wgSize;
-    size_t comp_wgSize[3];
-    cl_ulong localMemSize;
-    size_t prefwgSizeMult;
-    cl_ulong privateMemSize;
-
-/*    
-	ciErrNum = clGetKernelWorkGroupInfo(larger_sad_calc_8, NULL, CL_KERNEL_WORK_GROUP_SIZE, sizeof(wgSize), &wgSize, NULL);
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clGetKernelWorkGroupInfo(larger_sad_calc_8, NULL, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, 3*sizeof(size_t), comp_wgSize, NULL);
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clGetKernelWorkGroupInfo(larger_sad_calc_8, NULL, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemSize, NULL);
-	OCL_ERRCK(ciErrNum);
-    ciErrNum = clGetKernelWorkGroupInfo(larger_sad_calc_8, NULL, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &prefwgSizeMult, NULL);
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clGetKernelWorkGroupInfo(larger_sad_calc_8, NULL, CL_KERNEL_PRIVATE_MEM_SIZE, sizeof(cl_ulong), &privateMemSize, NULL);
-	OCL_ERRCK(ciErrNum);
-	*/
-/*
-fprintf(stderr, "Work Group Size: %lu\n", wgSize);
-fprintf(stderr, "Compile Work Group Size: %lu, %lu, %lu\n", comp_wgSize[0], comp_wgSize[1], comp_wgSize[2]);
-fprintf(stderr, "Local Memory Size: %lu\n", localMemSize);
-fprintf(stderr, "Preferred Work Group Size Multiple: %lu\n", prefwgSizeMult);
-fprintf(stderr, "Private Memory Size: %lu\n", privateMemSize);	
-*/
- 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY); 
-    
-
-    cl_image_format img_format;
-    img_format.image_channel_order = CL_R;
-    img_format.image_channel_data_type = CL_UNSIGNED_INT16;
-
-/* Transfer reference image to device */
-	imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format, 
-                                      ref_image->width /** sizeof(unsigned short)*/, // width
-                                      ref_image->height, // height
-                                      ref_image->width * sizeof(unsigned short), // row_pitch
-                                      ref_image->data, &ciErrNum);
-    OCL_ERRCK(ciErrNum);                                      
-	
-    /* Allocate SAD data on the device */
-
-    unsigned short *tmpZero = (unsigned short *)malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short));
-    memset(tmpZero, 0, 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short));
-    d_sads = clCreateBuffer(clContext,/* CL_MEM_READ_WRITE |*/ CL_MEM_COPY_HOST_PTR, 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), tmpZero, &ciErrNum);
-    OCL_ERRCK(ciErrNum);
-    free(tmpZero);
-    
-    d_cur_image = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_size_bytes, cur_image->data, &ciErrNum);
-    OCL_ERRCK(ciErrNum);
-
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_DRIVER);
-
-
-	/* Set Kernel Parameters */	
-	ciErrNum = clSetKernelArg(mb_sad_calc, 0, sizeof(cl_mem), (void *)&d_sads);
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clSetKernelArg(larger_sad_calc_8, 0, sizeof(cl_mem), (void *)&d_sads);
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clSetKernelArg(larger_sad_calc_16, 0, sizeof(cl_mem), (void *)&d_sads);
-	OCL_ERRCK(ciErrNum);
-	
-	ciErrNum = clSetKernelArg(mb_sad_calc, 1, sizeof(cl_mem), (void *)&d_cur_image);
-	OCL_ERRCK(ciErrNum);
-	
-	ciErrNum = clSetKernelArg(mb_sad_calc, 2, sizeof(int), &image_width_macroblocks);
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clSetKernelArg(larger_sad_calc_8, 1, sizeof(int), &image_width_macroblocks);
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clSetKernelArg(larger_sad_calc_16, 1, sizeof(int), &image_width_macroblocks);
-	OCL_ERRCK(ciErrNum);
-	
-	ciErrNum = clSetKernelArg(mb_sad_calc, 3, sizeof(int), &image_height_macroblocks);
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clSetKernelArg(larger_sad_calc_8, 2, sizeof(int), &image_height_macroblocks);
-	OCL_ERRCK(ciErrNum);
-	ciErrNum = clSetKernelArg(larger_sad_calc_16, 2, sizeof(int), &image_height_macroblocks);
-	OCL_ERRCK(ciErrNum);
-	
-	ciErrNum = clSetKernelArg(mb_sad_calc, 4, sizeof(cl_mem), (void *)&imgRef);
-	OCL_ERRCK(ciErrNum);
-	
-	/*
-	printf("MaxPos: %d\tPos/Thr: %d\tThr(w,h): (%d,%d)\n", MAX_POS, POS_PER_THREAD, THREADS_W, THREADS_H);
-	printf("Local worksize/ThreadsPerBlock: %d\n", CEIL(MAX_POS, POS_PER_THREAD));
-	printf("ref_image->w/h:\t(%u,%u)\nDiv by 4: \t(%u,%u)\nCeil with W/H: \t(%u,%u)\nRemult, Grid: (%u,%u)\n",
-	ref_image->width, ref_image->height,
-	ref_image->width/4, ref_image->height/4,
-	CEIL(ref_image->width / 4, THREADS_W), CEIL(ref_image->height / 4, THREADS_H),
-	CEIL(ref_image->width / 4, THREADS_W) * THREADS_W,  CEIL(ref_image->height / 4, THREADS_H) * THREADS_H);
-	*/
-	
-	size_t mb_sad_calc_localWorkSize[2] = {
-	    CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H,
-	    1 };
-	size_t mb_sad_calc_globalWorkSize[2] = {
-        CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H *
-            CEIL(ref_image->width / 4, THREADS_W),
-	    CEIL(ref_image->height / 4, THREADS_H) };
-	
-	size_t larger_sad_calc_8_localWorkSize[2] = {32,4};
-	size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32, image_height_macroblocks * 4};
-	
-	size_t larger_sad_calc_16_localWorkSize[2] = {32, 1};
-	size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32, image_height_macroblocks};
-	
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-	
-/*
-printf("Launching:\n"
-"mb_sad_calc:        Global (%lu,%d) ; Local (%lu,%d)\n"
-"larger_sad_calc_8:  Global (%lu,%lu) ; Local (%lu,%lu)\n"
-"larger_sad_calc_16: Global (%lu,%lu) ; Local (%lu,%lu)\n",
-mb_sad_calc_globalWorkSize[0],1,mb_sad_calc_localWorkSize[0],1,
-larger_sad_calc_8_globalWorkSize[0],larger_sad_calc_8_globalWorkSize[1],
-larger_sad_calc_8_localWorkSize[0],larger_sad_calc_8_localWorkSize[1],
-larger_sad_calc_16_globalWorkSize[0],larger_sad_calc_16_globalWorkSize[1],
-larger_sad_calc_16_localWorkSize[0],larger_sad_calc_16_localWorkSize[1]);
-*/
-    /* Run the 4x4 kernel */	
-	ciErrNum = clEnqueueNDRangeKernel(clCommandQueue, mb_sad_calc, 2, 0, mb_sad_calc_globalWorkSize, mb_sad_calc_localWorkSize, 0, 0, 0);
-	OCL_ERRCK(ciErrNum);
-		//cuFuncSetSharedSize(mb_sad_calc, SAD_LOC_SIZE_BYTES);
-		
-	/* Run the larger-blocks kernels */
-	ciErrNum = clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_8, 2, 0, larger_sad_calc_8_globalWorkSize, larger_sad_calc_8_localWorkSize, 0, 0, 0);
-	OCL_ERRCK(ciErrNum);
-		
-	ciErrNum = clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_16, 2, 0, larger_sad_calc_16_globalWorkSize, larger_sad_calc_16_localWorkSize, 0, 0, 0);
-	OCL_ERRCK(ciErrNum);	
-
-    //clFinish(clCommandQueue);
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    /* Transfer SAD data to the host */
-    
-    ciErrNum = clEnqueueReadBuffer(clCommandQueue, d_sads, CL_TRUE, 0, 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), sads_computed, 0, NULL, NULL);
-    OCL_ERRCK(ciErrNum);
-
-/*
-ciErrNum = clFinish(clCommandQueue);
-    OCL_ERRCK(ciErrNum);
-
-  print_test_sads(sads_computed, image_size_macroblocks);
-for (int i = 26*MAX_POS_PADDED; i < 26*MAX_POS_PADDED+41; i += 3) {
-printf("Host %2d:%5u %2d:%5u %2d:%5u\n", i, sads_computed[i],i+1, sads_computed[i+1],i+2, sads_computed[i+2]);
-}*/
-
-    /* Free GPU memory */
-    
-
-    ciErrNum = clReleaseKernel(larger_sad_calc_8);
-    OCL_ERRCK(ciErrNum);
-    ciErrNum = clReleaseKernel(larger_sad_calc_16);
-    OCL_ERRCK(ciErrNum);
-    ciErrNum = clReleaseProgram(clProgram);
-    OCL_ERRCK(ciErrNum);
-    ciErrNum = clReleaseCommandQueue(clCommandQueue);
-    OCL_ERRCK(ciErrNum);
-    ciErrNum = clReleaseContext(clContext);
-    OCL_ERRCK(ciErrNum);
-    ciErrNum = clReleaseMemObject(d_sads);
-    OCL_ERRCK(ciErrNum);
-    ciErrNum = clReleaseMemObject(imgRef);
-    OCL_ERRCK(ciErrNum);
-    ciErrNum = clReleaseMemObject(d_cur_image);
-    OCL_ERRCK(ciErrNum);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    
-  }
-
-  /* Print output */
-  if (params->outFile)
-    {
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      write_sads(params->outFile, image_size_macroblocks, sads_computed);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-#if 0  /* Debugging */
-  print_test_sads(sads_computed, image_size_macroblocks);
-  write_sads_directly("sad-debug.bin",
-		      ref_image->width / 16, ref_image->height / 16,
-		      sads_computed);
-#endif
-
-  /* Free memory */
-  free(sads_computed);
-  free_image(ref_image);
-  free_image(cur_image);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad.h
deleted file mode 100644
index 3374fa0441..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Search offsets within 16 pixels of (0,0) */
-#define SEARCH_RANGE 16
-
-/* The total search area is 33 pixels square */
-#define SEARCH_DIMENSION (2*SEARCH_RANGE+1)
-
-/* The total number of search positions is 33^2 */
-#define MAX_POS 1089
-
-/* This is padded to a multiple of 8 when allocating memory */
-#define MAX_POS_PADDED 1096
-
-/* VBSME block indices in the SAD array for different 
- * block sizes.  The index is computed from the
- * image size in macroblocks.  Block sizes are (height, width):
- *  1: 16 by 16 pixels, one block per macroblock
- *  2: 8  by 16 pixels, 2  blocks per macroblock
- *  3: 16 by 8  pixels, 2  blocks per macroblock
- *  4: 8  by 8  pixels, 4  blocks per macroblock
- *  5: 4  by 8  pixels, 8  blocks per macroblock
- *  6: 8  by 4  pixels, 8  blocks per macroblock
- *  7: 4  by 4  pixels, 16 blocks per macroblock
- */
-#define SAD_TYPE_1_IX(image_size) 0
-#define SAD_TYPE_2_IX(image_size) ((image_size)*MAX_POS_PADDED)
-#define SAD_TYPE_3_IX(image_size) ((image_size)*(3*MAX_POS_PADDED))
-#define SAD_TYPE_4_IX(image_size) ((image_size)*(5*MAX_POS_PADDED))
-#define SAD_TYPE_5_IX(image_size) ((image_size)*(9*MAX_POS_PADDED))
-#define SAD_TYPE_6_IX(image_size) ((image_size)*(17*MAX_POS_PADDED))
-#define SAD_TYPE_7_IX(image_size) ((image_size)*(25*MAX_POS_PADDED))
-
-#define SAD_TYPE_IX(n, image_size) \
-  ((n == 1) ? SAD_TYPE_1_IX(image_size) : \
-   ((n == 2) ? SAD_TYPE_2_IX(image_size) : \
-    ((n == 3) ? SAD_TYPE_3_IX(image_size) : \
-     ((n == 4) ? SAD_TYPE_4_IX(image_size) : \
-      ((n == 5) ? SAD_TYPE_5_IX(image_size) : \
-       ((n == 6) ? SAD_TYPE_6_IX(image_size) : \
-        (SAD_TYPE_7_IX(image_size) \
-	 )))))))
-
-#define SAD_TYPE_1_CT 1
-#define SAD_TYPE_2_CT 2
-#define SAD_TYPE_3_CT 2
-#define SAD_TYPE_4_CT 4
-#define SAD_TYPE_5_CT 8
-#define SAD_TYPE_6_CT 8
-#define SAD_TYPE_7_CT 16
-
-#define SAD_TYPE_CT(n) \
-  ((n == 1) ? SAD_TYPE_1_CT : \
-   ((n == 2) ? SAD_TYPE_2_CT : \
-    ((n == 3) ? SAD_TYPE_3_CT : \
-     ((n == 4) ? SAD_TYPE_4_CT : \
-      ((n == 5) ? SAD_TYPE_5_CT : \
-       ((n == 6) ? SAD_TYPE_6_CT : \
-        (SAD_TYPE_7_CT \
-	 )))))))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void sad4_cpu(unsigned short *blk_sad,
-	      unsigned short *frame,
-	      unsigned short *ref,
-	      int mb_width,
-	      int mb_height);
-
-void larger_sads(unsigned short *sads,
-		 int mbs);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad_kernel.cl b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad_kernel.cl
deleted file mode 100644
index 3fca30e769..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad_kernel.cl
+++ /dev/null
@@ -1,372 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* The compute kernel. */
-/* The macros THREADS_W and THREADS_H specify the width and height of the
- * area to be processed by one thread, measured in 4-by-4 pixel blocks.
- * Larger numbers mean more computation per thread block.
- *
- * The macro POS_PER_THREAD specifies the number of search positions for which
- * an SAD is computed.  A larger value indicates more computation per thread,
- * and fewer threads per thread block.  It must be a multiple of 3 and also
- * must be at most 33 because the loop to copy from shared memory uses
- * 32 threads per 4-by-4 pixel block.
- *
- */
- 
-#define NV_OPENCL 0
- 
-#define TIMES_DIM_POS(x) (((x) << 5) + (x)) 
- 
-/* Macros to access temporary frame storage in shared memory */
-#define FRAME_GET(n, x, y) \
-  (frame_loc[((n) << 4) + ((y) << 2) + (x)])
-#define FRAME_PUT_1(n, x, value) \
-  (frame_loc[((n) << 4) + (x)] = value)
-
-/* Macros to access temporary SAD storage in shared memory */
-#define SAD_LOC_GET(blocknum, pos) \
-  (sad_loc[(blocknum) * MAX_POS_PADDED + (pos)])
-#define SAD_LOC_PUT(blocknum, pos, value) \
-  (sad_loc[(blocknum) * MAX_POS_PADDED + (pos)] = (value))
-
-/* When reading from this array, we use an "index" rather than a
-   search position.  Also, the number of array elements is divided by
-   four relative to SAD_LOC_GET() since this is an array of 8byte
-   data, while SAD_LOC_GET() sees an array of 2byte data. */
-#define SAD_LOC_8B_GET(blocknum, ix) \
-  (sad_loc_8b[(blocknum) * (MAX_POS_PADDED/4) + (ix)])
-
-/* The size of one row of sad_loc_8b.  This is the group of elements
- * holding SADs for all search positions for one 4x4 block. */
-#define SAD_LOC_8B_ROW_SIZE (MAX_POS_PADDED/4)
-
-/* The presence of this preprocessor variable controls which
- * of two means of computing the current search position is used. */
-#define SEARCHPOS_RECURRENCE
-
-__kernel void mb_sad_calc(__global unsigned short *blk_sad,
-			    __global unsigned short *frame,
-			    int mb_width,
-			    int mb_height,
-			    __read_only image2d_t img_ref)
-{
-
-	const sampler_t texSampler =
-	CLK_NORMALIZED_COORDS_FALSE |
-	CLK_ADDRESS_CLAMP_TO_EDGE |
-	CLK_FILTER_NEAREST;
-
-// A local copy of the current 4x4 block 
-    __local unsigned short frame_loc[THREADS_W * THREADS_H * 16];
-    // The local SAD array on the device.  This is an array of short ints.  It is
-    // interpreted as an array of 8-byte data for global data transfers.
-    __local unsigned short sad_loc[SAD_LOC_SIZE_BYTES];
-    __local uint2 *sad_loc_8b = sad_loc;
-
-  int txy_tmp = get_local_id(0) / CEIL_POS;
-  int ty = txy_tmp / THREADS_W;
-  int tx = txy_tmp - mul24(ty, THREADS_W);
-  int bx = get_global_id(0) / get_local_size(0);
-  int by = get_global_id(1) / get_local_size(1);
-
-  // Macroblock and sub-block coordinates 
-  int mb_x = (tx + mul24(bx, THREADS_W)) >> 2;
-  int mb_y = (ty + mul24(by, THREADS_H)) >> 2;
-  int block_x = (tx + mul24(bx, THREADS_W)) & 0x03;
-  int block_y = (ty + mul24(by, THREADS_H)) & 0x03;
-
-  // Block-copy data into shared memory.
-  // Threads are grouped into sets of 16, leaving some threads idle.
-  if ((get_local_id(0) >> 4) < (THREADS_W * THREADS_H))
-  {
-    int ty = (get_local_id(0) >> 4) / THREADS_W;
-    int tx = (get_local_id(0) >> 4) - mul24(ty, THREADS_W);
-    int tgroup = get_local_id(0) & 15;
-
-    // Width of the image in pixels
-    int img_width = mb_width*16;
-
-    // Pixel offset of the origin of the current 4x4 block 
-    int frame_x = (tx + mul24(bx, THREADS_W)) << 2;
-    int frame_y = (ty + mul24(by, THREADS_H)) << 2;
-
-    // Origin in the current frame for this 4x4 block
-    int cur_o = frame_y * img_width + frame_x;
-
-    // If this is an invalid 4x4 block, do nothing
-    if (((frame_x >> 4) < mb_width) && ((frame_y >> 4) < mb_height))
-      {
-	// Copy one pixel into 'frame'
-	FRAME_PUT_1(mul24(ty, THREADS_W) + tx, tgroup,
-		    frame[cur_o + (tgroup >> 2) * img_width + (tgroup & 3)]); 
-      }
-  }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-  //__syncthreads();
-
-  // If this thread is assigned to an invalid 4x4 block, do nothing 
-  if ((mb_x < mb_width) && (mb_y < mb_height))
-    {
-      // Pixel offset of the origin of the current 4x4 block 
-      int frame_x = ((mb_x << 2) + block_x) << 2;
-      int frame_y = ((mb_y << 2) + block_y) << 2;
-
-      // Origin of the search area for this 4x4 block 
-      int ref_x = frame_x - SEARCH_RANGE;
-      int ref_y = frame_y - SEARCH_RANGE;
-
-      // Origin in the current frame for this 4x4 block 
-      int cur_o = ty * THREADS_W + tx;
-
-      int search_pos;
-      int search_pos_base =
-	(get_local_id(0) % CEIL_POS) * POS_PER_THREAD;
-      int search_pos_end = search_pos_base + POS_PER_THREAD;
-
-      int sotmp = search_pos_base / SEARCH_DIMENSION;
-      int local_search_off_x = search_pos_base - TIMES_DIM_POS(sotmp);
-      int search_off_y = ref_y + sotmp;
-
-      // Don't go past bounds 
-      if (search_pos_end > MAX_POS)
-	search_pos_end = MAX_POS;
-
-      // For each search position, within the range allocated to this thread 
-      for (search_pos = search_pos_base;
-	   search_pos < search_pos_end;
-	   search_pos += 3) {
-	// It is also beneficial to fuse (jam) the enclosed loops if this loop
-	// is unrolled. 
-	unsigned short sad1 = 0, sad2 = 0, sad3 = 0;
-	int search_off_x = ref_x + local_search_off_x;
-
-	// 4x4 SAD computation 
-	for(int y=0; y<4; y++) {
-	  int t; // signed int or unsigned short works, but not unsigned int
-	  
-	  t = (read_imageui(img_ref, texSampler, (int2)(search_off_x, search_off_y + y) )).x;
-	  sad1 += abs(t - FRAME_GET(cur_o, 0, y));
-    
-      t = (read_imageui(img_ref, texSampler, (int2)(search_off_x + 1, search_off_y + y) )).x;
-	  sad1 += abs(t - FRAME_GET(cur_o, 1, y));
-	  sad2 += abs(t - FRAME_GET(cur_o, 0, y));
-
-      t = (read_imageui(img_ref, texSampler, (int2)(search_off_x + 2, search_off_y + y) )).x;
-	  sad1 += abs(t - FRAME_GET(cur_o, 2, y));
-	  sad2 += abs(t - FRAME_GET(cur_o, 1, y));
-	  sad3 += abs(t - FRAME_GET(cur_o, 0, y));
-
-      t = (read_imageui(img_ref, texSampler, (int2)(search_off_x + 3, search_off_y + y) )).x;
-	  sad1 += abs(t - FRAME_GET(cur_o, 3, y));
-	  sad2 += abs(t - FRAME_GET(cur_o, 2, y));
-	  sad3 += abs(t - FRAME_GET(cur_o, 1, y));
-
-      t = (read_imageui(img_ref, texSampler, (int2)(search_off_x + 4, search_off_y + y) )).x;
-	  sad2 += abs(t - FRAME_GET(cur_o, 3, y));
-	  sad3 += abs(t - FRAME_GET(cur_o, 2, y));
-
-      t = (read_imageui(img_ref, texSampler, (int2)(search_off_x + 5, search_off_y + y) )).x;
-	  sad3 += abs(t - FRAME_GET(cur_o, 3, y));
-	}
-
-	// Save this value into the local SAD array 
-	SAD_LOC_PUT(mul24(ty, THREADS_W) + tx, search_pos, sad1);
-	SAD_LOC_PUT(mul24(ty, THREADS_W) + tx, search_pos+1, sad2);
-	SAD_LOC_PUT(mul24(ty, THREADS_W) + tx, search_pos+2, sad3);
-
-	local_search_off_x += 3;
-	if (local_search_off_x >= SEARCH_DIMENSION)
-	  {
-	    local_search_off_x -= SEARCH_DIMENSION;
-	    search_off_y++;
-	  }
-      }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-//  __syncthreads();
-
-  // Block-copy data into global memory.
-  // Threads are grouped into sets of 32, leaving some threads idle. 
-  if ((get_local_id(0) >> 5) < (THREADS_W * THREADS_H))
-  {
-    int tgroup = get_local_id(0) & 31;
-    int ty = (get_local_id(0) >> 5) / THREADS_W;
-    int tx = (get_local_id(0) >> 5) - mul24(ty, THREADS_W);
-    int index;
-
-    // Macroblock and sub-block coordinates 
-    int mb_x = (tx + mul24(bx, THREADS_W)) >> 2;
-    int mb_y = (ty + mul24(by, THREADS_H)) >> 2;
-    int block_x = (tx + mul24(bx, THREADS_W)) & 0x03;
-    int block_y = (ty + mul24(by, THREADS_H)) & 0x03;
-
-    if ((mb_x < mb_width) && (mb_y < mb_height))
-      {
-	// All SADs from this thread are stored in a contiguous chunk
-	// of memory starting at this offset 
-	blk_sad += (mul24(mul24(mb_width, mb_height), 25) +
-		    (mul24(mb_y, mb_width) + mb_x) * 16 +
-		    (4 * block_y + block_x)) *
-	  MAX_POS_PADDED;
-
-	// Block copy, 32 threads at a time 
-	for (index = tgroup; index < SAD_LOC_8B_ROW_SIZE; index += 32)
-	  ((__global uint2 *)blk_sad)[index] 
-	    = SAD_LOC_8B_GET(mul24(ty, THREADS_W) + tx, index);
-      }
-  }
-  
-}
-
-//typedef unsigned int uint;
-
-__kernel void larger_sad_calc_8(__global unsigned short *blk_sad,
-				  int mb_width,
-				  int mb_height)
-{
-  int tx = get_local_id(1) & 1;
-  int ty = get_local_id(1) >> 1;
-
-  // Macroblock and sub-block coordinates
-  int mb_x = get_global_id(0) / get_local_size(0);
-  int mb_y = get_global_id(1) / get_local_size(1);
-
-  // Number of macroblocks in a frame
-  int macroblocks = mul24(mb_width, mb_height);
-  int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  int search_pos;
-
-  __global unsigned short *bi;
-  __global unsigned short *bo_6, *bo_5, *bo_4;
-
-  bi = blk_sad    
-    + (mul24(macroblocks, 25) + (ty * 8 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 16;
-
-  // Block type 6: 4x8
-  bo_6 = blk_sad
-    + ((macroblocks << 4) + macroblocks + (ty * 4 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 8;
-
-  if (ty < 100) // always true, but improves register allocation
-    {
-      // Block type 5: 8x4
-      bo_5 = blk_sad
-	+ ((macroblocks << 3) + macroblocks + (ty * 4 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 8;
-
-      // Block type 4: 8x8
-      bo_4 = blk_sad
-	+ ((macroblocks << 2) + macroblocks + (ty * 2 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 4;
-    }
-
-  for (search_pos = get_local_id(0); search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-      // Each uint is actually two 2-byte integers packed together.
-      // Only addition is used and there is no chance of integer overflow
-      // so this can be done to reduce computation time.
-      
-      #if NV_OPENCL
-      uint i00 = ((__global uint *)bi)[search_pos];
-      uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((__global uint *)bi)[search_pos + 4*MAX_POS_PADDED/2];
-      uint i11 = ((__global uint *)bi)[search_pos + 5*MAX_POS_PADDED/2];
-
-      ((__global uint *)bo_6)[search_pos]                  = i00 + i10;
-      ((__global uint *)bo_6)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((__global uint *)bo_5)[search_pos]                  = i00 + i01;
-      ((__global uint *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = i10 + i11;
-      ((__global uint *)bo_4)[search_pos]                  = (i00 + i01) + (i10 + i11);
-      
-      #else
-      // AMD OpenCL will not correctly compile casting to unsigned int
-      ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]);
-      ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]);
-      ushort2 s10 = (ushort2) (bi[(search_pos + 4*MAX_POS_PADDED/2)*2], bi[(search_pos + 4*MAX_POS_PADDED/2)*2+1]);
-      ushort2 s11 = (ushort2) (bi[(search_pos + 5*MAX_POS_PADDED/2)*2], bi[(search_pos + 5*MAX_POS_PADDED/2)*2+1]);
-      ((__global ushort2 *)bo_6)[search_pos]                  = s00 + s10;
-      ((__global ushort2 *)bo_6)[search_pos+MAX_POS_PADDED/2] = s01 + s11;
-      ((__global ushort2 *)bo_5)[search_pos]                  = s00 + s01;
-      ((__global ushort2 *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = s10 + s11;
-      ((__global ushort2 *)bo_4)[search_pos]                  = (s00 + s01) + (s10 + s11);
-      #endif
-    }
-    
-}
-
-
-
-__kernel void larger_sad_calc_16(__global unsigned short *blk_sad,
-				   int mb_width,
-				   int mb_height)
-{
-  // Macroblock coordinates 
-  int mb_x = get_global_id(0) / get_local_size(0);
-  int mb_y = get_global_id(1) / get_local_size(1);
-
-  // Number of macroblocks in a frame
-  int macroblocks = mul24(mb_width, mb_height) * MAX_POS_PADDED;
-  int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  int search_pos;
-
-  __global unsigned short *bi;
-  __global unsigned short *bo_3, *bo_2, *bo_1;
-
-  //bi = blk_sad + macroblocks * 5 + macroblock_index * 4;
-  bi = blk_sad + ((macroblocks + macroblock_index) << 2) + macroblocks;
-
-  // Block type 3: 8x16
-  //bo_3 = blk_sad + macroblocks * 3 + macroblock_index * 2;
-  bo_3 = blk_sad + ((macroblocks + macroblock_index) << 1) + macroblocks;
-
-  // Block type 5: 8x4
-  bo_2 = blk_sad + macroblocks + macroblock_index * 2;
-
-  // Block type 4: 8x8
-  bo_1 = blk_sad + macroblock_index;
-
-  for (search_pos = get_local_id(0); search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-      // Each uint is actually two 2-byte integers packed together.
-      // Only addition is used and there is no chance of integer overflow
-      // so this can be done to reduce computation time.
-      
-      #if NV_OPENCL
-      uint i00 = ((__global uint *)bi)[search_pos];
-      uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((__global uint *)bi)[search_pos + 2*MAX_POS_PADDED/2];
-      uint i11 = ((__global uint *)bi)[search_pos + 3*MAX_POS_PADDED/2];
-
-      ((__global uint *)bo_3)[search_pos]                  = i00 + i10;
-      ((__global uint *)bo_3)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((__global uint *)bo_2)[search_pos]                  = i00 + i01;
-      ((__global uint *)bo_2)[search_pos+MAX_POS_PADDED/2] = i10 + i11;
-      ((__global uint *)bo_1)[search_pos]                  = (i00 + i01) + (i10 + i11);
-      #else
-      // AMD OpenCL will not correctly compile casting to unsigned int
-      ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]);
-      ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]);
-      ushort2 s10 = (ushort2) (bi[(search_pos + 2*MAX_POS_PADDED/2)*2], bi[(search_pos + 2*MAX_POS_PADDED/2)*2+1]);
-      ushort2 s11 = (ushort2) (bi[(search_pos + 3*MAX_POS_PADDED/2)*2], bi[(search_pos + 3*MAX_POS_PADDED/2)*2+1]); 
-      ((__global ushort2 *)bo_3)[search_pos]                  = s00 + s10;
-      ((__global ushort2 *)bo_3)[search_pos+MAX_POS_PADDED/2] = s01 + s11;
-      ((__global ushort2 *)bo_2)[search_pos]                  = s00 + s01;
-      ((__global ushort2 *)bo_2)[search_pos+MAX_POS_PADDED/2] = s10 + s11;
-      ((__global ushort2 *)bo_1)[search_pos]                  = (s00 + s01) + (s10 + s11);
-      #endif
-      
-    }
-}
-
-
diff --git a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad_kernel.h b/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad_kernel.h
deleted file mode 100644
index 4fbf23ef45..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/opencl_nvidia/sad_kernel.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Integer ceiling division.  This computes ceil(x / y) */
-#define CEIL(x,y) (((x) + ((y) - 1)) / (y))
-
-/* Fast multiplication by 33 */
-#define TIMES_DIM_POS(x) (((x) << 5) + (x))
-
-/* Amount of dynamically allocated local storage
- * measured in bytes, 2-byte words, and 8-byte words */
-#define SAD_LOC_SIZE_ELEMS (THREADS_W * THREADS_H * MAX_POS_PADDED)
-#define SAD_LOC_SIZE_BYTES (SAD_LOC_SIZE_ELEMS * sizeof(unsigned short))
-#define SAD_LOC_SIZE_8B (SAD_LOC_SIZE_BYTES / sizeof(vec8b))
-
-/* The search position index space is distributed across threads
- * and across time. */
-/* This many search positions are calculated by each thread.
- * Note: the optimized kernel requires that this number is
- * divisible by 3. */
-#define POS_PER_THREAD 18
-
-/* The width and height (in number of 4x4 blocks) of a tile from the
- * current frame that is computed in a single thread block. */
-#define THREADS_W 1
-#define THREADS_H 1
-
-// #define TIMES_THREADS_W(x) (((x) << 1) + (x))
-#define TIMES_THREADS_W(x) ((x) * THREADS_W)
-
-/* This structure is used for vector load/store operations. */
-
-struct vec8b {
-  int fst;
-  int snd;
-} __attribute__ ((aligned(8)));
-
-
-
-/* 4-by-4 SAD computation on the device. */
-/*
-extern "C" __global__ void mb_sad_calc(unsigned short*,
-			    unsigned short*,
-			    int, int);
-*/
-/* A function to get a reference to the "ref" texture, because sharing
- * of textures between files isn't really supported. */
- /*
-texture<unsigned short, 2, cudaReadModeElementType> &get_ref(void);
-
-extern "C" __global__ void larger_sad_calc_8(unsigned short*, int, int);
-extern "C" __global__ void larger_sad_calc_16(unsigned short*, int, int);*/
diff --git a/hpvm/test/parboil/benchmarks/sad/src/visc/Makefile b/hpvm/test/parboil/benchmarks/sad/src/visc/Makefile
deleted file mode 100644
index fbd81c4965..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/visc/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=file.o image.o OpenCL_common.o main.o
diff --git a/hpvm/test/parboil/benchmarks/sad/src/visc/file.c b/hpvm/test/parboil/benchmarks/sad/src/visc/file.c
deleted file mode 100644
index 5187c7f7cc..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/visc/file.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include "file.h"
-
-unsigned short
-read16u(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-short
-read16i(FILE *f)
-{
-  int n;
-
-  n = fgetc(f);
-  n += fgetc(f) << 8;
-
-  return n;
-}
-
-void
-write32u(FILE *f, unsigned int i)
-{
-  putc(i, f);
-  putc(i >> 8, f);
-  putc(i >> 16, f);
-  putc(i >> 24, f);
-}
-
-void
-write16u(FILE *f, unsigned short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
-
-void
-write16i(FILE *f, short h)
-{
-  putc(h, f);
-  putc(h >> 8, f);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/visc/file.h b/hpvm/test/parboil/benchmarks/sad/src/visc/file.h
deleted file mode 100644
index 5d783e9134..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/visc/file.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-unsigned short read16u(FILE *f);
-short read16i(FILE *f);
-
-void write32u(FILE *f, unsigned int i);
-void write16u(FILE *f, unsigned short h);
-void write16i(FILE *f, short h);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/visc/image.c b/hpvm/test/parboil/benchmarks/sad/src/visc/image.c
deleted file mode 100644
index d7ed0fcce3..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/visc/image.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "file.h"
-#include "image.h"
-
-struct image_i16 *
-load_image(char *filename)
-{
-  FILE *infile;
-  short *data;
-  int w;
-  int h;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read image contents */
-  data = (short *)malloc(w * h * sizeof(short));
-  fread(data, sizeof(short), w * h, infile);
-
-  fclose(infile);
-
-  /* Create the return data structure */
-  {
-    struct image_i16 *ret =
-      (struct image_i16 *)malloc(sizeof(struct image_i16));
-    ret->width = w;
-    ret->height = h;
-    ret->data = data;
-    return ret;
-  }
-}
-
-void
-free_image(struct image_i16 *img)
-{
-  free(img->data);
-  free(img);
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/visc/image.h b/hpvm/test/parboil/benchmarks/sad/src/visc/image.h
deleted file mode 100644
index 27fc3e0b35..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/visc/image.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-struct image_i16
-{
-  int width;
-  int height;
-  short *data;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct image_i16 * load_image(char *filename);
-void free_image(struct image_i16 *);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/sad/src/visc/main.cpp
deleted file mode 100644
index df898db5aa..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/visc/main.cpp
+++ /dev/null
@@ -1,436 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <inttypes.h>
-#include <parboil.h>
-#include <CL/cl.h>
-
-#include "sad.h"
-#include "sad_kernel.h"
-#include "file.h"
-#include "image.h"
-
-static unsigned short *
-load_sads(char *filename);
-static void
-write_sads(char *filename,
-	   int image_width_macroblocks,
-	   int image_height_macroblocks,
-	   unsigned short *sads);
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads);
-
-/* FILE I/O */
-
-unsigned short *
-load_sads(char *filename)
-{
-  FILE *infile;
-  unsigned short *sads;
-  int w;
-  int h;
-  int sads_per_block;
-
-  infile = fopen(filename, "r");
-
-  if (!infile)
-    {
-      fprintf(stderr, "Cannot find file '%s'\n", filename);
-      exit(-1);
-    }
-
-  /* Read image dimensions (measured in macroblocks) */
-  w = read16u(infile);
-  h = read16u(infile);
-
-  /* Read SAD values.  Only interested in the 4x4 SAD values, which are
-   * at the end of the file. */
-  sads_per_block = MAX_POS_PADDED * (w * h);
-  fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR);
-
-  sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short));
-  fread(sads, sizeof(unsigned short), sads_per_block * 16, infile);
-  fclose(infile);
-
-  return sads;
-}
-
-/* Compare the reference SADs to the expected SADs.
- */
-void
-check_sads(unsigned short *sads_reference,
-	   unsigned short *sads_computed,
-	   int image_size_macroblocks)
-{
-  int block;
-
-  /* Check the 4x4 SAD values.  These are in sads_reference.
-   * Ignore the data at the beginning of sads_computed. */
-  sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks;
-
-  for (block = 0; block < image_size_macroblocks; block++)
-    {
-      int subblock;
-
-      for (subblock = 0; subblock < 16; subblock++)
-	{
-	  int sad_index;
-
-	  for (sad_index = 0; sad_index < MAX_POS; sad_index++)
-	    {
-	      int index =
-		(block * 16 + subblock) * MAX_POS_PADDED + sad_index;
-
-	      if (sads_reference[index] != sads_computed[index])
-		{
-#if 0
-		  /* Print exactly where the mismatch was seen */
-		  printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]);
-#else
-		  goto mismatch;
-#endif
-		}
-	    }
-	}
-    }
-
-  printf("Success.\n");
-  return;
-
- mismatch:
-  printf("Computed SADs do not match expected values.\n");
-}
-
-/* Extract the SAD data for a particular block type for a particular
- * macroblock from the array of SADs of that block type. */
-static inline void
-write_subblocks(FILE *outfile, unsigned short *subblock_array, int macroblock,
-		int count)
-{
-  int block;
-  int pos;
-
-  for (block = 0; block < count; block++)
-    {
-      unsigned short *vec = subblock_array +
-	(block + macroblock * count) * MAX_POS_PADDED;
-
-      /* Write all SADs for this sub-block */
-      for (pos = 0; pos < MAX_POS; pos++)
-	write16u(outfile, *vec++);
-    }
-}
-
-/* Write some SAD data to a file for output checking.
- *
- * All SAD values for six rows of macroblocks are written.
- * The six rows consist of the top two, middle two, and bottom two image rows.
- */
-void
-write_sads(char *filename,
-	   int mb_width,
-	   int mb_height,
-	   unsigned short *sads)
-{
-  FILE *outfile = fopen(filename, "w");
-  int mbs = mb_width * mb_height;
-  int row_indir;
-  int row_indices[6] = {0, 1,
-			mb_height / 2 - 1, mb_height / 2,
-			mb_height - 2, mb_height - 1};
-
-  if (outfile == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the number of output macroblocks */
-  write32u(outfile, mb_width * 6);
-
-  /* Write zeros */
-  write32u(outfile, 0);
-
-  /* Each row */
-  for (row_indir = 0; row_indir < 6; row_indir++)
-    {
-      int row = row_indices[row_indir];
-
-      /* Each block in row */
-      int block;
-      for (block = mb_width * row; block < mb_width * (row + 1); block++)
-	{
-	  int blocktype;
-
-	  /* Write SADs for all sub-block types */
-	  for (blocktype = 1; blocktype <= 7; blocktype++)
-	    write_subblocks(outfile,
-			    sads + SAD_TYPE_IX(blocktype, mbs),
-			    block,
-			    SAD_TYPE_CT(blocktype));
-	}
-    }
-
-  fclose(outfile);
-}
-
-/* FILE I/O for debugging */
-
-static void
-write_sads_directly(char *filename,
-		    int width,
-		    int height,
-		    unsigned short *sads)
-{
-  FILE *f = fopen(filename, "w");
-  int n;
-
-  write16u(f, width);
-  write16u(f, height);
-  for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) {
-    write16u(f, sads[n]);
-  }
-  fclose(f);
-}
-
-static void
-print_test_sad_vector(unsigned short *base, int macroblock, int count)
-{
-  int n;
-  int searchpos = 17*33+17;
-  for (n = 0; n < count; n++)
-    printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]);
-}
-
-static void
-print_test_sads(unsigned short *sads_computed,
-		int mbs)
-{
-  int macroblock = 5;
-  int blocktype;
-
-  for (blocktype = 1; blocktype <= 7; blocktype++)
-    {
-      printf("%d:", blocktype);
-      print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs),
-			    macroblock, SAD_TYPE_CT(blocktype));
-      puts("\n");
-    }
-}
-
-/* MAIN */
-
-int
-main(int argc, char **argv)
-{
-  struct image_i16 *ref_image;
-  struct image_i16 *cur_image;
-  unsigned short *sads_computed; /* SADs generated by the program */
-
-  int image_size_bytes;
-  int image_width_macroblocks, image_height_macroblocks;
-  int image_size_macroblocks;
-
-  struct pb_TimerSet timers;
-  struct pb_Parameters *params;
-
-  pb_InitializeTimerSet(&timers);
-  
-  params = pb_ReadParameters(&argc, argv);
-
-  if (pb_Parameters_CountInputs(params) != 2)
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
-
-  /* Read input files */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  ref_image = load_image(params->inpFiles[0]);
-  cur_image = load_image(params->inpFiles[1]);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if ((ref_image->width != cur_image->width) ||
-      (ref_image->height != cur_image->height))
-    {
-      fprintf(stderr, "Input images must be the same size\n");
-      exit(-1);
-    }
-  if ((ref_image->width % 16) || (ref_image->height % 16))
-    {
-      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
-      exit(-1);
-    }
-
-  /* Compute parameters, allocate memory */
-  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
-  image_width_macroblocks = ref_image->width >> 4;
-  image_height_macroblocks = ref_image->height >> 4;
-  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;
-  
-  sads_computed = (unsigned short *)
-    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));
-
-  // Run the kernel code
-  // ************************************************************************
-  	
-    // x : image_width_macroblocks
-    // y : image_height_macroblocks
-  	 
-   /*	
-   char *build_log;
-       size_t ret_val_size;
-       OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) );
-       build_log = (char *)malloc(ret_val_size+1);
-       OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) );
-
-       // Null terminate (original writer wasn't sure)
-       build_log[ret_val_size] = '\0';
-
-       fprintf(stderr, "%s\n", build_log );
-   */  
-
-    size_t wgSize;
-    size_t comp_wgSize[3];
-    cl_ulong localMemSize;
-    size_t prefwgSizeMult;
-    cl_ulong privateMemSize;
- 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY); 
-    
-    /* Transfer reference image to device */
-/*
-	imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format, 
-                                      ref_image->width , // width
-                                      ref_image->height, // height
-                                      ref_image->width * sizeof(unsigned short), // row_pitch
-                                      ref_image->data, &ciErrNum);
-*/
-	
-    /* Allocate SAD data on the device */
-
-    unsigned short *tmpZero = (unsigned short *)calloc(41 * MAX_POS_PADDED * image_size_macroblocks, sizeof(unsigned short));
-    size_t tmpZero_size = 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short);
-    
-    /* Compute local and global work sizes */
-	size_t mb_sad_calc_localWorkSize[2] = {
-	    CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H,
-	    1 };
-	size_t mb_sad_calc_globalWorkSize[2] = {
-        mb_sad_calc_localWorkSize[0] * CEIL(ref_image->width / 4, THREADS_W),
-	    mb_sad_calc_localWorkSize[1] * CEIL(ref_image->height / 4, THREADS_H) };
-	
-	size_t larger_sad_calc_8_localWorkSize[2] = {32,4};
-	size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32, 
-	  image_height_macroblocks * 4};
-	
-	size_t larger_sad_calc_16_localWorkSize[2] = {32, 1};
-	size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32,
-	  image_height_macroblocks * 1};
-	
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    unsigned mb_sad_calcDFG = __visc__node(mb_sad_calc, 2, 2,
-                                           mb_sad_calc_localWorkSize[0],
-                                           mb_sad_calc_localWorkSize[1],
-                                           mb_sad_calc_globalWorkSize[0],
-                                           mb_sad_calc_globalWorkSize[1],
-                                           8,
-                                           /*(unsigned short) */tmpZero, tmpZero_size,
-                                           /*short */cur_image->data, (size_t) image_size_bytes,
-                                           image_width_macroblocks,
-                                           image_height_macroblocks,
-                                           ref_image->data,
-                                           (size_t) (ref_image->width *
-                                                     ref_image->height *
-                                                     sizeof(unsigned short)),
-                                           0);
-    __visc__wait(mb_sad_calcDFG);
-
-    unsigned larger_sad_calc_8DFG = __visc__node(larger_sad_calc_8, 2, 2,
-                                                 larger_sad_calc_8_localWorkSize[0],
-                                                 larger_sad_calc_8_localWorkSize[1],
-                                                 larger_sad_calc_8_globalWorkSize[0],
-                                                 larger_sad_calc_8_globalWorkSize[1],
-                                                 4,
-                                                 /*(unsigned short) */tmpZero, tmpZero_size,
-                                                 image_width_macroblocks,
-                                                 image_height_macroblocks,
-                                                 0);
-    __visc__wait(larger_sad_calc_8DFG);
-
-    unsigned larger_sad_calc_16DFG = __visc__node(larger_sad_calc_16, 2, 2, 
-                                                  larger_sad_calc_16_localWorkSize[0],
-                                                  larger_sad_calc_16_localWorkSize[1],
-                                                  larger_sad_calc_16_globalWorkSize[0],
-                                                  larger_sad_calc_16_globalWorkSize[1],
-                                                  4,
-                                                  /*(unsigned short) */tmpZero, tmpZero_size,
-                                                  image_width_macroblocks,
-                                                  image_height_macroblocks,
-                                                  0);
-    __visc__wait(larger_sad_calc_16DFG);
-	
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    /* Transfer SAD data to the host */    
-/*
-    OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_sads, CL_TRUE, 
-        0, 
-        41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), 
-        sads_computed, 0, NULL, NULL) );
-*/
-
-    free(sads_computed);
-    sads_computed = tmpZero;
-
-    /* Free GPU memory */
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    
-  // ************************************************************************
-  // End GPU Code
-
-  /* Print output */
-  if (params->outFile)
-    {
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      write_sads(params->outFile,
-		 image_width_macroblocks,
-		 image_height_macroblocks,
-		 sads_computed);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-#if 0  /* Debugging */
-  print_test_sads(sads_computed, image_size_macroblocks);
-  write_sads_directly("sad-debug.bin",
-		      ref_image->width / 16, ref_image->height / 16,
-		      sads_computed);
-#endif
-
-  /* Free memory */
-  free(sads_computed);
-  free_image(ref_image);
-  free_image(cur_image);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(params);
-    
-  pb_DestroyTimerSet(&timers);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/sad/src/visc/sad.h b/hpvm/test/parboil/benchmarks/sad/src/visc/sad.h
deleted file mode 100644
index 3374fa0441..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/visc/sad.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Search offsets within 16 pixels of (0,0) */
-#define SEARCH_RANGE 16
-
-/* The total search area is 33 pixels square */
-#define SEARCH_DIMENSION (2*SEARCH_RANGE+1)
-
-/* The total number of search positions is 33^2 */
-#define MAX_POS 1089
-
-/* This is padded to a multiple of 8 when allocating memory */
-#define MAX_POS_PADDED 1096
-
-/* VBSME block indices in the SAD array for different 
- * block sizes.  The index is computed from the
- * image size in macroblocks.  Block sizes are (height, width):
- *  1: 16 by 16 pixels, one block per macroblock
- *  2: 8  by 16 pixels, 2  blocks per macroblock
- *  3: 16 by 8  pixels, 2  blocks per macroblock
- *  4: 8  by 8  pixels, 4  blocks per macroblock
- *  5: 4  by 8  pixels, 8  blocks per macroblock
- *  6: 8  by 4  pixels, 8  blocks per macroblock
- *  7: 4  by 4  pixels, 16 blocks per macroblock
- */
-#define SAD_TYPE_1_IX(image_size) 0
-#define SAD_TYPE_2_IX(image_size) ((image_size)*MAX_POS_PADDED)
-#define SAD_TYPE_3_IX(image_size) ((image_size)*(3*MAX_POS_PADDED))
-#define SAD_TYPE_4_IX(image_size) ((image_size)*(5*MAX_POS_PADDED))
-#define SAD_TYPE_5_IX(image_size) ((image_size)*(9*MAX_POS_PADDED))
-#define SAD_TYPE_6_IX(image_size) ((image_size)*(17*MAX_POS_PADDED))
-#define SAD_TYPE_7_IX(image_size) ((image_size)*(25*MAX_POS_PADDED))
-
-#define SAD_TYPE_IX(n, image_size) \
-  ((n == 1) ? SAD_TYPE_1_IX(image_size) : \
-   ((n == 2) ? SAD_TYPE_2_IX(image_size) : \
-    ((n == 3) ? SAD_TYPE_3_IX(image_size) : \
-     ((n == 4) ? SAD_TYPE_4_IX(image_size) : \
-      ((n == 5) ? SAD_TYPE_5_IX(image_size) : \
-       ((n == 6) ? SAD_TYPE_6_IX(image_size) : \
-        (SAD_TYPE_7_IX(image_size) \
-	 )))))))
-
-#define SAD_TYPE_1_CT 1
-#define SAD_TYPE_2_CT 2
-#define SAD_TYPE_3_CT 2
-#define SAD_TYPE_4_CT 4
-#define SAD_TYPE_5_CT 8
-#define SAD_TYPE_6_CT 8
-#define SAD_TYPE_7_CT 16
-
-#define SAD_TYPE_CT(n) \
-  ((n == 1) ? SAD_TYPE_1_CT : \
-   ((n == 2) ? SAD_TYPE_2_CT : \
-    ((n == 3) ? SAD_TYPE_3_CT : \
-     ((n == 4) ? SAD_TYPE_4_CT : \
-      ((n == 5) ? SAD_TYPE_5_CT : \
-       ((n == 6) ? SAD_TYPE_6_CT : \
-        (SAD_TYPE_7_CT \
-	 )))))))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void sad4_cpu(unsigned short *blk_sad,
-	      unsigned short *frame,
-	      unsigned short *ref,
-	      int mb_width,
-	      int mb_height);
-
-void larger_sads(unsigned short *sads,
-		 int mbs);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sad/src/visc/sad_kernel.cl b/hpvm/test/parboil/benchmarks/sad/src/visc/sad_kernel.cl
deleted file mode 100644
index ab6c5d02d6..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/visc/sad_kernel.cl
+++ /dev/null
@@ -1,339 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* The compute kernel. */
-/* The macros THREADS_W and THREADS_H specify the width and height of the
- * area to be processed by one thread, measured in 4-by-4 pixel blocks.
- * Larger numbers mean more computation per thread block.
- *
- * The macro POS_PER_THREAD specifies the number of search positions for which
- * an SAD is computed.  A larger value indicates more computation per thread,
- * and fewer threads per thread block.  It must be a multiple of 3 and also
- * must be at most 33 because the loop to copy from shared memory uses
- * 32 threads per 4-by-4 pixel block.
- *
- */
- 
-// AMD OpenCL fails UINT_CUDA_V
-#if 1
-  #define SHORT2_V 1
-  #define UINT_CUDA_V 0
-#else
-  #define SHORT2_V 0
-  #define UINT_CUDA_V 1
-#endif
-
-// Either works
-#if 0
-  #define VEC_LOAD 1
-  #define CONSTR_LOAD 0
-#else
-  #define VEC_LOAD 0
-  #define CONSTR_LOAD 1
-#endif
-
-// CAST_STORE is only method that works for all implementations of OpenCL tested
-#if 0
-  #define VEC_STORE 1
-  #define CAST_STORE 0
-  #define SCALAR_STORE 0
-#elif 1
-  #define VEC_STORE 0
-  #define CAST_STORE 1
-  #define SCALAR_STORE 0
-#else
-  #define VEC_STORE 0
-  #define CAST_STORE 0
-  #define SCALAR_STORE 1
-#endif
-
-__kernel void mb_sad_calc(__global unsigned short *blk_sad,
-                            __global unsigned short *frame,
-                            int mb_width,
-                            int mb_height,
-                            __read_only image2d_t img_ref)
-{   
-    __visc__attributes(3, blk_sad, frame, img_ref, 1, blk_sad);
-
-	const sampler_t texSampler =
-	CLK_NORMALIZED_COORDS_FALSE |
-	CLK_ADDRESS_CLAMP_TO_EDGE |
-	CLK_FILTER_NEAREST;
-
-
-  int tx = (get_local_id(0) / CEIL_POS) % THREADS_W;
-  int ty = (get_local_id(0) / CEIL_POS) / THREADS_W;
-  int bx = get_group_id(0);
-  int by = get_group_id(1);
-  int img_width = mb_width*16;
-
-  // Macroblock and sub-block coordinates
-  int mb_x = (tx + bx * THREADS_W) >> 2;
-  int mb_y = (ty + by * THREADS_H) >> 2;
-  int block_x = (tx + bx * THREADS_W) & 0x03;
-  int block_y = (ty + by * THREADS_H) & 0x03;
-
-  // If this thread is assigned to an invalid 4x4 block, do nothing 
-  if ((mb_x < mb_width) && (mb_y < mb_height))
-    {
-      // Pixel offset of the origin of the current 4x4 block
-      int frame_x = ((mb_x << 2) + block_x) << 2;
-      int frame_y = ((mb_y << 2) + block_y) << 2;
-
-      // Origin of the search area for this 4x4 block
-      int ref_x = frame_x - SEARCH_RANGE;
-      int ref_y = frame_y - SEARCH_RANGE;
-
-      // Origin in the current frame for this 4x4 block
-      int cur_o = frame_y * img_width + frame_x;
-
-      int search_pos;
-      int search_pos_base =
-        (get_local_id(0) % CEIL_POS) * POS_PER_THREAD;
-      int search_pos_end = search_pos_base + POS_PER_THREAD;
-
-      // All SADs from this thread are stored in a contiguous chunk
-      // of memory starting at this offset
-      blk_sad += mb_width * mb_height * MAX_POS_PADDED * (9 + 16) +
-        (mb_y * mb_width + mb_x) * MAX_POS_PADDED * 16 +
-        (4 * block_y + block_x) * MAX_POS_PADDED;
-
-      // Don't go past bounds
-      if (search_pos_end > MAX_POS)
-        search_pos_end = MAX_POS;
-
-      // For each search position, within the range allocated to this thread
-      for (search_pos = search_pos_base;
-           search_pos < search_pos_end;
-           search_pos++) {
-        unsigned short sad4x4 = 0;
-        int search_off_x = ref_x + (search_pos % SEARCH_DIMENSION);
-        int search_off_y = ref_y + (search_pos / SEARCH_DIMENSION);
-
-        // 4x4 SAD computation
-        for(int y=0; y<4; y++) {
-          for (int x=0; x<4; x++) {
-          
-          // ([unsigned] short)read_imageui or
-          //                   read_imagei  is required for correct calculation.
-          // Though read_imagei() is shorter, its results are undefined by specification since the input
-          // is an unsigned type, CL_UNSIGNED_INT16
-          
-            sad4x4 += abs((unsigned short)((read_imageui(img_ref, texSampler, (int2)(search_off_x + x, search_off_y + y) )).x) -
-                  frame[cur_o + y * img_width + x]);
-                  
-          }
-        }
-
-        // Save this value into the local SAD array 
-        blk_sad[search_pos] = sad4x4;
-      }
-    }
-
-}
-
-
-//typedef unsigned int uint;
-
-__kernel void larger_sad_calc_8(__global unsigned short *blk_sad,
-				  int mb_width,
-				  int mb_height)
-{
-  __visc__attributes(1, blk_sad, 1, blk_sad);
-
-  int tx = get_local_id(1) & 1;
-  int ty = get_local_id(1) >> 1;
-
-  // Macroblock and sub-block coordinates
-  int mb_x = get_group_id(0);
-  int mb_y = get_group_id(1);
-
-  // Number of macroblocks in a frame
-  int macroblocks = mul24(mb_width, mb_height);
-  int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  __global unsigned short *bi;
-  __global unsigned short *bo_6, *bo_5, *bo_4;
-
-
-  bi = blk_sad    
-    + (mul24(macroblocks, 25) + (ty * 8 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 16;
-
-  // Block type 6: 4x8
-  bo_6 = blk_sad
-    + ((macroblocks << 4) + macroblocks + (ty * 4 + tx * 2)) * MAX_POS_PADDED
-    + macroblock_index * 8;
-
-  if (ty < 100) // always true, but improves register allocation
-    {
-      // Block type 5: 8x4
-      bo_5 = blk_sad
-	+ ((macroblocks << 3) + macroblocks + (ty * 4 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 8;
-
-      // Block type 4: 8x8
-      bo_4 = blk_sad
-	+ ((macroblocks << 2) + macroblocks + (ty * 2 + tx)) * MAX_POS_PADDED
-	+ macroblock_index * 4;
-    }
-
-  for (int search_pos = get_local_id(0); search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-#if SHORT2_V
-  #if VEC_LOAD
-      ushort2 s00 = vload2(search_pos,                    bi);
-      ushort2 s01 = vload2(search_pos+  MAX_POS_PADDED/2, bi);
-      ushort2 s10 = vload2(search_pos+4*MAX_POS_PADDED/2, bi);
-      ushort2 s11 = vload2(search_pos+5*MAX_POS_PADDED/2, bi);
-  #else
-      ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]);
-      ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]);
-      ushort2 s10 = (ushort2) (bi[(search_pos + 4*MAX_POS_PADDED/2)*2], bi[(search_pos + 4*MAX_POS_PADDED/2)*2+1]);
-      ushort2 s11 = (ushort2) (bi[(search_pos + 5*MAX_POS_PADDED/2)*2], bi[(search_pos + 5*MAX_POS_PADDED/2)*2+1]);
-  #endif
-
-  #if VEC_STORE
-      ushort2 s0010 = s00 + s10;
-      ushort2 s0111 = s01 + s11;
-      ushort2 s0001 = s00 + s01;
-      ushort2 s1011 = s10 + s11;
-      ushort2 s00011011 = s0001 + s1011;
-      
-      vstore2(s0010, search_pos, bo_6);
-      vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_6);
-      vstore2(s0001, search_pos, bo_5);
-      vstore2(s1011, search_pos+2*MAX_POS_PADDED/2, bo_5);
-      vstore2(s00011011, search_pos, bo_4);
-  #elif CAST_STORE
-      ((__global ushort2 *)bo_6)[search_pos]                  = s00 + s10;
-      ((__global ushort2 *)bo_6)[search_pos+MAX_POS_PADDED/2] = s01 + s11;
-      ((__global ushort2 *)bo_5)[search_pos]                  = s00 + s01;
-      ((__global ushort2 *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = s10 + s11;
-      ((__global ushort2 *)bo_4)[search_pos]                  = (s00 + s01) + (s10 + s11);
-  #else // SCALAR_STORE
-      bo_6[search_pos*2] = s00.x + s10.x;
-      bo_6[search_pos*2+1] = s00.y + s10.y;
-      bo_6[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x;
-      bo_6[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y;
-      bo_5[search_pos*2] = s00.x + s01.x;
-      bo_5[search_pos*2+1] = s00.y + s01.y;
-      bo_5[(search_pos+2*MAX_POS_PADDED/2)*2] = s10.x + s11.x;
-      bo_5[(search_pos+2*MAX_POS_PADDED/2)*2+1] = s10.y + s11.y;
-      bo_4[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x);
-      bo_4[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y);
-  #endif
-#else // UINT_CUDA_V
-      uint i00 = ((__global uint *)bi)[search_pos];
-      uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((__global uint *)bi)[search_pos + 4*MAX_POS_PADDED/2];
-      uint i11 = ((__global uint *)bi)[search_pos + 5*MAX_POS_PADDED/2];
-
-      ((__global uint *)bo_6)[search_pos]                  = i00 + i10;
-      ((__global uint *)bo_6)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((__global uint *)bo_5)[search_pos]                  = i00 + i01;
-      ((__global uint *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = i10 + i11;
-      ((__global uint *)bo_4)[search_pos]                  = (i00 + i01) + (i10 + i11);
-#endif
-    }
-    
-}
-
-
-
-__kernel void larger_sad_calc_16(__global unsigned short *blk_sad,
-				   int mb_width,
-				   int mb_height)
-{
-  __visc__attributes(1, blk_sad, 1, blk_sad);
-
-  // Macroblock coordinates 
-  int mb_x = get_group_id(0);
-  int mb_y = get_group_id(1);
-
-  // Number of macroblocks in a frame
-  int macroblocks = mul24(mb_width, mb_height) * MAX_POS_PADDED;
-  int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
-
-  __global unsigned short *bi;
-  __global unsigned short *bo_3, *bo_2, *bo_1;
-
-  //bi = blk_sad + macroblocks * 5 + macroblock_index * 4;
-  bi = blk_sad + ((macroblocks + macroblock_index) << 2) + macroblocks;
-
-  // Block type 3: 8x16
-  //bo_3 = blk_sad + macroblocks * 3 + macroblock_index * 2;
-  bo_3 = blk_sad + ((macroblocks + macroblock_index) << 1) + macroblocks;
-
-  // Block type 5: 8x4
-  bo_2 = blk_sad + macroblocks + macroblock_index * 2;
-
-  // Block type 4: 8x8
-  bo_1 = blk_sad + macroblock_index;
-
-  for (int search_pos = get_local_id(0); search_pos < (MAX_POS+1)/2; search_pos += 32)
-    {
-#if SHORT2_V
-  #if VEC_LOAD
-      ushort2 s00 = vload2(search_pos,                    bi);
-      ushort2 s01 = vload2(search_pos+  MAX_POS_PADDED/2, bi);
-      ushort2 s10 = vload2(search_pos+2*MAX_POS_PADDED/2, bi);
-      ushort2 s11 = vload2(search_pos+3*MAX_POS_PADDED/2, bi);
-  #else
-      ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]);
-      ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]);
-      ushort2 s10 = (ushort2) (bi[(search_pos + 2*MAX_POS_PADDED/2)*2], bi[(search_pos + 2*MAX_POS_PADDED/2)*2+1]);
-      ushort2 s11 = (ushort2) (bi[(search_pos + 3*MAX_POS_PADDED/2)*2], bi[(search_pos + 3*MAX_POS_PADDED/2)*2+1]);
-  #endif
-
-  #if VEC_STORE
-      ushort2 s0010 = s00 + s10;
-      ushort2 s0111 = s01 + s11;
-      ushort2 s0001 = s00 + s01;
-      ushort2 s1011 = s10 + s11;
-      ushort2 s00011011 = s0001 + s1011;
-      
-      vstore2(s0010, search_pos, bo_3);
-      vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_3);
-      vstore2(s0001, search_pos, bo_2);
-      vstore2(s1011, search_pos+MAX_POS_PADDED/2, bo_2);
-      vstore2(s00011011, search_pos, bo_1);
-  #elif CAST_STORE
-      ((__global ushort2 *)bo_3)[search_pos]                  = s00 + s10;
-      ((__global ushort2 *)bo_3)[search_pos+MAX_POS_PADDED/2] = s01 + s11;
-      ((__global ushort2 *)bo_2)[search_pos]                  = s00 + s01;
-      ((__global ushort2 *)bo_2)[search_pos+MAX_POS_PADDED/2] = s10 + s11;
-      ((__global ushort2 *)bo_1)[search_pos]                  = (s00 + s01) + (s10 + s11);
-  #else // SCALAR_STORE
-      bo_3[search_pos*2] = s00.x + s10.x;
-      bo_3[search_pos*2+1] = s00.y + s10.y;
-      bo_3[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x;
-      bo_3[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y;
-      bo_2[search_pos*2] = s00.x + s01.x;
-      bo_2[search_pos*2+1] = s00.y + s01.y;
-      bo_2[(search_pos+MAX_POS_PADDED/2)*2] = s10.x + s11.x;
-      bo_2[(search_pos+MAX_POS_PADDED/2)*2+1] = s10.y + s11.y;
-      bo_1[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x);
-      bo_1[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y);
-  #endif
-#else // UINT_CUDA_V
-      uint i00 = ((__global uint *)bi)[search_pos];
-      uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2];
-      uint i10 = ((__global uint *)bi)[search_pos + 2*MAX_POS_PADDED/2];
-      uint i11 = ((__global uint *)bi)[search_pos + 3*MAX_POS_PADDED/2];
-
-      ((__global uint *)bo_3)[search_pos]                  = i00 + i10;
-      ((__global uint *)bo_3)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
-      ((__global uint *)bo_2)[search_pos]                  = i00 + i01;
-      ((__global uint *)bo_2)[search_pos+MAX_POS_PADDED/2] = i10 + i11;
-      ((__global uint *)bo_1)[search_pos]                  = (i00 + i01) + (i10 + i11);
-#endif
-    }
-}
-
-
diff --git a/hpvm/test/parboil/benchmarks/sad/src/visc/sad_kernel.h b/hpvm/test/parboil/benchmarks/sad/src/visc/sad_kernel.h
deleted file mode 100644
index 4fbf23ef45..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/src/visc/sad_kernel.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* Integer ceiling division.  This computes ceil(x / y) */
-#define CEIL(x,y) (((x) + ((y) - 1)) / (y))
-
-/* Fast multiplication by 33 */
-#define TIMES_DIM_POS(x) (((x) << 5) + (x))
-
-/* Amount of dynamically allocated local storage
- * measured in bytes, 2-byte words, and 8-byte words */
-#define SAD_LOC_SIZE_ELEMS (THREADS_W * THREADS_H * MAX_POS_PADDED)
-#define SAD_LOC_SIZE_BYTES (SAD_LOC_SIZE_ELEMS * sizeof(unsigned short))
-#define SAD_LOC_SIZE_8B (SAD_LOC_SIZE_BYTES / sizeof(vec8b))
-
-/* The search position index space is distributed across threads
- * and across time. */
-/* This many search positions are calculated by each thread.
- * Note: the optimized kernel requires that this number is
- * divisible by 3. */
-#define POS_PER_THREAD 18
-
-/* The width and height (in number of 4x4 blocks) of a tile from the
- * current frame that is computed in a single thread block. */
-#define THREADS_W 1
-#define THREADS_H 1
-
-// #define TIMES_THREADS_W(x) (((x) << 1) + (x))
-#define TIMES_THREADS_W(x) ((x) * THREADS_W)
-
-/* This structure is used for vector load/store operations. */
-
-struct vec8b {
-  int fst;
-  int snd;
-} __attribute__ ((aligned(8)));
-
-
-
-/* 4-by-4 SAD computation on the device. */
-/*
-extern "C" __global__ void mb_sad_calc(unsigned short*,
-			    unsigned short*,
-			    int, int);
-*/
-/* A function to get a reference to the "ref" texture, because sharing
- * of textures between files isn't really supported. */
- /*
-texture<unsigned short, 2, cudaReadModeElementType> &get_ref(void);
-
-extern "C" __global__ void larger_sad_calc_8(unsigned short*, int, int);
-extern "C" __global__ void larger_sad_calc_16(unsigned short*, int, int);*/
diff --git a/hpvm/test/parboil/benchmarks/sad/tools/compare-output b/hpvm/test/parboil/benchmarks/sad/tools/compare-output
deleted file mode 100755
index ef03bef2e7..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/tools/compare-output
+++ /dev/null
@@ -1,38 +0,0 @@
-#! /usr/bin/env python
-
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-import sys
-sys.path.insert(0, '../../common/python')
-
-import filecompare as fc
-import binaryfilecompare as bfc
-
-# The block type of a sub-block.  See sad.h for a list of block types.
-sub_block_type = [1,
-	2,2,
-	3,3,
-	4,4,4,4,
-	5,5,5,5,5,5,5,5,
-	6,6,6,6,6,6,6,6,
-	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-
-def compare_macroblock(n):
-	def compare_subblocks(blocknum):
-		msg = ("Mismatched SAD in macroblock %d, block type %d\n" %
-			(n, sub_block_type[blocknum]))
-		return fc.Compare(bfc.many_uint16(1089), message=msg)
-
-	return fc.For_(41, compare_subblocks)
-
-def compare_frame_SADs(num_macroblocks):
-	return fc.For_(num_macroblocks, compare_macroblock)
-
-comparison = fc.Then(
-	fc.Bind(fc.Compare(bfc.uint32),
-		lambda n: fc.Then(
-			fc.Compare(bfc.uint32),
-			compare_frame_SADs(n))),
-	fc.Compare(bfc.eof))
-
-fc.default_main(comparison)
diff --git a/hpvm/test/parboil/benchmarks/sad/tools/compute-one-sad.py b/hpvm/test/parboil/benchmarks/sad/tools/compute-one-sad.py
deleted file mode 100755
index 555d262519..0000000000
--- a/hpvm/test/parboil/benchmarks/sad/tools/compute-one-sad.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#! /usr/bin/env python
-
-# (c) Copyright 2007 The Board of Trustees of the University of Illinois.
-
-# This tool computes the 4x4 SADs for the first macroblock in an image
-# for one search offset and prints it.  It's useful primarily for
-# diagnosing incorrect program output.
-
-import binaryfilecompare as bfc
-
-class Img:
-    def __init__(self, (w, h), d):
-        self.width = w
-	self.height = h
-	self.data = d
-
-def load_image(f):
-    width = bfc.uint16(f)
-    height = bfc.uint16(f)
-    lines = [bfc.many_uint16(width)(f) for n in range(height)]
-    return Img((width, height), lines)
-
-frm = load_image(file('input/default/frame.bin'))
-ref = load_image(file('input/default/reference.bin'))
-
-def getpix(img, x, y):
-    if x < 0: x = 0
-    if x >= img.width: x = img.width - 1
-    if y < 0: y = 0
-    if y >= img.height: y = img.height - 1
-    return img.data[y][x]
-    
-def pixdif(x, y, xoff, yoff):
-    return abs(getpix(frm,x,y) - getpix(ref,x+xoff,y+yoff))
-
-# Compute 4x4 SADs for this search offset in the reference frame
-searchoff = (1,1)
-
-print "Search position: %d" % (searchoff[0] + 16 + 33 * (searchoff[1] + 16))
-
-print [[sum([sum([pixdif(4*bx+x, 4*by+y, searchoff[0], searchoff[1])
-                  for x in range(4)])
-             for y in range(4)])
-        for bx in range(4)]
-       for by in range(4)]
diff --git a/hpvm/test/parboil/benchmarks/saxpy_test/Makefile b/hpvm/test/parboil/benchmarks/saxpy_test/Makefile
deleted file mode 100644
index 99aad591f6..0000000000
--- a/hpvm/test/parboil/benchmarks/saxpy_test/Makefile
+++ /dev/null
@@ -1,177 +0,0 @@
-# This Makefile compiles the HPVM-CAVA pilot project. 
-# It builds HPVM-related dependencies, then the native camera pipeline ISP code.
-#
-# Paths to some dependencies (e.g., HPVM, LLVM) must exist in Makefile.config,
-# which can be copied from Makefile.config.example for a start.
-
-CONFIG_FILE := Makefile.config
-
-ifeq ($(wildcard $(CONFIG_FILE)),)
-    $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example)
-endif
-include $(CONFIG_FILE)
-
-# Compiler Flags
-
-DLEVEL ?= 0
-LFLAGS += -lm -lrt
-
-# Build dirs
-ifeq ($(VERSION),)
-    VERSION = IR_modules
-endif
-SRC_DIR = src/
-CAM_PIPE_SRC_DIR = $(SRC_DIR)
-BUILD_DIR = build/$(TARGET)_$(VERSION)
-CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
-
-# Source files for the frontend camera pipeline
-COMMON_SRCS = main.c \
-
-
-#CAM_PIPE_SRCS = load_cam_model.c \
-        cam_pipe_utility.c \
-	dma_interface.c # FIXME: This is a hack until external C++ files can be included in build.
-
-
-# NOTE: We have temporarily removed gem5 and other dependencies for simplicity.
-SRCS = $(COMMON_SRCS)
-
-# NATIVE_FULL_PATH_SRCS contains all the full path source files for the camera vision pipeline.
-NATIVE_FULL_PATH_SRCS = $(patsubst %, $(SRC_DIR)/%, $(COMMON_SRCS))
-NATIVE_FULL_PATH_SRCS += $(patsubst %, $(CAM_PIPE_SRC_DIR)/%, $(CAM_PIPE_SRCS))
-
-INCLUDES +=  -I$(SRC_DIR) \
-            -I$(CAM_PIPE_SRC_DIR) \
-
-ifneq ($(CONFUSE_ROOT),)
-INCLUDES += -I$(CONFUSE_ROOT)/include
-LFLAGS += -L$(CONFUSE_ROOT)/lib
-endif
-
-EXE = vec_add
-
-CAM_CFLAGS += -mf16c -flax-vector-conversions
-LFLAGS += -pthread
-
-
-## BEGIN HPVM MAKEFILE
-LANGUAGE=visc
-#SRCDIR_OBJS= load_cam_model.ll cam_pipe_utility.ll dma_interface.ll utility.ll
-#OBJS_SRC=src/cam_pipe.c src/pipe_stages.c src/load_cam_model.c src/cam_pipe_utility.c src/dma_interface.c src/utility.c
-VISC_OBJS=main.visc.ll
-APP = $(EXE)
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3
-APP_CXXFLAGS=-ffast-math -O0 -I/opt/opencv/include
-APP_LDFLAGS=$(LFLAGS)
-OPT_FLAGS = -tti -targetlibinfo -tbaa -scoped-noalias -assumption-cache-tracker -profile-summary-info -forceattrs -inferattrs -ipsccp -globalopt -domtree -mem2reg -deadargelim -domtree -basicaa -aa -simplifycfg -pgo-icall-prom -basiccg -globals-aa -prune-eh -always-inline -functionattrs -domtree -sroa -early-cse -lazy-value-info -jump-threading -correlated-propagation -simplifycfg -domtree -basicaa -aa -libcalls-shrinkwrap -tailcallelim -simplifycfg -reassociate -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -licm -loop-unswitch -simplifycfg -domtree -basicaa -aa -loops -loop-simplify -lcssa-verification -lcssa -scalar-evolution -indvars -loop-idiom -loop-deletion -memdep -memcpyopt -sccp -domtree -demanded-bits -bdce -basicaa -aa -lazy-value-info -jump-threading -correlated-propagation -domtree -basicaa -aa -memdep -dse -loops -loop-simplify -lcssa-verification -lcssa -aa -scalar-evolution -licm -postdomtree -adce -simplifycfg -domtree -basicaa -aa -barrier -basiccg -rpo-functionattrs -globals-aa -float2int -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -loop-accesses -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-distribute -loop-simplify -lcssa-verification -lcssa -branch-prob -block-freq -scalar-evolution -basicaa -aa -loop-accesses -demanded-bits -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-vectorize -loop-simplify -scalar-evolution -aa -loop-accesses -loop-load-elim -basicaa -aa -simplifycfg -domtree -basicaa -aa -loops -scalar-evolution -alignment-from-assumptions -strip-dead-prototypes -domtree -loops -branch-prob -block-freq -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -branch-prob -block-freq -loop-sink -instsimplify 
-
-CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
-OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
-CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
-LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
-
-LIBCLC_LIB_PATH = $(LLVM_SRC_ROOT)/../libclc/built_libs
-#VISC_RT_PATH = $(LLVM_SRC_ROOT)/projects/visc-rt
-VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt
-
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll
-LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx64--nvidiacl.bc
-
-LLVM_34_AS = $(LLVM_34_ROOT)/build/bin/llvm-as
-
-TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
-KERNEL_GEN_FLAGS = -O3 -target nvptx64-nvidia-nvcl
-
-
-DEVICE = CPU_TARGET
-VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
-VISC_OPTFLAGS += -visc-timers-x86
-TESTGEN_OPTFLAGS += -visc-timers-gen
-
-CFLAGS += -DDEVICE=$(DEVICE)
-CXXFLAGS += -DDEVICE=$(DEVICE)
-
-
-# Add BUILDDIR as a prefix to each element of $1
-INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
-
-PYTHON_LLVM_40_34 = ../llvm-40-34.py
-
-.PRECIOUS: $(BUILD_DIR)/%.ll
-
-OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
-KERNEL = $(TEST_OBJS).kernels.ll
-
-ifeq ($(TARGET),x86)
-  SPIR_ASSEMBLY = $(TEST_OBJS).kernels.bc
-else ifeq ($(TARGET),seq)
-else ifeq ($(TARGET),fpga)
-  AOC_CL = $(TEST_OBJS).kernels.cl
-  AOCL_ASSEMBLY = $(TEST_OBJS).kernels.aocx
-  BOARD = a10gx
-  ifeq ($(EMULATION),1)
-    EXE = cava-visc-emu
-    AOC_EMU = -march=emulator
-    BUILD_DIR = build/$(TARGET)-emu
-  endif
-else
-  KERNEL_LINKED = $(BUILD_DIR)/$(APP).kernels.linked.ll
-  PTX_ASSEMBLY = $(TEST_OBJS).nvptx.s
-endif
-
-HOST_LINKED = $(BUILD_DIR)/$(APP).linked.ll
-HOST = $(BUILD_DIR)/$(APP).host.ll
-
-ifeq ($(OPENCL_PATH),)
-FAILSAFE=no_opencl
-else 
-FAILSAFE=
-endif
-
-# Targets
-default: $(FAILSAFE) $(BUILD_DIR) $(EXE)
-#default: $(FAILSAFE) $(BUILD_DIR) $(PTX_ASSEMBLY) $(SPIR_ASSEMBLY) $(AOC_CL) $(AOCL_ASSEMBLY) $(EXE)
-
-$(PTX_ASSEMBLY) : $(KERNEL_LINKED)
-	$(CC) $(KERNEL_GEN_FLAGS) -S $< -o $@
-
-$(KERNEL_LINKED) : $(KERNEL)
-	$(LLVM_LINK) $(LIBCLC_NVPTX_LIB) -S $< -o $@
-
-$(SPIR_ASSEMBLY) : $(KERNEL)
-	python $(PYTHON_LLVM_40_34) $< $(BUILD_DIR)/kernel_34.ll
-	$(LLVM_34_AS) $(BUILD_DIR)/kernel_34.ll -o $@
-
-$(AOCL_ASSEMBLY) : $(AOC_CL)
-	aoc --report $(AOC_EMU) $(AOC_CL) -o $(AOCL_ASSEMBLY) -board=$(BOARD)
-
-$(AOC_CL) : $(KERNEL)
-	llvm-cbe --debug $(KERNEL)
-
-$(EXE) : $(HOST_LINKED)
-	$(CXX) -O3 $(LDFLAGS) $< -o $@
-
-$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
-	$(LLVM_LINK) $^ -S -o $@
-
-$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp
-	make -C $(LLVM_LIB_PATH)
-
-$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
-	$(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
-
-$(BUILD_DIR):
-	mkdir -p $(BUILD_DIR)
-
-$(BUILD_DIR)/%.ll : $(SRC_DIR)/%.c
-	$(CC) $(OBJS_CFLAGS) -emit-llvm -S -o $@ $<
-
-$(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c
-	$(CC) $(CFLAGS) -emit-llvm -S -o $@ $<
-
-$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
-	$(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@
-
diff --git a/hpvm/test/parboil/benchmarks/saxpy_test/src/defs.h b/hpvm/test/parboil/benchmarks/saxpy_test/src/defs.h
deleted file mode 100644
index ccc8acc857..0000000000
--- a/hpvm/test/parboil/benchmarks/saxpy_test/src/defs.h
+++ /dev/null
@@ -1,224 +0,0 @@
-#ifndef _COMMON_DEFS_H_
-#define _COMMON_DEFS_H_
-
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-typedef unsigned long uint64_t;
-
-#define CACHELINE_SIZE 64
-
-// Debugging message macros.
-#if DEBUG_LEVEL >= 1
-  #define INFO_MSG(args...) printf(args)
-
-  #if DEBUG_LEVEL >= 2
-    #define PRINT_MSG(args...) printf(args)
-    #define PRINT_DEBUG(hid, rows, cols, num_cols)                                 \
-        print_debug(hid, rows, cols, num_cols)
-    #define PRINT_DEBUG4D(hid, rows, cols, height)                                 \
-        print_debug4d(hid, rows, cols, height)
-    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)                       \
-        print_debug4d_fp16(hid, num, height, rows, cols)
-
-    #if DEBUG_LEVEL >= 3
-      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)                               \
-          print_debug(hid, rows, cols, num_cols)
-      #define PRINT_DEBUG4D_V(hid, rows, cols, height)                               \
-          print_debug4d(hid, rows, cols, height)
-      #define PRINT_MSG_V(args...) printf(args)
-    #else
-      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)
-      #define PRINT_DEBUG4D_V(hid, rows, cols, height)
-      #define PRINT_MSG_V(args...)
-    #endif
-  #else
-    #define PRINT_MSG(args...)
-    #define PRINT_DEBUG(hid, rows, cols, num_cols)
-    #define PRINT_DEBUG4D(hid, rows, cols, height)
-    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
-    #define PRINT_DEBUG_V(hid, rows, cols, height)
-    #define PRINT_DEBUG4D_V(hid, rows, cols, height)
-    #define PRINT_MSG_V(args...)
-  #endif
-#else
-  #define INFO_MSG(args...)
-  #define PRINT_DEBUG(hid, rows, cols, num_cols)
-  #define PRINT_DEBUG4D(hid, rows, cols, height)
-  #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
-  #define PRINT_MSG(args...)
-  #define PRINT_DEBUG_V(hid, rows, cols, height)
-  #define PRINT_DEBUG4D_V(hid, rows, cols, height)
-  #define PRINT_MSG_V(args...)
-#endif
-
-#define STRING(arg) #arg
-
-// This is to avoid a ton of spurious unused variable warnings when
-// we're not building for gem5.
-#define UNUSED(x) (void)(x)
-
-// Macros for computing the maximum of a group of elements.
-//
-// Why macros and not functions (or a loop)? A loop takes O(n) cycles to
-// compute the maximum, when it could be done in O(log n) time with a tree
-// based implementation. But Aladdin regards function calls as a hard
-// dependency that it does not optimize across, so we would not get the
-// parallelism we expect from the tree. Thus, these must be macros.
-//
-// I've only implemented a few of these. These are only meant for the pooling
-// layers, and we shouldn't need more than a 3x3 pooling layer anyways.
-#define max2(A, B) (((A) > (B)) ? (A) : (B))
-#define max3(e0, e1, e2) max2(max2(e0, e1), e2)
-#define max4(e0, e1, e2, e3) max2(max2(e0, e1), max2(e2, e3))
-#define max8(e0, e1, e2, e3, e4, e5, e6, e7)                                   \
-    max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7))
-#define max9(e0, e1, e2, e3, e4, e5, e6, e7, e8)                               \
-    max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8)
-
-#define min2(A, B) (((A) < (B)) ? (A) : (B))
-
-#define FRAC_CEIL(A, B) ((A) / (B) + ((A) % (B) != 0))
-// Convenience macros to switch between invoking an accelerator (if building a
-// binary for gem5) or just calling the kernel function in software.
-//
-// Usage:
-//
-//  These macros expand differently based on whether the GEM5_HARNESS macro is
-//  defined. If so, then this binary is meant to be run under gem5, invoking
-//  accelerators; if not, this binary should run the pure software version of
-//  the accelerated kernels.
-//
-//  If GEM5_HARNESS is defined:
-//
-//     MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize)
-//        ===>   mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr, mySize)
-//
-//     INVOKE_KERNEL(myReqCode, kernelFuncName, args...)
-//        ===>   invokeAcceleratorAndBlock(myReqCode)
-//
-//  Otherwise:
-//     MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize)
-//        expands to nothing
-//
-//     INVOKE_KERNEL(myReqCode, kernelFuncName, args...)
-//        ===>  kernelFuncName(args)
-//
-#ifdef GEM5_HARNESS
-
-#define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
-    mapArrayToAccelerator(req_code, name, base_addr, size)
-#define INVOKE_KERNEL(req_code, kernel_ptr, args...)                           \
-    do {                                                                       \
-        UNUSED(kernel_ptr);                                                    \
-        invokeAcceleratorAndBlock(req_code);                                   \
-    } while (0)
-#define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
-    do {                                                                       \
-        UNUSED(kernel_ptr);                                                    \
-        invokeAcceleratorAndReturn2(req_code, finish_flag);                    \
-    } while (0)
-
-#define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, false, false);       \
-    } while (0)
-#define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, true, false);        \
-    } while (0)
-#define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, false, true);        \
-    } while (0)
-#define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, true, true);         \
-    } while (0)
-
-#else
-
-#define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
-    do {                                                                       \
-        INFO_MSG("Mapping array %s @ %p, size %d.\n",                          \
-                 name, (void*)base_addr, (int)(size));                         \
-        UNUSED(req_code);                                                      \
-        UNUSED(name);                                                          \
-        UNUSED(base_addr);                                                     \
-        UNUSED(size);                                                          \
-    } while (0)
-#define INVOKE_KERNEL(req_code, kernel_ptr, args...) kernel_ptr(args)
-#define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
-    kernel_ptr(args)
-#define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
-#define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
-#define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
-#define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
-
-#endif
-
-// Simplified version of MAP_ARRAY_TO_ACCEL.
-//
-// This assumes that the current name of the base pointer is also the name of
-// the array in the top level function of the dynamic trace. THIS IS VERY
-// IMPORTANT - if the argument passed to a top level function has been renamed in
-// the function, then this WILL NOT WORK!
-//
-// MAP_ARRAY(myReqCode, myArray, mySize)
-//    ===>   MAP_ARRAY_TO_ACCEL(myReqCode, "myArray", myArray, mySize)
-#define MAP_ARRAY(req_code, name_and_base_addr, size)                          \
-    MAP_ARRAY_TO_ACCEL(                                                        \
-            req_code, STRING(name_and_base_addr), name_and_base_addr, size)
-
-// Use these convenience macros to cast a raw pointer into a multidimensional
-// variable-length array, which lets us use [] notation inside of the ugly
-// sub2ind syntax!
-//
-// Usage:
-//   If we have an array like array[5][4]:
-//      ARRAY_2D(TYPE, output_name, array, 4);
-//
-//   If we have an array like array[5][4][3]:
-//      ARRAY_3D(TYPE, output_name, array, 4, 3);
-//
-//   If we have an array like array[5][4][3][2]
-//      ARRAY_4D(TYPE, output_name, array, 4, 3, 2);
-//
-//   And so on...
-#define ARRAY_1D(TYPE, output_array_name, input_array_name)                    \
-    TYPE* output_array_name = (TYPE*)input_array_name
-
-#define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1)             \
-    TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
-
-#define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2)      \
-    TYPE(*output_array_name)[DIM_1][DIM_2] =                                   \
-        (TYPE(*)[DIM_1][DIM_2])input_array_name
-
-#define ARRAY_4D(                                                              \
-    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3)            \
-        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3] =                        \
-            (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name
-
-#define ARRAY_5D(                                                              \
-    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3, DIM_4)     \
-        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3][DIM_4] =                 \
-            (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/saxpy_test/src/main.c b/hpvm/test/parboil/benchmarks/saxpy_test/src/main.c
deleted file mode 100644
index ef4e620be0..0000000000
--- a/hpvm/test/parboil/benchmarks/saxpy_test/src/main.c
+++ /dev/null
@@ -1,135 +0,0 @@
-
-#include <stdlib.h>
-//#include "utility.h"
-#include "visc.h"
-#include "defs.h"
-
-
-typedef struct __attribute__((__packed__)) {
-    float* input; size_t bytes_input;
-    float* result; size_t bytes_result;
-} 
-RootIn;
-
-
-
-void scale_values(float* input, size_t num_elems) {
-
-  __visc__hint(DEVICE);
-  __visc__attributes(1, input, 1, input);
-  
-  for (int ind = 0; ind < num_elems; ind++){
-    input[ind] = input[ind] * 2.0;
-  }
-  
-  __visc__return(1, num_elems);
-}
-
-
-
-
-void graphRoot(/*0*/ float* input, /*1*/ size_t bytes_input, 
-               /*2*/ float* result, /*3*/ size_t bytes_result) {
-
-  //Specifies compilation target for current node
-  __visc__hint(CPU_TARGET);
-
-  __visc__attributes(2, input, result, 2, input, result);
-
-  // Create an 0D (specified by 1st argument) HPVM node - so a single node
-  // associated with node function ---_fxp_wrapper
-
-  void* scaleNode = __visc__createNodeND(0, scale_values);
-    
-  // BindIn binds inputs of current node with specified node
-  // - destination node
-  // - argument position in argument list of function of source node
-  // - argument position in argument list of function of destination node
-  // - streaming (1) or non-streaming (0)
-
-  // Edge transfers data between nodes within the same level of hierarchy.
-  // - source and destination dataflow nodes
-  // - edge type, all-all (1) or one-one(0)
-  // - source position (in output struct of source node)
-  // - destination position (in argument list of destination node)
-  // - streaming (1) or non-streaming (0)
-
-  // scale_fxp inputs
-  __visc__bindIn(scaleNode, 0, 0, 0); // input -> ScNode:input
-  __visc__bindIn(scaleNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input
-    
-  // Similar to bindIn, but for the output. Output of a node is a struct, and
-  // we consider the fields in increasing ordering.
-  __visc__bindOut(scaleNode, 0, 0, 0);
-    
-}
-
-
-
-
-
-int main(int argc, char* argv[]) {
-
-    size_t input_size = 100;
-    size_t result_size = 100;
-
-    size_t input_bytes = input_size * sizeof(float);
-    size_t result_bytes = result_size * sizeof(float);
-
-    // This is host_input in cam_pipe()
-    float* input = (float*) malloc(input_bytes);
-    for(unsigned int i = 0; i < input_size; i++){
-      input[i] = 1.0;
-    }
-    // This is host_result in cam_pipe()
-    float* result = (float*) malloc(result_bytes);
-
-   
-    __visc__init();
-
-    RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn));
-
-    // Set up HPVM DFG inputs in the rootArgs struct.
-    rootArgs->input = input;
-    rootArgs->bytes_input = input_size;
-
-    printf("input = %d input_bytes = %d \n", input, input_bytes);
-    
-    rootArgs->result = result;
-    rootArgs->bytes_result = result_size;
-
-  
-    llvm_visc_track_mem(input, input_bytes);
-    llvm_visc_track_mem(result, result_bytes);
-
-
-    void* testDFG = __visc__launch(0, graphRoot, (void*) rootArgs);
-    __visc__wait(testDFG);
-
-
-    printf("input = %d \n", input);
-    
-    llvm_visc_request_mem(input, input_bytes);
-    //llvm_visc_request_mem(result, result_bytes);
-
-    printf("requested mem \n");
-
-    for(unsigned int i = 0; i < input_size; i++){
-      printf("input[%d] = %f \n", i, input[i]);
-    }
-
-    //llvm_visc_untrack_mem(input);
-    //llvm_visc_untrack_mem(result);
-
-    printf ("untracked mem \n");
-    
-    __visc__cleanup();
-
-    printf ("cleaned up visc");
-
-    return 0;
-}
-
-
-
-    
diff --git a/hpvm/test/parboil/benchmarks/saxpy_test/src/visc.h b/hpvm/test/parboil/benchmarks/saxpy_test/src/visc.h
deleted file mode 100644
index a263e35252..0000000000
--- a/hpvm/test/parboil/benchmarks/saxpy_test/src/visc.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef DEVICE
-#define DEVICE GPU_TARGET
-#endif
-
-#include "llvm/SupportVISC/VISCHint.h"
-
-#ifdef __cplusplus
-extern "C" {
-void __visc__hint(visc::Target);
-//void __visc__wait(void*);
-#else
-void __visc__hint(enum Target);
-//void __visc__wait(unsigned);
-#endif
-
-#ifdef __cplusplus
-//void* __visc__node(...);
-//void* __visc__createNode(...);
-//void* __visc__createNode1D(...);
-//void* __visc__createNode2D(...);
-//void* __visc__createNode3D(...);
-//void __visc__return(...);
-#endif
-
-void* __visc__createNodeND(unsigned,...);
-void __visc__return(unsigned, ...);
-
-void __visc__attributes(unsigned, ...);
-void __visc__init();
-void __visc__cleanup();
-
-void __visc__bindIn(void*, unsigned, unsigned, unsigned);
-void __visc__bindOut(void*, unsigned, unsigned, unsigned);
-void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned);
-void __visc__push(void*, void*);
-void* __visc__pop(void*);
-void* __visc__launch(unsigned, ...);
-void __visc__wait(void*);
-
-void* __visc__getNode();
-void* __visc__getParentNode(void*);
-void __visc__barrier();
-void* __visc__malloc(long);
-long __visc__getNodeInstanceID_x(void*);
-long __visc__getNodeInstanceID_y(void*);
-long __visc__getNodeInstanceID_z(void*);
-long __visc__getNumNodeInstances_x(void*);
-long __visc__getNumNodeInstances_y(void*);
-long __visc__getNumNodeInstances_z(void*);
-
-// Atomic
-// signed int
-int __visc__atomic_cmpxchg(int*, int, int);
-int __visc__atomic_add(int*, int);
-int __visc__atomic_sub(int*, int);
-int __visc__atomic_xchg(int*, int);
-int __visc__atomic_inc(int*);
-int __visc__atomic_dec(int*);
-int __visc__atomic_min(int*, int);
-int __visc__atomic_max(int*, int);
-int __visc__atomic_umax(int*, int);
-int __visc__atomic_umin(int*, int);
-int __visc__atomic_and(int*, int);
-int __visc__atomic_or(int*, int);
-int __visc__atomic_xor(int*, int);
-
-// Special Func
-float __visc__floor(float);
-float __visc__rsqrt(float);
-float __visc__sqrt(float);
-float __visc__sin(float);
-float __visc__cos(float);
-// unsigned int
-//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
-//unsigned __visc__atomic_add(unsigned*, unsigned);
-//unsigned __visc__atomic_sub(unsigned*, unsigned);
-//unsigned __visc__atomic_xchg(unsigned*, unsigned);
-//unsigned __visc__atomic_inc(unsigned*);
-//unsigned __visc__atomic_dec(unsigned*);
-//unsigned __visc__atomic_min(unsigned*, unsigned);
-//unsigned __visc__atomic_max(unsigned*, unsigned);
-//unsigned __visc__atomic_and(unsigned*, unsigned);
-//unsigned __visc__atomic_or(unsigned*, unsigned);
-//unsigned __visc__atomic_xor(unsigned*, unsigned);
-
-
-#include <unistd.h>
-
-long get_global_id(int);
-long get_group_id(int);
-long get_local_id(int);
-long get_local_size(int);
-
-
-void llvm_visc_track_mem(void*, size_t);
-void llvm_visc_untrack_mem(void*);
-void llvm_visc_request_mem(void*, size_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-- 
GitLab