Removed unused versions of benchmarks

ec3e7deb · Yifan Zhao · 20a3c17f · 20a3c17f · 20a3c17f · 20a3c17f
Commit ec3e7deb authored 5 years ago by Yifan Zhao
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-# Per-variant build settings. NOTE(review): presumably read by the shared
-# Parboil/HPVM build rules that include this file - confirm against them.
-LANGUAGE=hpvm
-# compute_gold.o is commented out, so io.ll is the only extra object listed.
-SRCDIR_OBJS=io.ll #compute_gold.o
-HPVM_OBJS=main.hpvm.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-# The C and C++ flag sets are identical: fast-math at -O3.
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include <fstream>
-#include <iostream>
-#include <vector>
-
-/* Load the entire contents of fileName into a NUL-terminated buffer.
- *
- * Returns NULL (after reporting on stderr) if the file cannot be opened.
- * Otherwise returns a buffer obtained from new[] that the caller owns and
- * must release with delete[]. An empty or unmeasurable file yields "".
- */
-char *readFile(const char *fileName) {
-  std::fstream f(fileName, std::fstream::in);
-  if (!f.good()) {
-    std::cerr << "Error Reading File!!" << std::endl;
-    return NULL;
-  }
-
-  f.seekg(0, std::ios::end);
-  const std::streamoff length = f.tellg(); // -1 when the position is unknown
-  f.seekg(0, std::ios::beg);
-
-  if (length <= 0) {
-    // Array form so that every return path pairs with delete[].
-    char *empty = new char[1];
-    empty[0] = 0;
-    return empty;
-  }
-
-  // One extra byte for the terminator, so the last character of the file is
-  // kept instead of being overwritten.
-  char *buffer = new char[length + 1];
-  f.read(buffer, length);
-  // Terminate after the bytes actually read, so a short read never exposes
-  // uninitialised memory.
-  buffer[f.gcount()] = 0;
-
-  return buffer;
-}
-
-/* Read a matrix stored as text: two integers (written to nr_row and nr_col,
- * in that order) followed by whitespace-separated float values, which are
- * appended to v in file order.
- *
- * Returns false if the file cannot be opened or the two dimensions cannot be
- * parsed; true otherwise. The number of values is not checked against the
- * dimensions.
- */
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                            std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  if (!(f >> nr_row >> nr_col)) {
-    return false;
-  }
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-
-  // Store a value only when its extraction succeeded. Testing the stream
-  // before extracting and popping one element afterwards drops a real value
-  // when the file ends directly after the last number, and pops from an empty
-  // vector when nothing could be read.
-  float data;
-  while (f >> data) {
-    v.push_back(data);
-  }
-  return true;
-}
-
-/* Write a matrix as text: "<nr_row> <nr_col> " followed by every element of
- * v in order, each followed by a single space, and a final newline.
- *
- * Returns false if the file cannot be opened for writing or the stream
- * reports a failure once the data has been flushed; true otherwise.
- */
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
-                             std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Write # of rows and cols
-  f << nr_row << " " << nr_col << " ";
-
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-  // Index with the vector's own size type: an int counter is compared
-  // signed-vs-unsigned with size() and cannot cover very large vectors.
-  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-
-  // Flush so that a failed write is reported to the caller instead of being
-  // lost when the stream is destroyed.
-  f.flush();
-  return f.good();
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * Kernel of dense matrix-matrix multiplication kernel.
- */
-
-/*
- * Work-item body: accumulates TILE_N partial sums of products of A and B
- * elements for one row index, then writes C = C * beta + alpha * sum for
- * each of them. TILE_N, TILE_M and TILE_TB_HEIGHT are not defined in this
- * file and must be supplied when the kernel is built.
- */
-__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
-{
-    // Per-work-item accumulators, one per output column of the tile.
-    float acc[TILE_N];
-    for (int col = 0; col < TILE_N; ++col) {
-        acc[col] = 0.0f;
-    }
-
-    // Flattened local id selects this work-item's row inside the tile.
-    const int local_linear = get_local_id(1)*get_local_size(0)+get_local_id(0);
-    const int row = get_group_id(0) * TILE_M + local_linear;
-
-    for (int step = 0; step < k; step += TILE_TB_HEIGHT) {
-        // First B element used by this work-group for the current step.
-        const int b_start = get_group_id(1) * TILE_N + step * ldb;
-
-        for (int depth = 0; depth < TILE_TB_HEIGHT; ++depth) {
-            const float a_val = A[row + (step+depth)*lda];
-            for (int col = 0; col < TILE_N; ++col) {
-                acc[col] += a_val * B[b_start + depth * ldb + col];
-            }
-        }
-    }
-
-    // Scale the existing C entries by beta and add the scaled sums.
-    const int c_start = ldc * get_group_id(1) * TILE_N + row;
-    for (int col = 0; col < TILE_N; ++col) {
-        const int idx = c_start + col*ldc;
-        C[idx] = C[idx] * beta + alpha * acc[col];
-    }
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <hpvm.h>
-#include <iostream>
-#include <malloc.h>
-#include <math.h>
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <vector>
-
-// I/O routines
-// Defined in io.cc of this benchmark variant; declared here because there is
-// no shared header for them.
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                                   std::vector<float> &v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int,
-                                    std::vector<float> &);
-extern char *readFile(const char *);
-
-// Parameters of tile sizes
-// TILE_M is derived from the other two: 16 * 8 = 128.
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N * TILE_TB_HEIGHT)
-
-// Prints errorMessage and the current line, then exits with status 1, when a
-// variable named clStatus at the expansion site differs from CL_SUCCESS.
-// NOTE(review): neither name is declared in this file and nothing here
-// expands the macro - it looks like a leftover from an OpenCL version.
-#define CHECK_ERROR(errorMessage)                                              \
-  if (clStatus != CL_SUCCESS) {                                                \
-    std::cout << errorMessage << " Error!\n";                                  \
-    std::cout << "Line: " << __LINE__ << "\n";                                 \
-    exit(1);                                                                   \
-  }
-
-/*
- * Kernel body written as a C++ function. Every instance accumulates TILE_N
- * partial sums of products of A and B elements for one row index and then
- * writes C = C * beta + alpha * sum for each of them.
- */
-void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
-               float alpha, float beta) {
-  __hpvm__hint(hpvm::GPU_TARGET);
-  __hpvm__attributes(3, A, B, C, 1, C);
-
-  // Per-instance accumulators, one per output column of the tile.
-  float acc[TILE_N];
-  for (int col = 0; col < TILE_N; ++col) {
-    acc[col] = 0.0f;
-  }
-
-  // Flattened local id selects this instance's row inside the tile.
-  const int local_linear =
-      get_local_id(1) * get_local_size(0) + get_local_id(0);
-  const int row = get_group_id(0) * TILE_M + local_linear;
-
-  for (int step = 0; step < k; step += TILE_TB_HEIGHT) {
-    // First B element used by this group for the current step.
-    const int b_start = get_group_id(1) * TILE_N + step * ldb;
-
-    for (int depth = 0; depth < TILE_TB_HEIGHT; ++depth) {
-      const float a_val = A[row + (step + depth) * lda];
-      for (int col = 0; col < TILE_N; ++col) {
-        acc[col] += a_val * B[b_start + depth * ldb + col];
-      }
-    }
-  }
-
-  // Scale the existing C entries by beta and add the scaled sums.
-  const int c_start = ldc * get_group_id(1) * TILE_N + row;
-  for (int col = 0; col < TILE_N; ++col) {
-    const int idx = c_start + col * ldc;
-    C[idx] = C[idx] * beta + alpha * acc[col];
-  }
-}
-
-/*
- * Host-side entry point. Validates the arguments, then starts mysgemmNT via
- * __hpvm__node and blocks in __hpvm__wait. Only transa 'N'/'n' combined with
- * transb 'T'/'t' is accepted, and m / n must be multiples of TILE_M / TILE_N;
- * otherwise a message is printed to stderr and nothing is launched.
- */
-__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
-                                          int n, int k, float alpha, float *A,
-                                          size_t bytesA, int lda, float *B,
-                                          size_t bytesB, int ldb, float beta,
-                                          float *C, size_t bytesC, int ldc) {
-  const bool a_supported = (transa == 'N') || (transa == 'n');
-  if (!a_supported) {
-    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  const bool b_supported = (transb == 'T') || (transb == 't');
-  if (!b_supported) {
-    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  // The kernel has no remainder handling, so both sizes must be tile-aligned.
-  const bool m_aligned = (m % TILE_M) == 0;
-  const bool n_aligned = (n % TILE_N) == 0;
-  if (!(m_aligned && n_aligned)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of "
-              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
-    return;
-  }
-
-  // Block shape, and the per-dimension totals it is later divided into.
-  unsigned block_dim[2] = {TILE_N, TILE_TB_HEIGHT};
-  unsigned total_dim[2] = {m * block_dim[0] / TILE_M,
-                           n * block_dim[1] / TILE_N};
-
-  unsigned sgemmDFG =
-      __hpvm__node(mysgemmNT, 2, 2, block_dim[0], block_dim[1],
-                   total_dim[0] / block_dim[0], total_dim[1] / block_dim[1], 12,
-                   A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha,
-                   beta, 0);
-  __hpvm__wait(sgemmDFG);
-}
-
-/* Read one matrix file or terminate the program with a diagnostic.
- * Exits with status -1 when the file cannot be read, a dimension is not
- * positive, or fewer than rows * cols values were loaded - later code hands
- * rows * cols floats starting at data.front() to the HPVM runtime. */
-static void loadMatrixOrExit(const char *path, int &rows, int &cols,
-                             std::vector<float> &data) {
-  if (!readColMajorMatrixFile(path, rows, cols, data)) {
-    fprintf(stderr, "Cannot read matrix file %s\n", path);
-    exit(-1);
-  }
-  if (rows <= 0 || cols <= 0 ||
-      data.size() < static_cast<size_t>(rows) * static_cast<size_t>(cols)) {
-    fprintf(stderr, "Malformed matrix file %s\n", path);
-    exit(-1);
-  }
-}
-
-/*
- * Benchmark driver: loads A and B^T, registers the buffers with the HPVM
- * runtime, runs basicSgemm, optionally writes C to params->outFile and prints
- * the timer set plus a GFLOPs figure. Returns 0; exits with -1 on bad
- * command-line input or unreadable matrix files.
- */
-int main(int argc, char *argv[]) {
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  size_t A_sz, B_sz, C_sz;
-  int matArow, matAcol;
-  int matBrow, matBcol;
-  std::vector<float> matA, matBT;
-
-  /* Read command line. Expect 3 inputs: A, B and B^T
-     in column-major layout*/
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
-      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
-    fprintf(stderr, "Expecting three input filenames\n");
-    exit(-1);
-  }
-
-  /* Read in data */
-  // load A
-  loadMatrixOrExit(params->inpFiles[0], matArow, matAcol, matA);
-
-  // load B^T (its file dimensions are B's dimensions swapped)
-  loadMatrixOrExit(params->inpFiles[2], matBcol, matBrow, matBT);
-
-  pb_InitializeTimerSet(&timers);
-  __hpvm__init();
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  // Byte sizes are computed in size_t so that large matrices cannot overflow
-  // an int product before it is widened.
-  const size_t elemsC = static_cast<size_t>(matArow) * matBcol;
-  A_sz = static_cast<size_t>(matArow) * matAcol * sizeof(float);
-  B_sz = static_cast<size_t>(matBrow) * matBcol * sizeof(float);
-
-  // allocate space for C
-  C_sz = elemsC * sizeof(float);
-  std::vector<float> matC(elemsC);
-
-  llvm_hpvm_track_mem(&matA.front(), A_sz);
-  llvm_hpvm_track_mem(&matBT.front(), B_sz);
-  llvm_hpvm_track_mem(&matC.front(), C_sz);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  for (size_t i = 0; i < matC.size(); i++)
-    matC[i] = 0.0f;
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
-             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
-             matArow);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_hpvm_request_mem(&matC.front(), C_sz);
-
-  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
-  llvm_hpvm_untrack_mem(&matA.front());
-  llvm_hpvm_untrack_mem(&matBT.front());
-  llvm_hpvm_untrack_mem(&matC.front());
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  pb_PrintTimerSet(&timers);
-  __hpvm__cleanup();
-
-  if (params->outFile) {
-
-    /* Write C to file */
-    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
-  }
-
-  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
-            << std::endl;
-  pb_FreeParameters(params);
-
-  return 0;
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-# Per-variant build settings. NOTE(review): presumably read by the shared
-# Parboil/HPVM build rules that include this file - confirm against them.
-LANGUAGE=hpvm
-# compute_gold.o is commented out, so io.ll is the only extra object listed.
-SRCDIR_OBJS=io.ll #compute_gold.o
-HPVM_OBJS=main.hpvm.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-# The C and C++ flag sets are identical: fast-math at -O3.
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-# Extra optimiser options set only for this variant (unrolling and SROA).
-APP_OPTFLAGS=-unroll-threshold=300 -loop-unroll -sroa
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include <fstream>
-#include <iostream>
-#include <vector>
-
-/* Load the entire contents of fileName into a NUL-terminated buffer.
- *
- * Returns NULL (after reporting on stderr) if the file cannot be opened.
- * Otherwise returns a buffer obtained from new[] that the caller owns and
- * must release with delete[]. An empty or unmeasurable file yields "".
- */
-char *readFile(const char *fileName) {
-  std::fstream f(fileName, std::fstream::in);
-  if (!f.good()) {
-    std::cerr << "Error Reading File!!" << std::endl;
-    return NULL;
-  }
-
-  f.seekg(0, std::ios::end);
-  const std::streamoff length = f.tellg(); // -1 when the position is unknown
-  f.seekg(0, std::ios::beg);
-
-  if (length <= 0) {
-    // Array form so that every return path pairs with delete[].
-    char *empty = new char[1];
-    empty[0] = 0;
-    return empty;
-  }
-
-  // One extra byte for the terminator, so the last character of the file is
-  // kept instead of being overwritten.
-  char *buffer = new char[length + 1];
-  f.read(buffer, length);
-  // Terminate after the bytes actually read, so a short read never exposes
-  // uninitialised memory.
-  buffer[f.gcount()] = 0;
-
-  return buffer;
-}
-
-/* Read a matrix stored as text: two integers (written to nr_row and nr_col,
- * in that order) followed by whitespace-separated float values, which are
- * appended to v in file order.
- *
- * Returns false if the file cannot be opened or the two dimensions cannot be
- * parsed; true otherwise. The number of values is not checked against the
- * dimensions.
- */
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                            std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  if (!(f >> nr_row >> nr_col)) {
-    return false;
-  }
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-
-  // Store a value only when its extraction succeeded. Testing the stream
-  // before extracting and popping one element afterwards drops a real value
-  // when the file ends directly after the last number, and pops from an empty
-  // vector when nothing could be read.
-  float data;
-  while (f >> data) {
-    v.push_back(data);
-  }
-  return true;
-}
-
-/* Write a matrix as text: "<nr_row> <nr_col> " followed by every element of
- * v in order, each followed by a single space, and a final newline.
- *
- * Returns false if the file cannot be opened for writing or the stream
- * reports a failure once the data has been flushed; true otherwise.
- */
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
-                             std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Write # of rows and cols
-  f << nr_row << " " << nr_col << " ";
-
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-  // Index with the vector's own size type: an int counter is compared
-  // signed-vs-unsigned with size() and cannot cover very large vectors.
-  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-
-  // Flush so that a failed write is reported to the caller instead of being
-  // lost when the stream is destroyed.
-  f.flush();
-  return f.good();
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <hpvm.h>
-#include <iostream>
-#include <malloc.h>
-#include <math.h>
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <vector>
-
-// I/O routines
-// Defined in io.cc of this benchmark variant; declared here because there is
-// no shared header for them.
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                                   std::vector<float> &v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int,
-                                    std::vector<float> &);
-extern char *readFile(const char *);
-
-// Parameters of tile sizes
-// TILE_M is derived from the other two: 16 * 8 = 128.
-#define TILE_N 16
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N * TILE_TB_HEIGHT)
-
-// Prints errorMessage and the current line, then exits with status 1, when a
-// variable named clStatus at the expansion site differs from CL_SUCCESS.
-// NOTE(review): neither name is declared in this file and nothing here
-// expands the macro - it looks like a leftover from an OpenCL version.
-#define CHECK_ERROR(errorMessage)                                              \
-  if (clStatus != CL_SUCCESS) {                                                \
-    std::cout << errorMessage << " Error!\n";                                  \
-    std::cout << "Line: " << __LINE__ << "\n";                                 \
-    exit(1);                                                                   \
-  }
-
-// Argument record that basicSgemm fills in and passes to __hpvm__launch.
-// The fields appear in the same order as the parameters of SgemmWrapper, and
-// the struct is packed so that no padding is inserted between them.
-struct __attribute__((__packed__)) RootIn {
-  // Matrix A: pointer, size in bytes, leading dimension.
-  float *A;
-  size_t bytesA;
-  int lda;
-  // Matrix B: pointer, size in bytes, leading dimension.
-  float *B;
-  size_t bytesB;
-  int ldb;
-  // Matrix C: pointer, size in bytes, leading dimension.
-  float *C;
-  size_t bytesC;
-  int ldc;
-  // Scalars of the multiplication.
-  int k;
-  float alpha;
-  float beta;
-  // Node-instance counts used when the graph nodes are created.
-  long block_x;
-  long block_y;
-  long grid_x;
-  long grid_y;
-};
-
-// Copies every argument into the identically named field of *args.
-// args must point to a writable RootIn; nothing else is read or allocated.
-void packData(RootIn *args, float *A, size_t bytesA, int lda, float *B,
-              size_t bytesB, int ldb, float *C, size_t bytesC, int ldc, int k,
-              float alpha, float beta, long block_x, long block_y, long grid_x,
-              long grid_y) {
-  RootIn &record = *args;
-
-  // Matrix descriptors.
-  record.A = A;
-  record.bytesA = bytesA;
-  record.lda = lda;
-
-  record.B = B;
-  record.bytesB = bytesB;
-  record.ldb = ldb;
-
-  record.C = C;
-  record.bytesC = bytesC;
-  record.ldc = ldc;
-
-  // Scalars.
-  record.k = k;
-  record.alpha = alpha;
-  record.beta = beta;
-
-  // Launch geometry.
-  record.block_x = block_x;
-  record.block_y = block_y;
-  record.grid_x = grid_x;
-  record.grid_y = grid_y;
-}
-
-// Allocation node: obtains block_x * block_y floats through __hpvm__malloc
-// and returns the pointer together with its size in bytes.
-void Allocation(long block_x, long block_y) {
-  const size_t tile_bytes = block_x * block_y * sizeof(float);
-  void *shB = __hpvm__malloc(tile_bytes);
-  __hpvm__return(2, shB, tile_bytes);
-}
-
-/*
- * Leaf node. On every TILE_TB_HEIGHT-wide step each instance copies one
- * element of B into the shB scratch tile, then accumulates TILE_N partial
- * sums from A and the tile; finally it writes C = C * beta + alpha * sum for
- * its row. bytesA, bytesB, bytesC and bytesshB are not read here.
- */
-void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB,
-               int ldb, float *C, size_t bytesC, int ldc, int k, float alpha,
-               float beta, float *shB, size_t bytesshB) {
-  __hpvm__hint(hpvm::DEVICE);
-
-  __hpvm__attributes(3, A, B, C, 1, C);
-
-  void *self = __hpvm__getNode();
-  void *parent = __hpvm__getParentNode(self);
-
-  // Instance ids of this node and of its parent, plus this node's x extent.
-  const long local_x = __hpvm__getNodeInstanceID_x(self);
-  const long local_y = __hpvm__getNodeInstanceID_y(self);
-
-  const long group_x = __hpvm__getNodeInstanceID_x(parent);
-  const long group_y = __hpvm__getNodeInstanceID_y(parent);
-
-  const long extent_x = __hpvm__getNumNodeInstances_x(self);
-
-  // Per-instance accumulators, one per output column of the tile.
-  float acc[TILE_N];
-  for (int col = 0; col < TILE_N; ++col) {
-    acc[col] = 0.0f;
-  }
-
-  const int local_linear = local_y * extent_x + local_x;
-  const int row = group_x * TILE_M + local_linear;
-  const int b_col = group_y * TILE_N + local_x;
-
-  for (int step = 0; step < k; step += TILE_TB_HEIGHT) {
-    // Each instance fills its own slot, i.e. shB[local_y][local_x].
-    shB[local_y * extent_x + local_x] = B[b_col + (step + local_y) * ldb];
-
-    // NOTE(review): presumably waits until every instance has stored its
-    // slot before the tile is read - confirm __hpvm__barrier semantics.
-    __hpvm__barrier();
-    for (int depth = 0; depth < TILE_TB_HEIGHT; ++depth) {
-      const float a_val = A[row + (step + depth) * lda];
-      for (int col = 0; col < TILE_N; ++col) {
-        // i.e. shB[depth][col]
-        acc[col] += a_val * shB[depth * extent_x + col];
-      }
-    }
-    // Second barrier before the next step overwrites the tile.
-    __hpvm__barrier();
-  }
-
-  // Scale the existing C entries by beta and add the scaled sums.
-  const int c_start = ldc * group_y * TILE_N + row;
-  for (int col = 0; col < TILE_N; ++col) {
-    const int idx = c_start + col * ldc;
-    C[idx] = C[idx] * beta + alpha * acc[col];
-  }
-}
-
-// Work group node for sgemm - Creates allocation node and leaf (work item) node
-// The leaf node is created with block_x by block_y instances; the allocation
-// node is created with the dimension argument 0. grid-level parameters are
-// not part of this signature.
-// NOTE(review): the __hpvm__bindIn arguments appear to be (child node, input
-// index of this function, input index of the child, streaming flag) - confirm
-// against the HPVM-C documentation.
-void SgemmTB(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb,
-             float *C, size_t bytesC, int ldc, int k, float alpha, float beta,
-             long block_x, long block_y) {
-  __hpvm__hint(hpvm::CPU_TARGET);
-  __hpvm__attributes(3, A, B, C, 1, C);
-  void *AllocationNode = __hpvm__createNodeND(0, Allocation);
-  void *SgemmLeafNode = __hpvm__createNodeND(2, SgemmLeaf, block_x, block_y);
-
-  // Bind edges
-  // Inputs 0-11 are forwarded to the same positions of SgemmLeaf.
-  __hpvm__bindIn(SgemmLeafNode, 0, 0, 0);   // Bind A
-  __hpvm__bindIn(SgemmLeafNode, 1, 1, 0);   // Bind bytesA
-  __hpvm__bindIn(SgemmLeafNode, 2, 2, 0);   // Bind lda
-  __hpvm__bindIn(SgemmLeafNode, 3, 3, 0);   // Bind B
-  __hpvm__bindIn(SgemmLeafNode, 4, 4, 0);   // Bind bytesB
-  __hpvm__bindIn(SgemmLeafNode, 5, 5, 0);   // Bind ldb
-  __hpvm__bindIn(SgemmLeafNode, 6, 6, 0);   // Bind C
-  __hpvm__bindIn(SgemmLeafNode, 7, 7, 0);   // Bind bytesC
-  __hpvm__bindIn(SgemmLeafNode, 8, 8, 0);   // Bind ldc
-  __hpvm__bindIn(SgemmLeafNode, 9, 9, 0);   // Bind k
-  __hpvm__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
-  __hpvm__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
-
-  // Inputs 12 and 13 go to positions 0 and 1 of Allocation.
-  __hpvm__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
-  __hpvm__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
-
-  // Create Edges between AllocationNode and SgemmLeafNode
-  // Targets are positions 12 (shB) and 13 (bytesshB) of SgemmLeaf.
-  __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B
-  __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 1, 13,
-               0); // Edge bytes_local_B
-}
-
-// Root node for sgemm - Creates work group node
-// The work group node (SgemmTB) is created with grid_x by grid_y instances.
-// Inputs 0-13 are forwarded to the same positions of SgemmTB; grid_x and
-// grid_y (14, 15) are used only for the node creation above.
-void SgemmRoot(float *A, size_t bytesA, int lda,                    // 0-2
-               float *B, size_t bytesB, int ldb,                    // 3-5
-               float *C, size_t bytesC, int ldc,                    // 6-8
-               int k, float alpha, float beta,                      // 9-11
-               long block_x, long block_y, long grid_x, long grid_y // 12-15
-) {
-  __hpvm__hint(hpvm::CPU_TARGET);
-  __hpvm__attributes(3, A, B, C, 1, C);
-  void *SgemmTBNode = __hpvm__createNodeND(2, SgemmTB, grid_x, grid_y);
-
-  // Bind edges
-  __hpvm__bindIn(SgemmTBNode, 0, 0, 0);   // Bind A
-  __hpvm__bindIn(SgemmTBNode, 1, 1, 0);   // Bind bytesA
-  __hpvm__bindIn(SgemmTBNode, 2, 2, 0);   // Bind lda
-  __hpvm__bindIn(SgemmTBNode, 3, 3, 0);   // Bind B
-  __hpvm__bindIn(SgemmTBNode, 4, 4, 0);   // Bind bytesB
-  __hpvm__bindIn(SgemmTBNode, 5, 5, 0);   // Bind ldb
-  __hpvm__bindIn(SgemmTBNode, 6, 6, 0);   // Bind C
-  __hpvm__bindIn(SgemmTBNode, 7, 7, 0);   // Bind bytesC
-  __hpvm__bindIn(SgemmTBNode, 8, 8, 0);   // Bind ldc
-  __hpvm__bindIn(SgemmTBNode, 9, 9, 0);   // Bind k
-  __hpvm__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
-  __hpvm__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
-  __hpvm__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
-  __hpvm__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
-}
-
-// Outermost node, the function handed to __hpvm__launch by basicSgemm.
-// Creates a single SgemmRoot node (dimension argument 0) and forwards all
-// sixteen inputs to the same positions of SgemmRoot.
-void SgemmWrapper(float *A, size_t bytesA, int lda,                    // 0-2
-                  float *B, size_t bytesB, int ldb,                    // 3-5
-                  float *C, size_t bytesC, int ldc,                    // 6-8
-                  int k, float alpha, float beta,                      // 9-11
-                  long block_x, long block_y, long grid_x, long grid_y // 12-15
-) {
-  __hpvm__hint(hpvm::CPU_TARGET);
-  __hpvm__attributes(3, A, B, C, 1, C);
-  void *SgemmRootNode = __hpvm__createNodeND(0, SgemmRoot);
-
-  // Bind edges
-  __hpvm__bindIn(SgemmRootNode, 0, 0, 0);   // Bind A
-  __hpvm__bindIn(SgemmRootNode, 1, 1, 0);   // Bind bytesA
-  __hpvm__bindIn(SgemmRootNode, 2, 2, 0);   // Bind lda
-  __hpvm__bindIn(SgemmRootNode, 3, 3, 0);   // Bind B
-  __hpvm__bindIn(SgemmRootNode, 4, 4, 0);   // Bind bytesB
-  __hpvm__bindIn(SgemmRootNode, 5, 5, 0);   // Bind ldb
-  __hpvm__bindIn(SgemmRootNode, 6, 6, 0);   // Bind C
-  __hpvm__bindIn(SgemmRootNode, 7, 7, 0);   // Bind bytesC
-  __hpvm__bindIn(SgemmRootNode, 8, 8, 0);   // Bind ldc
-  __hpvm__bindIn(SgemmRootNode, 9, 9, 0);   // Bind k
-  __hpvm__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha
-  __hpvm__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta
-  __hpvm__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x
-  __hpvm__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y
-  __hpvm__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x
-  __hpvm__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y
-}
-
-// Creates root node for sgemm
-//
-// Validates the arguments, packs them into a heap-allocated RootIn, launches
-// SgemmWrapper through __hpvm__launch and blocks in __hpvm__wait. Only transa
-// 'N'/'n' combined with transb 'T'/'t' is accepted, and m / n must be
-// multiples of TILE_M / TILE_N; otherwise a message is printed to stderr and
-// nothing is launched. The launch itself is timed under
-// hpvm_TimerID_COMPUTATION, after which the timer returns to COMPUTE.
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet *timers,
-                                          char transa, char transb, int m,
-                                          int n, int k, float alpha, float *A,
-                                          size_t bytesA, int lda, float *B,
-                                          size_t bytesB, int ldb, float beta,
-                                          float *C, size_t bytesC, int ldc) {
-  if ((transa != 'N') && (transa != 'n')) {
-    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  if ((transb != 'T') && (transb != 't')) {
-    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  // In this code we assume the matrix sizes are multiple of tile size
-  if ((m % TILE_M) || (n % TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of "
-              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
-    return;
-  }
-
-  long block_x = TILE_N;
-  long block_y = TILE_TB_HEIGHT;
-  long grid_x = m / TILE_M;
-  long grid_y = n / TILE_N;
-
-  // Pack data in struct
-  RootIn *args = (RootIn *)malloc(sizeof(RootIn));
-  if (args == NULL) {
-    std::cerr << "out of memory while packing sgemm arguments" << std::endl;
-    return;
-  }
-  packData(args, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta,
-           block_x, block_y, grid_x, grid_y);
-
-  pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION);
-  void *sgemmDFG = __hpvm__launch(0, SgemmWrapper, (void *)args);
-
-  __hpvm__wait(sgemmDFG);
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-
-  // The argument record was leaked before. NOTE(review): releasing it here
-  // assumes the runtime no longer reads it once __hpvm__wait has returned -
-  // confirm against the HPVM runtime.
-  free(args);
-}
-
-/* Read one matrix file or terminate the program with a diagnostic.
- * Exits with status -1 when the file cannot be read, a dimension is not
- * positive, or fewer than rows * cols values were loaded - later code hands
- * rows * cols floats starting at data.front() to the HPVM runtime. */
-static void loadMatrixOrExit(const char *path, int &rows, int &cols,
-                             std::vector<float> &data) {
-  if (!readColMajorMatrixFile(path, rows, cols, data)) {
-    fprintf(stderr, "Cannot read matrix file %s\n", path);
-    exit(-1);
-  }
-  if (rows <= 0 || cols <= 0 ||
-      data.size() < static_cast<size_t>(rows) * static_cast<size_t>(cols)) {
-    fprintf(stderr, "Malformed matrix file %s\n", path);
-    exit(-1);
-  }
-}
-
-/*
- * Benchmark driver: loads A and B^T, registers the buffers with the HPVM
- * runtime, runs basicSgemm, optionally writes C to params->outFile and prints
- * the timer set plus a GFLOPs figure. Returns 0; exits with -1 on bad
- * command-line input or unreadable matrix files.
- */
-int main(int argc, char *argv[]) {
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  size_t A_sz, B_sz, C_sz;
-  int matArow, matAcol;
-  int matBrow, matBcol;
-  std::vector<float> matA, matBT;
-
-  /* Read command line. Expect 3 inputs: A, B and B^T
-     in column-major layout*/
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
-      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
-    fprintf(stderr, "Expecting three input filenames\n");
-    exit(-1);
-  }
-
-  /* Read in data */
-  // load A
-  loadMatrixOrExit(params->inpFiles[0], matArow, matAcol, matA);
-
-  // load B^T (its file dimensions are B's dimensions swapped)
-  loadMatrixOrExit(params->inpFiles[2], matBcol, matBrow, matBT);
-
-  pb_InitializeTimerSet(&timers);
-  __hpvm__init();
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  // Byte sizes are computed in size_t so that large matrices cannot overflow
-  // an int product before it is widened.
-  const size_t elemsC = static_cast<size_t>(matArow) * matBcol;
-  A_sz = static_cast<size_t>(matArow) * matAcol * sizeof(float);
-  B_sz = static_cast<size_t>(matBrow) * matBcol * sizeof(float);
-
-  // allocate space for C
-  C_sz = elemsC * sizeof(float);
-  std::vector<float> matC(elemsC);
-
-  llvm_hpvm_track_mem(&matA.front(), A_sz);
-  llvm_hpvm_track_mem(&matBT.front(), B_sz);
-  llvm_hpvm_track_mem(&matC.front(), C_sz);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  for (size_t i = 0; i < matC.size(); i++)
-    matC[i] = 0.0f;
-
-  // Use standard sgemm interface
-  basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(),
-             A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(),
-             C_sz, matArow);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_hpvm_request_mem(&matC.front(), C_sz);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  llvm_hpvm_untrack_mem(&matA.front());
-  llvm_hpvm_untrack_mem(&matBT.front());
-  llvm_hpvm_untrack_mem(&matC.front());
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  pb_PrintTimerSet(&timers);
-  __hpvm__cleanup();
-
-  if (params->outFile) {
-    /* Write C to file */
-    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
-  }
-
-  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
-            << std::endl;
-  pb_FreeParameters(params);
-
-  return 0;
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-# Per-variant build settings. NOTE(review): presumably read by the shared
-# Parboil/HPVM build rules that include this file - confirm against them.
-LANGUAGE=hpvm
-# compute_gold.o is commented out, so io.ll is the only extra object listed.
-SRCDIR_OBJS=io.ll #compute_gold.o
-HPVM_OBJS=main.hpvm.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-# The C and C++ flag sets are identical: fast-math at -O3.
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include <fstream>
-#include <iostream>
-#include <vector>
-
-/* Load the entire contents of fileName into a NUL-terminated buffer.
- *
- * Returns NULL (after reporting on stderr) if the file cannot be opened.
- * Otherwise returns a buffer obtained from new[] that the caller owns and
- * must release with delete[]. An empty or unmeasurable file yields "".
- */
-char *readFile(const char *fileName) {
-  std::fstream f(fileName, std::fstream::in);
-  if (!f.good()) {
-    std::cerr << "Error Reading File!!" << std::endl;
-    return NULL;
-  }
-
-  f.seekg(0, std::ios::end);
-  const std::streamoff length = f.tellg(); // -1 when the position is unknown
-  f.seekg(0, std::ios::beg);
-
-  if (length <= 0) {
-    // Array form so that every return path pairs with delete[].
-    char *empty = new char[1];
-    empty[0] = 0;
-    return empty;
-  }
-
-  // One extra byte for the terminator, so the last character of the file is
-  // kept instead of being overwritten.
-  char *buffer = new char[length + 1];
-  f.read(buffer, length);
-  // Terminate after the bytes actually read, so a short read never exposes
-  // uninitialised memory.
-  buffer[f.gcount()] = 0;
-
-  return buffer;
-}
-
-/* Read a matrix stored as text: two integers (written to nr_row and nr_col,
- * in that order) followed by whitespace-separated float values, which are
- * appended to v in file order.
- *
- * Returns false if the file cannot be opened or the two dimensions cannot be
- * parsed; true otherwise. The number of values is not checked against the
- * dimensions.
- */
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                            std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  if (!(f >> nr_row >> nr_col)) {
-    return false;
-  }
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-
-  // Store a value only when its extraction succeeded. Testing the stream
-  // before extracting and popping one element afterwards drops a real value
-  // when the file ends directly after the last number, and pops from an empty
-  // vector when nothing could be read.
-  float data;
-  while (f >> data) {
-    v.push_back(data);
-  }
-  return true;
-}
-
-/* Write a matrix as text: "<nr_row> <nr_col> " followed by every element of
- * v in order, each followed by a single space, and a final newline.
- *
- * Returns false if the file cannot be opened for writing or the stream
- * reports a failure once the data has been flushed; true otherwise.
- */
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
-                             std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Write # of rows and cols
-  f << nr_row << " " << nr_col << " ";
-
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-  // Index with the vector's own size type: an int counter is compared
-  // signed-vs-unsigned with size() and cannot cover very large vectors.
-  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-
-  // Flush so that a failed write is reported to the caller instead of being
-  // lost when the stream is destroyed.
-  f.flush();
-  return f.good();
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <hpvm.h>
-#include <iostream>
-#include <malloc.h>
-#include <math.h>
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <vector>
-
-// I/O routines
-// Defined in io.cc of this benchmark variant; declared here because there is
-// no shared header for them.
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                                   std::vector<float> &v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int,
-                                    std::vector<float> &);
-extern char *readFile(const char *);
-
-// Parameters of tile sizes
-// basicSgemm requires m and n to be multiples of this value.
-#define TILE_SZ 16
-
-// Prints errorMessage and the current line, then exits with status 1, when a
-// variable named clStatus at the expansion site differs from CL_SUCCESS.
-// NOTE(review): neither name is declared in the visible part of this file -
-// it looks like a leftover from an OpenCL version; confirm it is unused.
-#define CHECK_ERROR(errorMessage)                                              \
-  if (clStatus != CL_SUCCESS) {                                                \
-    std::cout << errorMessage << " Error!\n";                                  \
-    std::cout << "Line: " << __LINE__ << "\n";                                 \
-    exit(1);                                                                   \
-  }
-
-/*
- * Kernel body in which every instance handles four consecutive row indices
- * (m .. m + 3) of one column index n: it accumulates four sums over k and
- * then writes C = C * beta + alpha * sum for each of the four entries.
- */
-void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
-               float alpha, float beta) {
-  __hpvm__attributes(3, A, B, C, 1, C);
-
-  // One accumulator per row handled by this instance.
-  float acc[4];
-  for (int r = 0; r < 4; ++r) {
-    acc[r] = 0.0f;
-  }
-
-  const int m = 4 * get_global_id(0);
-  const int n = get_global_id(1);
-
-  for (int step = 0; step < k; ++step) {
-    // The same B element is shared by all four rows.
-    const float b_val = B[n + step * ldb];
-    for (int r = 0; r < 4; ++r) {
-      acc[r] += A[m + r + step * lda] * b_val;
-    }
-  }
-
-  for (int r = 0; r < 4; ++r) {
-    const int idx = m + r + n * ldc;
-    C[idx] = C[idx] * beta + alpha * acc[r];
-  }
-}
-
-/*
- * Host-side entry point. Validates the arguments, then starts mysgemmNT via
- * __hpvm__node and blocks in __hpvm__wait. Only transa 'N'/'n' combined with
- * transb 'T'/'t' is accepted, and m and n must both be multiples of TILE_SZ;
- * otherwise a message is printed to stderr and nothing is launched.
- */
-__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
-                                          int n, int k, float alpha, float *A,
-                                          size_t bytesA, int lda, float *B,
-                                          size_t bytesB, int ldb, float beta,
-                                          float *C, size_t bytesC, int ldc) {
-  if ((transa != 'N') && (transa != 'n')) {
-    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  if ((transb != 'T') && (transb != 't')) {
-    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  // In this code we assume the matrix sizes are multiple of tile size
-  if ((m % TILE_SZ) || (n % TILE_SZ)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of "
-              << TILE_SZ << "; n should be multiple of " << TILE_SZ
-              << std::endl;
-    // Stop here: the divisions below truncate, so continuing would silently
-    // leave part of C uncomputed.
-    return;
-  }
-
-  // db[0] is a quarter of TILE_SZ because each mysgemmNT instance covers
-  // four rows.
-  unsigned db[2] = {TILE_SZ / 4, TILE_SZ};
-  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
-
-  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
-                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
-                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
-  __hpvm__wait(sgemmDFG);
-}
-
-// Benchmark driver: reads A and B^T from the input files, runs basicSgemm,
-// optionally writes C to the output file, and prints the timer set plus a
-// GFLOPs figure.
-int main(int argc, char *argv[]) {
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  // Byte sizes of the three matrices, filled in once the dimensions are known.
-  size_t A_sz, B_sz, C_sz;
-  int matArow, matAcol;
-  int matBrow, matBcol;
-  std::vector<float> matA, matBT;
-
-  pb_InitializeTimerSet(&timers);
-  __hpvm__init();
-
-  /* Read command line. Expect exactly 3 input files: A, B and B^T
-     in column-major layout. Only A (index 0) and B^T (index 2) are read
-     below; B (index 1) must be present but is never opened here. */
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
-      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
-    fprintf(stderr, "Expecting three input filenames\n");
-    exit(-1);
-  }
-
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  // load A
-  // NOTE(review): the bool result is ignored, so a missing or malformed file
-  // is not detected here.
-  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
-
-  // load B^T; the file's row count goes into matBcol and its column count
-  // into matBrow, so the two variables describe B rather than B^T
-  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  // byte sizes of A and B^T (the two int dimensions are multiplied first,
-  // then the product is scaled by sizeof(float))
-  A_sz = matArow * matAcol * sizeof(float);
-  B_sz = matBrow * matBcol * sizeof(float);
-
-  // byte size of C, which has matArow * matBcol elements
-  C_sz = matArow * matBcol * sizeof(float);
-
-  // host storage for C
-  std::vector<float> matC(matArow * matBcol);
-
-  // Hand the three host buffers and their byte sizes to llvm_hpvm_track_mem.
-  // NOTE(review): presumably this registers them with the HPVM runtime so it
-  // can manage copies -- confirm against the HPVM runtime documentation.
-  llvm_hpvm_track_mem(&matA.front(), A_sz);
-  llvm_hpvm_track_mem(&matBT.front(), B_sz);
-  llvm_hpvm_track_mem(&matC.front(), C_sz);
-  // The COMPUTE timer was already selected above; this switch repeats it.
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  // Explicitly zero C (the sized vector constructor already value-initialised
-  // every element to 0.0f).
-  for (size_t i = 0; i < matC.size(); i++)
-    matC[i] = 0.0f;
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  // Use standard sgemm interface: m = matArow, n = matBcol, k = matAcol,
-  // alpha = 1, beta = 0, lda = ldc = matArow, ldb = matBcol
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
-             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
-             matArow);
-
-  if (params->outFile) {
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    /* Write C to file */
-    // llvm_hpvm_request_mem is called before matC is read on the host.
-    // NOTE(review): presumably it makes the latest contents of C visible in
-    // host memory -- confirm.
-    llvm_hpvm_request_mem(&matC.front(), C_sz);
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
-  }
-
-  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
-  llvm_hpvm_untrack_mem(&matA.front());
-  llvm_hpvm_untrack_mem(&matBT.front());
-  llvm_hpvm_untrack_mem(&matC.front());
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  // GFLOPs = 2 * m * n * k operations divided by the time accumulated under
-  // pb_TimerID_KERNEL, scaled by 1e9.
-  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
-            << std::endl;
-  pb_PrintTimerSet(&timers);
-  __hpvm__cleanup();
-  pb_FreeParameters(params);
-
-  return 0;
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-# Per-variant build settings. No rules are defined in this file.
-# NOTE(review): these variables are presumably read by a shared makefile
-# elsewhere in the tree -- confirm.
-LANGUAGE=hpvm
-# compute_gold.o is commented out, so only io.ll is listed.
-SRCDIR_OBJS=io.ll #compute_gold.o
-HPVM_OBJS=main.hpvm.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-# C and C++ sources are given the same flags.
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include <fstream>
-#include <iostream>
-#include <vector>
-
-/* Reads the whole of fileName into a newly allocated, NUL-terminated buffer.
- *
- * Returns NULL (after logging to stderr) if the file cannot be opened.
- * Otherwise returns a buffer allocated with new[]; the caller owns it and
- * must release it with delete[]. An empty or unseekable file yields an
- * empty string.
- */
-char *readFile(const char *fileName) {
-  std::fstream f(fileName, std::fstream::in);
-  if (!f.good()) {
-    std::cerr << "Error Reading File!!" << std::endl;
-    return NULL;
-  }
-
-  f.seekg(0, std::ios::end);
-  std::streamoff length = f.tellg();
-  f.seekg(0, std::ios::beg);
-
-  // tellg() reports -1 on failure; treat that like an empty file.
-  if (length < 0) {
-    length = 0;
-  }
-
-  // One extra byte so the terminator never overwrites file data, and always
-  // an array allocation so delete[] is the matching release in every case.
-  char *buffer = new char[static_cast<std::size_t>(length) + 1];
-  f.read(buffer, static_cast<std::streamsize>(length));
-  // Terminate after the bytes actually read, which can be fewer than length
-  // if the read stopped early.
-  buffer[f.gcount()] = 0;
-
-  f.close();
-
-  return buffer;
-}
-
-/* Reads a text matrix file: two integers (row count, column count) followed
- * by whitespace-separated float values, which are appended to v in file
- * order.
- *
- * Returns false if the file cannot be opened or the two dimensions cannot be
- * parsed; otherwise true. The number of values read is not checked against
- * nr_row * nr_col.
- */
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                            std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  if (!(f >> nr_row >> nr_col)) {
-    return false;
-  }
-
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-
-  // Test the extraction itself rather than f.good() before it: this stores
-  // every value exactly once whether or not the file ends with whitespace,
-  // so no duplicate has to be popped afterwards.
-  float data;
-  while (f >> data) {
-    v.push_back(data);
-  }
-  return true;
-}
-
-/* Writes nr_row, nr_col and then every element of v, in storage order, to fn
- * as space-separated text terminated by a newline.
- *
- * Returns false if fn cannot be opened or the stream reports an error after
- * the data has been written and flushed; otherwise true.
- */
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
-                             std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Write # of rows and cols
-  f << nr_row << " " << nr_col << " ";
-
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-
-  // Flush so that a failed write is visible in the returned status.
-  f.flush();
-  return f.good();
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <hpvm.h>
-#include <iostream>
-#include <malloc.h>
-#include <math.h>
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <vector>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                                   std::vector<float> &v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int,
-                                    std::vector<float> &);
-extern char *readFile(const char *);
-
-// Parameters of tile sizes
-// TILE_SZ is the multiple that basicSgemm expects m and n to be, and the
-// value from which it derives the dimensions passed to __hpvm__node.
-#define TILE_SZ 16
-
-// Expands to a bare `if` statement that prints errorMessage and the current
-// line, then exits with status 1, when a variable named `clStatus` in the
-// enclosing scope is not CL_SUCCESS. Neither `clStatus` nor `CL_SUCCESS` is
-// declared in this file, and the macro is not used anywhere in it.
-// Because the expansion is an unbraced-else-free `if`, placing it directly
-// before an `else` would capture that `else`.
-#define CHECK_ERROR(errorMessage)                                              \
-  if (clStatus != CL_SUCCESS) {                                                \
-    std::cout << errorMessage << " Error!\n";                                  \
-    std::cout << "Line: " << __LINE__ << "\n";                                 \
-    exit(1);                                                                   \
-  }
-
-// Leaf computation for one HPVM node instance. Using the ids returned by
-// get_global_id(0) and get_global_id(1), it accumulates, for four consecutive
-// entries of C starting at C[row + col * ldc], the sum over step of
-// A[row + r + step * lda] * B[col + step * ldb], and then stores
-// C * beta + alpha * sum back into C.
-void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
-               float alpha, float beta) {
-  __hpvm__attributes(3, A, B, C, 1, C);
-
-  const int row = 4 * get_global_id(0);
-  const int col = get_global_id(1);
-
-  // One running sum per entry of C handled by this instance.
-  float acc0 = 0.0f;
-  float acc1 = 0.0f;
-  float acc2 = 0.0f;
-  float acc3 = 0.0f;
-
-  for (int step = 0; step < k; ++step) {
-    const int aBase = row + step * lda;
-    const float bVal = B[col + step * ldb];
-
-    acc0 += A[aBase] * bVal;
-    acc1 += A[aBase + 1] * bVal;
-    acc2 += A[aBase + 2] * bVal;
-    acc3 += A[aBase + 3] * bVal;
-  }
-
-  const int cBase = row + col * ldc;
-  C[cBase] = C[cBase] * beta + alpha * acc0;
-  C[cBase + 1] = C[cBase + 1] * beta + alpha * acc1;
-  C[cBase + 2] = C[cBase + 2] * beta + alpha * acc2;
-  C[cBase + 3] = C[cBase + 3] * beta + alpha * acc3;
-}
-
-// Validates the transpose flags, then creates the HPVM node for mysgemmNT
-// with __hpvm__node and blocks on it with __hpvm__wait.
-// Only transa in {'N','n'} and transb in {'T','t'} are accepted; any other
-// value is logged and the function returns without launching anything.
-__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
-                                          int n, int k, float alpha, float *A,
-                                          size_t bytesA, int lda, float *B,
-                                          size_t bytesB, int ldb, float beta,
-                                          float *C, size_t bytesC, int ldc) {
-  const bool aNotTransposed = (transa == 'N') || (transa == 'n');
-  if (!aNotTransposed) {
-    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  const bool bTransposed = (transb == 'T') || (transb == 't');
-  if (!bTransposed) {
-    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  // The launch geometry below assumes m and n are multiples of TILE_SZ.
-  // This check only reports the problem; execution continues either way.
-  const bool mAligned = (m % TILE_SZ) == 0;
-  const bool nAligned = (n % TILE_SZ) == 0;
-  if (!mAligned || !nAligned) {
-    std::cerr << "unsupported size of matrix. m should be multiple of "
-              << TILE_SZ << "; n should be multiple of " << TILE_SZ
-              << std::endl;
-  }
-
-  unsigned blockDims[2] = {TILE_SZ / 4, TILE_SZ};
-  unsigned gridDims[2] = {m / TILE_SZ * blockDims[0],
-                          n / TILE_SZ * blockDims[1]};
-
-  unsigned sgemmDFG = __hpvm__node(
-      mysgemmNT, 2, 2, blockDims[0], blockDims[1], gridDims[0] / blockDims[0],
-      gridDims[1] / blockDims[1], 12, A, bytesA, lda, B, bytesB, ldb, C,
-      bytesC, ldc, k, alpha, beta, 0);
-  __hpvm__wait(sgemmDFG);
-}
-
-// Benchmark driver: reads A and B^T from the input files, runs basicSgemm,
-// optionally writes C to the output file, and prints the timer set plus a
-// GFLOPs figure.
-int main(int argc, char *argv[]) {
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  // Byte sizes of the three matrices, filled in once the dimensions are known.
-  size_t A_sz, B_sz, C_sz;
-  int matArow, matAcol;
-  int matBrow, matBcol;
-  std::vector<float> matA, matBT;
-
-  pb_InitializeTimerSet(&timers);
-  __hpvm__init();
-
-  /* Read command line. Expect exactly 3 input files: A, B and B^T
-     in column-major layout. Only A (index 0) and B^T (index 2) are read
-     below; B (index 1) must be present but is never opened here. */
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
-      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
-    fprintf(stderr, "Expecting three input filenames\n");
-    exit(-1);
-  }
-
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  // load A
-  // NOTE(review): the bool result is ignored, so a missing or malformed file
-  // is not detected here.
-  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
-
-  // load B^T; the file's row count goes into matBcol and its column count
-  // into matBrow, so the two variables describe B rather than B^T
-  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  // byte sizes of A and B^T (the two int dimensions are multiplied first,
-  // then the product is scaled by sizeof(float))
-  A_sz = matArow * matAcol * sizeof(float);
-  B_sz = matBrow * matBcol * sizeof(float);
-
-  // byte size of C, which has matArow * matBcol elements
-  C_sz = matArow * matBcol * sizeof(float);
-
-  // host storage for C
-  std::vector<float> matC(matArow * matBcol);
-
-  // Hand the three host buffers and their byte sizes to llvm_hpvm_track_mem.
-  // NOTE(review): presumably this registers them with the HPVM runtime so it
-  // can manage copies -- confirm against the HPVM runtime documentation.
-  llvm_hpvm_track_mem(&matA.front(), A_sz);
-  llvm_hpvm_track_mem(&matBT.front(), B_sz);
-  llvm_hpvm_track_mem(&matC.front(), C_sz);
-  // The COMPUTE timer was already selected above; this switch repeats it.
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  // Explicitly zero C (the sized vector constructor already value-initialised
-  // every element to 0.0f).
-  for (size_t i = 0; i < matC.size(); i++)
-    matC[i] = 0.0f;
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  // Use standard sgemm interface: m = matArow, n = matBcol, k = matAcol,
-  // alpha = 1, beta = 0, lda = ldc = matArow, ldb = matBcol
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
-             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
-             matArow);
-
-  if (params->outFile) {
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    /* Write C to file */
-    // llvm_hpvm_request_mem is called before matC is read on the host.
-    // NOTE(review): presumably it makes the latest contents of C visible in
-    // host memory -- confirm.
-    llvm_hpvm_request_mem(&matC.front(), C_sz);
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
-  }
-
-  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
-  llvm_hpvm_untrack_mem(&matA.front());
-  llvm_hpvm_untrack_mem(&matBT.front());
-  llvm_hpvm_untrack_mem(&matC.front());
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  // GFLOPs = 2 * m * n * k operations divided by the time accumulated under
-  // pb_TimerID_KERNEL, scaled by 1e9.
-  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
-            << std::endl;
-  pb_PrintTimerSet(&timers);
-  __hpvm__cleanup();
-  pb_FreeParameters(params);
-
-  return 0;
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-# Per-variant build settings. No rules are defined in this file.
-# NOTE(review): these variables are presumably read by a shared makefile
-# elsewhere in the tree -- confirm.
-LANGUAGE=hpvm
-# compute_gold.o is commented out, so only io.ll is listed.
-SRCDIR_OBJS=io.ll #compute_gold.o
-HPVM_OBJS=main.hpvm.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-# C and C++ sources are given the same flags.
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include <fstream>
-#include <iostream>
-#include <vector>
-
-/* Reads the whole of fileName into a newly allocated, NUL-terminated buffer.
- *
- * Returns NULL (after logging to stderr) if the file cannot be opened.
- * Otherwise returns a buffer allocated with new[]; the caller owns it and
- * must release it with delete[]. An empty or unseekable file yields an
- * empty string.
- */
-char *readFile(const char *fileName) {
-  std::fstream f(fileName, std::fstream::in);
-  if (!f.good()) {
-    std::cerr << "Error Reading File!!" << std::endl;
-    return NULL;
-  }
-
-  f.seekg(0, std::ios::end);
-  std::streamoff length = f.tellg();
-  f.seekg(0, std::ios::beg);
-
-  // tellg() reports -1 on failure; treat that like an empty file.
-  if (length < 0) {
-    length = 0;
-  }
-
-  // One extra byte so the terminator never overwrites file data, and always
-  // an array allocation so delete[] is the matching release in every case.
-  char *buffer = new char[static_cast<std::size_t>(length) + 1];
-  f.read(buffer, static_cast<std::streamsize>(length));
-  // Terminate after the bytes actually read, which can be fewer than length
-  // if the read stopped early.
-  buffer[f.gcount()] = 0;
-
-  f.close();
-
-  return buffer;
-}
-
-/* Reads a text matrix file: two integers (row count, column count) followed
- * by whitespace-separated float values, which are appended to v in file
- * order.
- *
- * Returns false if the file cannot be opened or the two dimensions cannot be
- * parsed; otherwise true. The number of values read is not checked against
- * nr_row * nr_col.
- */
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                            std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  if (!(f >> nr_row >> nr_col)) {
-    return false;
-  }
-
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-
-  // Test the extraction itself rather than f.good() before it: this stores
-  // every value exactly once whether or not the file ends with whitespace,
-  // so no duplicate has to be popped afterwards.
-  float data;
-  while (f >> data) {
-    v.push_back(data);
-  }
-  return true;
-}
-
-/* Writes nr_row, nr_col and then every element of v, in storage order, to fn
- * as space-separated text terminated by a newline.
- *
- * Returns false if fn cannot be opened or the stream reports an error after
- * the data has been written and flushed; otherwise true.
- */
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
-                             std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Write # of rows and cols
-  f << nr_row << " " << nr_col << " ";
-
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-
-  // Flush so that a failed write is visible in the returned status.
-  f.flush();
-  return f.good();
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * Kernel of dense matrix-matrix multiplication kernel.
- */
-
-/*
- * For eight consecutive entries of C starting at C[row + col * ldc], where
- * row = get_global_id(0) * 8 and col = get_global_id(1), accumulates the sum
- * over step of A[row + lane + step * lda] * B[col + step * ldb] and stores
- * C * beta + alpha * sum, using 8-wide vector loads and stores.
- * bytesA, bytesB and bytesC are accepted but not used in the body.
- *
- * Equivalent scalar formulation, per lane id in [0, 8):
- *     c[id] += A[row + id + step * lda] * B[col + step * ldb];
- *     C[row + id + col * ldc] = C[row + id + col * ldc] * beta + alpha * c[id];
- */
-__kernel void mysgemmNT(__global float *A, size_t bytesA, int lda,
-                        __global float *B, size_t bytesB, int ldb,
-                        __global float *C, size_t bytesC, int ldc, int k,
-                        float alpha, float beta)
-{
-    const int row = get_global_id(0) * 8;
-    const int col = get_global_id(1);
-
-    // Partial results, one lane per entry of C.
-    float8 acc = (float8)(0.0f);
-
-    for (int step = 0; step < k; ++step) {
-        const float8 aVec = vload8(0, A + (row + step * lda));
-        // The single B value is replicated across all eight lanes.
-        const float8 bSplat = (float8)(B[col + step * ldb]);
-        acc += aVec * bSplat;
-    }
-
-    __global float *cPtr = C + (row + col * ldc);
-    float8 cVec = vload8(0, cPtr);
-    cVec = cVec * beta + alpha * acc;
-    vstore8(cVec, 0, cPtr);
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <hpvm.h>
-#include <iostream>
-#include <malloc.h>
-#include <math.h>
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <vector>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                                   std::vector<float> &v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int,
-                                    std::vector<float> &);
-extern char *readFile(const char *);
-
-// Parameters of tile sizes
-// TILE_SZ is the multiple that basicSgemm expects m and n to be; VEC_SZ
-// divides TILE_SZ to give the first dimension passed to __hpvm__node.
-#define TILE_SZ 16
-#define VEC_SZ 8
-
-// Expands to a bare `if` statement that prints errorMessage and the current
-// line, then exits with status 1, when a variable named `clStatus` in the
-// enclosing scope is not CL_SUCCESS. Neither `clStatus` nor `CL_SUCCESS` is
-// declared in this file, and the macro is not used anywhere in it.
-// Because the expansion is an else-free `if`, placing it directly before an
-// `else` would capture that `else`.
-#define CHECK_ERROR(errorMessage)                                              \
-  if (clStatus != CL_SUCCESS) {                                                \
-    std::cout << errorMessage << " Error!\n";                                  \
-    std::cout << "Line: " << __LINE__ << "\n";                                 \
-    exit(1);                                                                   \
-  }
-
-// Leaf computation for one HPVM node instance, hinted for hpvm::GPU_TARGET.
-// As written it handles a single entry of C: it accumulates the sum over
-// step of A[row + step * lda] * B[col + step * ldb] and stores
-// C * beta + alpha * sum into C[row + col * ldc].
-void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
-               float alpha, float beta) {
-  __hpvm__hint(hpvm::GPU_TARGET);
-  __hpvm__attributes(3, A, B, C, 1, C);
-
-  const int row = get_global_id(0);
-  const int col = get_global_id(1);
-
-  float dot = 0.0f;
-  for (int step = 0; step < k; ++step) {
-    const float aVal = A[row + step * lda];
-    const float bVal = B[col + step * ldb];
-    dot += aVal * bVal;
-  }
-
-  const int cIdx = row + col * ldc;
-  C[cIdx] = C[cIdx] * beta + alpha * dot;
-  /*
-      Will be substituted by this kernel at the llvm level
-      // Partial results
-      float8 cp = (float8)(0.0f);
-
-      int m = get_global_id(0) * 8;
-      int n = get_global_id(1);
-
-      for (int i = 0; i < k; ++i) {
-          float8 a = vload8(0, A + (m + i * lda));
-          float8 b = (float8)(B[n + i * ldb]);
-          cp += a * b;
-      }
-
-      float8 c = vload8(0, C + (m+n*ldc));
-      c = c * beta + alpha * cp;
-      vstore8(c, 0, C + (m+n*ldc));
-  */
-}
-
-// Validates the transpose flags, then creates the HPVM node for mysgemmNT
-// with __hpvm__node and blocks on it with __hpvm__wait.
-// Only transa in {'N','n'} and transb in {'T','t'} are accepted; any other
-// value is logged and the function returns without launching anything.
-__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
-                                          int n, int k, float alpha, float *A,
-                                          size_t bytesA, int lda, float *B,
-                                          size_t bytesB, int ldb, float beta,
-                                          float *C, size_t bytesC, int ldc) {
-  const bool aNotTransposed = (transa == 'N') || (transa == 'n');
-  if (!aNotTransposed) {
-    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  const bool bTransposed = (transb == 'T') || (transb == 't');
-  if (!bTransposed) {
-    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  // The launch geometry below assumes m and n are multiples of TILE_SZ.
-  // This check only reports the problem; execution continues either way.
-  const bool mAligned = (m % TILE_SZ) == 0;
-  const bool nAligned = (n % TILE_SZ) == 0;
-  if (!mAligned || !nAligned) {
-    std::cerr << "unsupported size of matrix. m should be multiple of "
-              << TILE_SZ << "; n should be multiple of " << TILE_SZ
-              << std::endl;
-  }
-
-  unsigned blockDims[2] = {TILE_SZ / VEC_SZ, TILE_SZ};
-  unsigned gridDims[2] = {m / TILE_SZ * blockDims[0],
-                          n / TILE_SZ * blockDims[1]};
-
-  unsigned sgemmDFG = __hpvm__node(
-      mysgemmNT, 2, 2, blockDims[0], blockDims[1], gridDims[0] / blockDims[0],
-      gridDims[1] / blockDims[1], 12, A, bytesA, lda, B, bytesB, ldb, C,
-      bytesC, ldc, k, alpha, beta, 0);
-  __hpvm__wait(sgemmDFG);
-}
-
-// Benchmark driver: reads A and B^T from the input files, runs basicSgemm,
-// prints the timer set, optionally writes C to the output file, and prints a
-// GFLOPs figure. The input reads happen before the timer set is initialised
-// and the output write happens after it is printed.
-int main(int argc, char *argv[]) {
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  // Byte sizes of the three matrices, filled in once the dimensions are known.
-  size_t A_sz, B_sz, C_sz;
-  int matArow, matAcol;
-  int matBrow, matBcol;
-  std::vector<float> matA, matBT;
-
-  /* Read command line. Expect exactly 3 input files: A, B and B^T
-     in column-major layout. Only A (index 0) and B^T (index 2) are read
-     below; B (index 1) must be present but is never opened here. */
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
-      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
-    fprintf(stderr, "Expecting three input filenames\n");
-    exit(-1);
-  }
-
-  /* Read in data */
-  // load A
-  // NOTE(review): the bool result is ignored, so a missing or malformed file
-  // is not detected here.
-  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
-
-  // load B^T; the file's row count goes into matBcol and its column count
-  // into matBrow, so the two variables describe B rather than B^T
-  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
-
-  pb_InitializeTimerSet(&timers);
-  __hpvm__init();
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  // byte sizes of A and B^T (the two int dimensions are multiplied first,
-  // then the product is scaled by sizeof(float))
-  A_sz = matArow * matAcol * sizeof(float);
-  B_sz = matBrow * matBcol * sizeof(float);
-
-  // byte size of C, which has matArow * matBcol elements
-  C_sz = matArow * matBcol * sizeof(float);
-
-  // host storage for C
-  std::vector<float> matC(matArow * matBcol);
-
-  // Hand the three host buffers and their byte sizes to llvm_hpvm_track_mem.
-  // NOTE(review): presumably this registers them with the HPVM runtime so it
-  // can manage copies -- confirm against the HPVM runtime documentation.
-  llvm_hpvm_track_mem(&matA.front(), A_sz);
-  llvm_hpvm_track_mem(&matBT.front(), B_sz);
-  llvm_hpvm_track_mem(&matC.front(), C_sz);
-  // The COMPUTE timer was already selected above; this switch repeats it.
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  // Explicitly zero C (the sized vector constructor already value-initialised
-  // every element to 0.0f).
-  for (size_t i = 0; i < matC.size(); i++)
-    matC[i] = 0.0f;
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  // Use standard sgemm interface: m = matArow, n = matBcol, k = matAcol,
-  // alpha = 1, beta = 0, lda = ldc = matArow, ldb = matBcol
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
-             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
-             matArow);
-
-  // Unlike the write below, this request is made whether or not an output
-  // file was given.
-  // NOTE(review): presumably it makes the latest contents of C visible in
-  // host memory -- confirm.
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_hpvm_request_mem(&matC.front(), C_sz);
-
-  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
-  llvm_hpvm_untrack_mem(&matA.front());
-  llvm_hpvm_untrack_mem(&matBT.front());
-  llvm_hpvm_untrack_mem(&matC.front());
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  pb_PrintTimerSet(&timers);
-  __hpvm__cleanup();
-
-  if (params->outFile) {
-
-    /* Write C to file */
-    // The IO timer switch is disabled, so this write is not attributed to
-    // any timer category.
-    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
-  }
-
-  // GFLOPs = 2 * m * n * k operations divided by the time accumulated under
-  // pb_TimerID_KERNEL, scaled by 1e9.
-  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
-            << std::endl;
-  pb_FreeParameters(params);
-
-  return 0;
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-# Per-variant build settings. No rules are defined in this file.
-# NOTE(review): these variables are presumably read by a shared makefile
-# elsewhere in the tree -- confirm.
-LANGUAGE=hpvm
-# compute_gold.o is commented out, so only io.ll is listed.
-SRCDIR_OBJS=io.ll #compute_gold.o
-HPVM_OBJS=main.hpvm.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-# C and C++ sources are given the same flags.
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include <fstream>
-#include <iostream>
-#include <vector>
-
-/* Reads the whole of fileName into a newly allocated, NUL-terminated buffer.
- *
- * Returns NULL (after logging to stderr) if the file cannot be opened.
- * Otherwise returns a buffer allocated with new[]; the caller owns it and
- * must release it with delete[]. An empty or unseekable file yields an
- * empty string.
- */
-char *readFile(const char *fileName) {
-  std::fstream f(fileName, std::fstream::in);
-  if (!f.good()) {
-    std::cerr << "Error Reading File!!" << std::endl;
-    return NULL;
-  }
-
-  f.seekg(0, std::ios::end);
-  std::streamoff length = f.tellg();
-  f.seekg(0, std::ios::beg);
-
-  // tellg() reports -1 on failure; treat that like an empty file.
-  if (length < 0) {
-    length = 0;
-  }
-
-  // One extra byte so the terminator never overwrites file data, and always
-  // an array allocation so delete[] is the matching release in every case.
-  char *buffer = new char[static_cast<std::size_t>(length) + 1];
-  f.read(buffer, static_cast<std::streamsize>(length));
-  // Terminate after the bytes actually read, which can be fewer than length
-  // if the read stopped early.
-  buffer[f.gcount()] = 0;
-
-  f.close();
-
-  return buffer;
-}
-
-/* Reads a text matrix file: two integers (row count, column count) followed
- * by whitespace-separated float values, which are appended to v in file
- * order.
- *
- * Returns false if the file cannot be opened or the two dimensions cannot be
- * parsed; otherwise true. The number of values read is not checked against
- * nr_row * nr_col.
- */
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                            std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  if (!(f >> nr_row >> nr_col)) {
-    return false;
-  }
-
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-
-  // Test the extraction itself rather than f.good() before it: this stores
-  // every value exactly once whether or not the file ends with whitespace,
-  // so no duplicate has to be popped afterwards.
-  float data;
-  while (f >> data) {
-    v.push_back(data);
-  }
-  return true;
-}
-
-/* Writes nr_row, nr_col and then every element of v, in storage order, to fn
- * as space-separated text terminated by a newline.
- *
- * Returns false if fn cannot be opened or the stream reports an error after
- * the data has been written and flushed; otherwise true.
- */
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
-                             std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Write # of rows and cols
-  f << nr_row << " " << nr_col << " ";
-
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-
-  // Flush so that a failed write is visible in the returned status.
-  f.flush();
-  return f.good();
-}
--- a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * Kernel of dense matrix-matrix multiplication kernel.
- */
-
-// Parameters of tile sizes
-//   TILE_N          number of C entries (stride ldc apart) written per work-item
-//   TILE_TB_HEIGHT  number of k-steps consumed per outer-loop iteration
-//   TILE_M          span of the first index covered per work-group in
-//                   dimension 0 (group id 0 is multiplied by it)
-#define TILE_N 8
-#define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-/*
- * Each work-item picks m from its group id in dimension 0 and its linearised
- * local id, then accumulates TILE_N sums
- *     c[kk] = sum over s of A[m + s * lda] * B[group_id(1) * TILE_N + kk + s * ldb]
- * and stores C * beta + alpha * c[kk] into C[t + kk * ldc].
- *
- * NOTE(review): the inner loop always runs TILE_TB_HEIGHT steps, so when k is
- * not a multiple of TILE_TB_HEIGHT it reads A and B for s >= k -- confirm
- * callers guarantee the multiple.
- */
-__kernel void mysgemmNT(__global const float *A, int lda,
-                        __global const float *B, int ldb,
-                        __global float *C, int ldc, int k,
-                        float alpha, float beta)
-{
-    // Partial results
-    float c[TILE_N];
-    for (int i = 0; i < TILE_N; i++)
-        c[i] = 0.0f;
-
-    // Linearised local id, used as the offset inside this group's TILE_M span.
-    int mid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-    int m = get_group_id(0) * TILE_M + mid;
-
-    int b_base = 0;
-
-    for (int i = 0; i < k; i += TILE_TB_HEIGHT) {
-        float a;
-        b_base = get_group_id(1) * TILE_N + i * ldb;
-
-        for (int j = 0; j < TILE_TB_HEIGHT; j++) {
-            a = A[m + (i + j) * lda];
-            for (int kk = 0; kk < TILE_N; kk++)
-                c[kk] += a * B[b_base + j * ldb + kk];
-        }
-    }
-
-    int t = ldc * get_group_id(1) * TILE_N + m;
-    for (int i = 0; i < TILE_N; i++) {
-        C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i];
-    }
-/*
-    Will be substituted by this kernel at the llvm level
-
-    // Partial results 
-    floatn cp = (floatn)(0.0f);
-
-    int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
-    int m = get_group_id(0) * TILE_M + mid;
-
-    int b_base = 0;
-
-    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
-        float a;
-        b_base = get_group_id(1) * TILE_N + i * ldb;
-
-        for (int j = 0; j < TILE_TB_HEIGHT; j++) {
-            a = A[m + (i+j)*lda];
-            cp += a * vloadn(0, B + b_base + j * ldb);
-        }
-    }
-
-    cp = alpha * cp;
-    float c[TILE_N];
-    c[0] = cp.s0;
-    c[1] = cp.s1;
-    c[2] = cp.s2;
-    c[3] = cp.s3;
-    c[4] = cp.s4;
-    c[5] = cp.s5;
-    c[6] = cp.s6;
-    c[7] = cp.s7;
-
-    int t = ldc * get_group_id(1) * TILE_N + m;
-    for (int i = 0; i < TILE_N; i++) {
-        C[t+i*ldc] = C[t+i*ldc] * beta + c[i];
-    }
-
-*/
-
-}