Creating a new test case visc_mri-q. Initial commit

2251d376 · Prakalp Srivastava · cd8e1fc5 · 2251d376 · 2251d376 · 2251d376
Commit 2251d376 authored 10 years ago by Prakalp Srivastava
--- a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c
+++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c
@@ -8,23 +8,88 @@

 #include <stdio.h>
 #include <malloc.h>
-#include <CL/cl.h>
-#include "ocl.h"
+#include <math.h>
+
 #include "macros.h"
 #include "computeQ.h"

 #define NC 4

+void __attribute__ ((noinline)) computePhiMag_GPU_kernel(float* phiR, size_t bytes_phiR, float* phiI, size_t bytes_phiI, float* phiMag, size_t bytes_phiMag, int numK) {
+  int indexK = get_global_id(0);
+  if (indexK < numK) {
+    float real = phiR[indexK];
+    float imag = phiI[indexK];
+    phiMag[indexK] = real*real + imag*imag;
+  }
+}
+
+
+
 void computePhiMag_GPU(int numK, float* phiR, float* phiI, float* phiMag)
 {
  int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
  if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
    phiMagBlocks++;
-  
+
  size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;
  size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;

-  ComputePhiMag_GPU(phiR, phiI, phiMag, numK);
+  size_t bytes_phi = numK * sizeof(float);
+
+  computePhiMag_GPU_kernel(phiR, bytes_phi, phiI, bytes_phi, phiMag, bytes_phi, numK);
+}
+
+void __attribute__ ((noinline)) computeQ_GPU_kernel(int numK, int kGlobalIndex,
+	     float* x, size_t bytes_x, float* y, size_t bytes_y, float* z, size_t bytes_z,
+	     float* Qr, size_t bytes_Qr, float* Qi, size_t bytes_Qi, struct kValues* ck, size_t bytes_ck) 
+{
+
+  float sX[NC];
+  float sY[NC];
+  float sZ[NC];
+  float sQr[NC];
+  float sQi[NC];
+
+  #pragma unroll
+  for (int tx = 0; tx < NC; tx++) {
+    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
+
+    sX[tx] = x[xIndex];
+    sY[tx] = y[xIndex];
+    sZ[tx] = z[xIndex];
+    sQr[tx] = Qr[xIndex];
+    sQi[tx] = Qi[xIndex];
+  }
+
+  // Loop over all elements of K in constant mem to compute a partial value
+  // for X.
+  int kIndex = 0;
+  for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK);
+       kIndex ++, kGlobalIndex ++) {
+    float kx = ck[kIndex].Kx;
+    float ky = ck[kIndex].Ky;
+    float kz = ck[kIndex].Kz;
+    float pm = ck[kIndex].PhiMag;
+
+    #pragma unroll
+    for (int tx = 0; tx < NC; tx++) {
+      float expArg = PIx2 *
+                   (kx * sX[tx] +
+                    ky * sY[tx] +
+                    kz * sZ[tx]);
+      sQr[tx] += pm * cos(expArg);
+      sQi[tx] += pm * sin(expArg);
+    }
+  }
+
+  #pragma unroll
+  for (int tx = 0; tx < NC; tx++) {
+    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
+    Qr[xIndex] = sQr[tx];
+    Qi[xIndex] = sQi[tx];
+  }
+
 }

 void computeQ_GPU (int numK,int numX,
@@ -43,9 +108,8 @@ void computeQ_GPU (int numK,int numX,
  size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC;
  size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC;

-  cl_int clStatus;
-  cl_mem ck;
-  ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
+  //ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
+  // size in bytes = numElems*sizeof(struct kValues))

  int QGrid;
  for (QGrid = 0; QGrid < QGrids; QGrid++) {
@@ -53,28 +117,11 @@ void computeQ_GPU (int numK,int numX,
    int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
    struct kValues* kValsTile = kVals + QGridBase;
    int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
+    size_t bytes_x = numX * sizeof(float);
+    size_t bytes_kValTile = numElems*sizeof(struct kValues);

-    clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,ck,CL_TRUE,0,numElems*sizeof(struct kValues),kValsTile,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    
-    ComputeQ_GPU(numK, QGridBase, x, y, z, Qr, Qi, kValsTile);
-
-    clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(int),&numK);
-    clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(int),&QGridBase);
-    clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&x_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(cl_mem),&y_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,4,sizeof(cl_mem),&z_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,5,sizeof(cl_mem),&Qr_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,6,sizeof(cl_mem),&Qi_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,7,sizeof(cl_mem),&ck);
-    CHECK_ERROR("clSetKernelArg")
-
-
-
-    printf ("Grid: %d, Block: %d\n", DimQGrid, DimQBlock);
-
-    clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
+    computeQ_GPU_kernel(numK, QGridBase, x, bytes_x, y, bytes_x, z, bytes_x, Qr, bytes_x, Qi, bytes_x, kValsTile, bytes_kValTile);
+    printf ("Grid: %lu, Block: %lu\n", DimQGrid, DimQBlock);
  }
 }


--- a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c
+++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c
@@ -25,14 +25,196 @@
 */

 #include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <math.h>
 #include <sys/time.h>
 #include <parboil.h>
-#include <CL/cl.h>
-
-#include "ocl.h"
-#include "file.h"
+#include <endian.h>
+#include <inttypes.h>
 #include "macros.h"
-#include "computeQ.h"
+
+#define NC 4
+#if __BYTE_ORDER != __LITTLE_ENDIAN
+# error "File I/O is not implemented for this system: wrong endianness."
+#endif
+
+void inputData(char* fName, int* _numK, int* _numX,
+               float** kx, float** ky, float** kz,
+               float** x, float** y, float** z,
+               float** phiR, float** phiI)
+{
+  int numK, numX;
+  FILE* fid = fopen(fName, "r");
+
+  if (fid == NULL)
+    {
+      fprintf(stderr, "Cannot open input file\n");
+      exit(-1);
+    }
+  fread (&numK, sizeof (int), 1, fid);
+  *_numK = numK;
+  fread (&numX, sizeof (int), 1, fid);
+  *_numX = numX;
+  *kx = (float *) memalign(16, numK * sizeof (float));
+  fread (*kx, sizeof (float), numK, fid);
+  *ky = (float *) memalign(16, numK * sizeof (float));
+  fread (*ky, sizeof (float), numK, fid);
+  *kz = (float *) memalign(16, numK * sizeof (float));
+  fread (*kz, sizeof (float), numK, fid);
+  *x = (float *) memalign(16, numX * sizeof (float));
+  fread (*x, sizeof (float), numX, fid);
+  *y = (float *) memalign(16, numX * sizeof (float));
+  fread (*y, sizeof (float), numX, fid);
+  *z = (float *) memalign(16, numX * sizeof (float));
+  fread (*z, sizeof (float), numX, fid);
+  *phiR = (float *) memalign(16, numK * sizeof (float));
+  fread (*phiR, sizeof (float), numK, fid);
+  *phiI = (float *) memalign(16, numK * sizeof (float));
+  fread (*phiI, sizeof (float), numK, fid);
+  fclose (fid); 
+}
+
+void outputData(char* fName, float* outR, float* outI, int numX)
+{
+  FILE* fid = fopen(fName, "w");
+  uint32_t tmp32;
+
+  if (fid == NULL)
+    {
+      fprintf(stderr, "Cannot open output file\n");
+      exit(-1);
+    }
+
+  /* Write the data size */
+  tmp32 = numX;
+  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
+
+  /* Write the reconstructed data */
+  fwrite (outR, sizeof (float), numX, fid);
+  fwrite (outI, sizeof (float), numX, fid);
+  fclose (fid);
+}
+
+
+void __attribute__ ((noinline)) computePhiMag_kernel(float* phiR, size_t bytes_phiR, float* phiI, size_t bytes_phiI, float* phiMag, size_t bytes_phiMag, int numK) {
+  int indexK = get_global_id(0);
+  if (indexK < numK) {
+    float real = phiR[indexK];
+    float imag = phiI[indexK];
+    phiMag[indexK] = real*real + imag*imag;
+  }
+}
+
+
+
+void __attribute__ ((noinline)) computePhiMag(int numK, float* phiR, float* phiI, float* phiMag)
+{
+  int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
+  if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
+    phiMagBlocks++;
+
+  size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;
+  size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;
+
+  size_t bytes_phi = numK * sizeof(float);
+
+  computePhiMag_kernel(phiR, bytes_phi, phiI, bytes_phi, phiMag, bytes_phi, numK);
+}
+
+void __attribute__ ((noinline)) computeQ_kernel(int numK, int kGlobalIndex,
+	     float* x, size_t bytes_x, float* y, size_t bytes_y, float* z, size_t bytes_z,
+	     float* Qr, size_t bytes_Qr, float* Qi, size_t bytes_Qi, struct kValues* ck, size_t bytes_ck) 
+{
+
+  float sX[NC];
+  float sY[NC];
+  float sZ[NC];
+  float sQr[NC];
+  float sQi[NC];
+
+  #pragma unroll
+  for (int tx = 0; tx < NC; tx++) {
+    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
+
+    sX[tx] = x[xIndex];
+    sY[tx] = y[xIndex];
+    sZ[tx] = z[xIndex];
+    sQr[tx] = Qr[xIndex];
+    sQi[tx] = Qi[xIndex];
+  }
+
+  // Loop over all elements of K in constant mem to compute a partial value
+  // for X.
+  int kIndex = 0;
+  for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK);
+       kIndex ++, kGlobalIndex ++) {
+    float kx = ck[kIndex].Kx;
+    float ky = ck[kIndex].Ky;
+    float kz = ck[kIndex].Kz;
+    float pm = ck[kIndex].PhiMag;
+
+    #pragma unroll
+    for (int tx = 0; tx < NC; tx++) {
+      float expArg = PIx2 *
+                   (kx * sX[tx] +
+                    ky * sY[tx] +
+                    kz * sZ[tx]);
+      sQr[tx] += pm * cos(expArg);
+      sQi[tx] += pm * sin(expArg);
+    }
+  }
+
+  #pragma unroll
+  for (int tx = 0; tx < NC; tx++) {
+    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
+    Qr[xIndex] = sQr[tx];
+    Qi[xIndex] = sQi[tx];
+  }
+
+}
+
+void __attribute__ ((noinline)) computeQ (int numK,int numX,
+		   float* x, float* y, float* z,
+		   struct kValues* kVals,
+		   float* Qr, float* Qi
+		   )
+{
+  int QGrids = numK / KERNEL_Q_K_ELEMS_PER_GRID;
+  if (numK % KERNEL_Q_K_ELEMS_PER_GRID)
+    QGrids++;
+  int QBlocks = numX / KERNEL_Q_THREADS_PER_BLOCK;
+  if (numX % KERNEL_Q_THREADS_PER_BLOCK)
+    QBlocks++;
+
+  size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC;
+  size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC;
+
+  //ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
+  // size in bytes = numElems*sizeof(struct kValues))
+
+  int QGrid;
+  for (QGrid = 0; QGrid < QGrids; QGrid++) {
+    // Put the tile of K values into constant mem
+    int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
+    struct kValues* kValsTile = kVals + QGridBase;
+    int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
+    size_t bytes_x = numX * sizeof(float);
+    size_t bytes_kValTile = numElems*sizeof(struct kValues);
+
+    computeQ_kernel(numK, QGridBase, x, bytes_x, y, bytes_x, z, bytes_x, Qr, bytes_x, Qi, bytes_x, kValsTile, bytes_kValTile);
+    printf ("Grid: %lu, Block: %lu\n", DimQGrid, DimQBlock);
+  }
+}
+
+void createDataStructsCPU(int numK, int numX, float** phiMag,
+	 float** Qr, float** Qi)
+{
+  *phiMag = (float* ) memalign(16, numK * sizeof(float));
+  *Qr = (float*) memalign(16, numX * sizeof (float));
+  *Qi = (float*) memalign(16, numX * sizeof (float));
+}
+

 int
 main (int argc, char *argv[]) {
@@ -96,10 +278,10 @@ main (int argc, char *argv[]) {
  /* GPU section 1 (precompute PhiMag) */
  {
    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    
+
    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

-    computePhiMag_GPU(numK, phiR, phiI, phiMag);
+    computePhiMag(numK, phiR, phiI, phiMag);

    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

@@ -118,14 +300,14 @@ main (int argc, char *argv[]) {
  }

  free(phiMag);
-  
+
  /* GPU section 2 */
  {
    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

-    computeQ_GPU(numK, numX, x, y, z, kVals, Qr, Qi);
+    computeQ(numK, numX, x, y, z, kVals, Qr, Qi);

    pb_SwitchToTimer(&timers, pb_TimerID_COPY);


--- a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll
+++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll