From 2251d376559991b1f4bc60fa4372a938bbf2474f Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Sat, 29 Nov 2014 00:49:41 +0000
Subject: [PATCH] Creating a new test case visc_mri-q. Initial commit

---
 .../benchmarks/mri-q/src/visc/computeQ.c      | 103 ++-
 .../parboil/benchmarks/mri-q/src/visc/main.c  | 200 ++++-
 .../benchmarks/mri-q/src/visc/visc_mri-q.ll   | 740 ++++++++++++++++++
 3 files changed, 1006 insertions(+), 37 deletions(-)
 create mode 100644 llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll

diff --git a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c
index fe58267d66..6b106f2515 100644
--- a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c
+++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c
@@ -8,23 +8,88 @@
 
 #include <stdio.h>
 #include <malloc.h>
-#include <CL/cl.h>
-#include "ocl.h"
+#include <math.h>
+
 #include "macros.h"
 #include "computeQ.h"
 
 #define NC 4
 
+void __attribute__ ((noinline)) computePhiMag_GPU_kernel(float* phiR, size_t bytes_phiR, float* phiI, size_t bytes_phiI, float* phiMag, size_t bytes_phiMag, int numK) { /* Stores phiR[i]^2 + phiI[i]^2 into phiMag[i] for a single index i; the bytes_* parameters are not read in this body. */
+  int indexK = get_global_id(0);  /* index to process; get_global_id has no declaration in the visible source -- presumably provided by the VISC/OpenCL-style runtime (TODO confirm) */
+  if (indexK < numK) {  /* an index at or beyond numK does nothing */
+    float real = phiR[indexK];
+    float imag = phiI[indexK];
+    phiMag[indexK] = real*real + imag*imag;  /* squared magnitude, no square root taken */
+  }
+}
+
+
+
 void computePhiMag_GPU(int numK, float* phiR, float* phiI, float* phiMag)
 {
   int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
   if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
     phiMagBlocks++;
-  
+
   size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;
   size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;
 
-  ComputePhiMag_GPU(phiR, phiI, phiMag, numK);
+  size_t bytes_phi = numK * sizeof(float);  /* one size, passed below for phiR, phiI and phiMag alike */
+
+  computePhiMag_GPU_kernel(phiR, bytes_phi, phiI, bytes_phi, phiMag, bytes_phi, numK);  /* plain function call; DimPhiMagBlock / DimPhiMagGrid computed above are not passed to it */
+}
+
+void __attribute__ ((noinline)) computeQ_GPU_kernel(int numK, int kGlobalIndex,
+	     float* x, size_t bytes_x, float* y, size_t bytes_y, float* z, size_t bytes_z,
+	     float* Qr, size_t bytes_Qr, float* Qi, size_t bytes_Qi, struct kValues* ck, size_t bytes_ck) /* the bytes_* parameters are not read in this body */
+{
+  /* For NC consecutive indices into x/y/z, adds sum over the ck tile of PhiMag*cos(arg) to Qr and PhiMag*sin(arg) to Qi, where arg = PIx2*(Kx*x + Ky*y + Kz*z). */
+  float sX[NC];
+  float sY[NC];
+  float sZ[NC];
+  float sQr[NC];
+  float sQi[NC];
+
+  #pragma unroll
+  for (int tx = 0; tx < NC; tx++) {
+    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;  /* get_group_id / get_local_id are not declared in the visible source (TODO confirm their contract); xIndex is not checked against the array length here */
+
+    sX[tx] = x[xIndex];
+    sY[tx] = y[xIndex];
+    sZ[tx] = z[xIndex];
+    sQr[tx] = Qr[xIndex];  /* running totals start from whatever Qr/Qi already hold */
+    sQi[tx] = Qi[xIndex];
+  }
+
+  // Loop over the K elements of this tile (ck) to accumulate a partial value
+  // for each of the NC X entries loaded above.
+  int kIndex = 0;
+  for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK);  /* stop at the end of the tile or after sample numK-1, whichever comes first */
+       kIndex ++, kGlobalIndex ++) {
+    float kx = ck[kIndex].Kx;
+    float ky = ck[kIndex].Ky;
+    float kz = ck[kIndex].Kz;
+    float pm = ck[kIndex].PhiMag;
+
+    #pragma unroll
+    for (int tx = 0; tx < NC; tx++) {
+      float expArg = PIx2 *
+                   (kx * sX[tx] +
+                    ky * sY[tx] +
+                    kz * sZ[tx]);
+      sQr[tx] += pm * cos(expArg);  /* cos/sin here are the double math.h functions; the sum is narrowed back to float on assignment */
+      sQi[tx] += pm * sin(expArg);
+    }
+  }
+
+  #pragma unroll
+  for (int tx = 0; tx < NC; tx++) {
+    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;  /* same index as in the load loop */
+    Qr[xIndex] = sQr[tx];  /* write the updated totals back */
+    Qi[xIndex] = sQi[tx];
+  }
+
 }
 
 void computeQ_GPU (int numK,int numX,
@@ -43,9 +108,8 @@ void computeQ_GPU (int numK,int numX,
   size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC;
   size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC;
 
-  cl_int clStatus;
-  cl_mem ck;
-  ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
+  // The OpenCL host code staged each tile of K values in a device buffer of
+  // KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues) bytes; here each tile is passed to the kernel in place.
 
   int QGrid;
   for (QGrid = 0; QGrid < QGrids; QGrid++) {
@@ -53,28 +117,11 @@ void computeQ_GPU (int numK,int numX,
     int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
     struct kValues* kValsTile = kVals + QGridBase;
     int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
+    size_t bytes_x = numX * sizeof(float);
+    size_t bytes_kValTile = numElems*sizeof(struct kValues);
 
-    clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,ck,CL_TRUE,0,numElems*sizeof(struct kValues),kValsTile,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    
-    ComputeQ_GPU(numK, QGridBase, x, y, z, Qr, Qi, kValsTile);
-
-    clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(int),&numK);
-    clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(int),&QGridBase);
-    clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&x_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(cl_mem),&y_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,4,sizeof(cl_mem),&z_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,5,sizeof(cl_mem),&Qr_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,6,sizeof(cl_mem),&Qi_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,7,sizeof(cl_mem),&ck);
-    CHECK_ERROR("clSetKernelArg")
-
-
-
-    printf ("Grid: %d, Block: %d\n", DimQGrid, DimQBlock);
-
-    clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
+    computeQ_GPU_kernel(numK, QGridBase, x, bytes_x, y, bytes_x, z, bytes_x, Qr, bytes_x, Qi, bytes_x, kValsTile, bytes_kValTile);  /* one call per tile of K values; x, y, z, Qr, Qi all use the numX-float byte size */
+    printf ("Grid: %zu, Block: %zu\n", DimQGrid, DimQBlock);  /* both values are size_t, whose printf conversion is %zu */
   }
 }
 
diff --git a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c
index 03e388bb62..d3a5c71666 100644
--- a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c
+++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c
@@ -25,14 +25,196 @@
  */
 
 #include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <math.h>
 #include <sys/time.h>
 #include <parboil.h>
-#include <CL/cl.h>
-
-#include "ocl.h"
-#include "file.h"
+#include <endian.h>
+#include <inttypes.h>
 #include "macros.h"
-#include "computeQ.h"
+
+#define NC 4
+#if __BYTE_ORDER != __LITTLE_ENDIAN
+# error "File I/O is not implemented for this system: wrong endianness."
+#endif
+
+void inputData(char* fName, int* _numK, int* _numX,
+               float** kx, float** ky, float** kz,
+               float** x, float** y, float** z,
+               float** phiR, float** phiI)
+{
+  /* Reads the input file: int numK, int numX, then the float arrays
+   * kx, ky, kz (numK each), x, y, z (numX each), phiR, phiI (numK each).
+   * The counts are stored in *_numK / *_numX; every array is allocated here
+   * with memalign(16, ...) and returned through its pointer argument.
+   * Prints a message and exits with -1 on any open, read or allocation failure. */
+  float** arrays[8] = { kx, ky, kz, x, y, z, phiR, phiI };  /* in file order */
+  int numK = 0, numX = 0;
+  FILE* fid = fopen(fName, "rb");  /* binary mode: the file holds raw ints and floats */
+  if (fid == NULL)
+    {
+      fprintf(stderr, "Cannot open input file\n");
+      exit(-1);
+    }
+  if (fread (&numK, sizeof (int), 1, fid) != 1 || fread (&numX, sizeof (int), 1, fid) != 1 || numK < 0 || numX < 0)
+    {
+      fprintf(stderr, "Cannot read input file header\n");  /* short file or negative count */
+      exit(-1);
+    }
+  *_numK = numK;
+  *_numX = numX;
+  for (int i = 0; i < 8; i++) {
+    size_t count = (i >= 3 && i <= 5) ? (size_t) numX : (size_t) numK;  /* x, y, z hold numX values, the rest numK */
+    *arrays[i] = (float *) memalign(16, count * sizeof (float));
+    if (count > 0 && (*arrays[i] == NULL || fread (*arrays[i], sizeof (float), count, fid) != count)) {
+      fprintf(stderr, "Cannot read input file data\n");  /* allocation failed or the file is truncated */
+      exit(-1);
+    }
+  }
+  fclose (fid);
+}
+
+void outputData(char* fName, float* outR, float* outI, int numX)
+{
+  /* Writes numX as a uint32, then numX floats of outR, then numX floats of
+   * outI to fName. Prints a message and exits with -1 if the file cannot be
+   * opened or cannot be written completely. */
+  FILE* fid = fopen(fName, "wb");  /* binary mode: the payload is raw bytes */
+  uint32_t tmp32 = numX;
+  size_t count = numX;
+
+  if (fid == NULL)
+    {
+      fprintf(stderr, "Cannot open output file\n");
+      exit(-1);
+    }
+  if (fwrite(&tmp32, sizeof(uint32_t), 1, fid) != 1 || fwrite (outR, sizeof (float), count, fid) != count ||
+      fwrite (outI, sizeof (float), count, fid) != count || fclose (fid) != 0) {  /* fclose flushes buffered data, so its result is checked too */
+    fprintf(stderr, "Cannot write output file\n");
+    exit(-1);
+  }
+}
+
+
+void __attribute__ ((noinline)) computePhiMag_kernel(float* phiR, size_t bytes_phiR, float* phiI, size_t bytes_phiI, float* phiMag, size_t bytes_phiMag, int numK) { /* Stores phiR[i]^2 + phiI[i]^2 into phiMag[i] for a single index i; the bytes_* parameters are not read in this body. */
+  int indexK = get_global_id(0);  /* index to process; get_global_id has no declaration in the visible source -- presumably provided by the VISC/OpenCL-style runtime (TODO confirm) */
+  if (indexK < numK) {  /* an index at or beyond numK does nothing */
+    float real = phiR[indexK];
+    float imag = phiI[indexK];
+    phiMag[indexK] = real*real + imag*imag;  /* squared magnitude, no square root taken */
+  }
+}
+
+
+
+void __attribute__ ((noinline)) computePhiMag(int numK, float* phiR, float* phiI, float* phiMag)
+{ /* Host-side wrapper: derives block/grid sizes and the array byte size, then calls computePhiMag_kernel once. */
+  int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;  /* rounded up by the test below */
+  if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
+    phiMagBlocks++;
+
+  size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;  /* NOTE(review): DimPhiMagBlock and DimPhiMagGrid are computed but never used in this function */
+  size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;
+
+  size_t bytes_phi = numK * sizeof(float);  /* one size, passed below for phiR, phiI and phiMag alike */
+
+  computePhiMag_kernel(phiR, bytes_phi, phiI, bytes_phi, phiMag, bytes_phi, numK);
+}
+
+void __attribute__ ((noinline)) computeQ_kernel(int numK, int kGlobalIndex,
+	     float* x, size_t bytes_x, float* y, size_t bytes_y, float* z, size_t bytes_z,
+	     float* Qr, size_t bytes_Qr, float* Qi, size_t bytes_Qi, struct kValues* ck, size_t bytes_ck) /* the bytes_* parameters are not read in this body */
+{
+  /* For NC consecutive indices into x/y/z, adds sum over the ck tile of PhiMag*cos(arg) to Qr and PhiMag*sin(arg) to Qi, where arg = PIx2*(Kx*x + Ky*y + Kz*z). */
+  float sX[NC];
+  float sY[NC];
+  float sZ[NC];
+  float sQr[NC];
+  float sQi[NC];
+
+  #pragma unroll
+  for (int tx = 0; tx < NC; tx++) {
+    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;  /* get_group_id / get_local_id are not declared in the visible source (TODO confirm their contract); xIndex is not checked against the array length here */
+
+    sX[tx] = x[xIndex];
+    sY[tx] = y[xIndex];
+    sZ[tx] = z[xIndex];
+    sQr[tx] = Qr[xIndex];  /* running totals start from whatever Qr/Qi already hold */
+    sQi[tx] = Qi[xIndex];
+  }
+
+  // Loop over the K elements of this tile (ck) to accumulate a partial value
+  // for each of the NC X entries loaded above.
+  int kIndex = 0;
+  for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK);  /* stop at the end of the tile or after sample numK-1, whichever comes first */
+       kIndex ++, kGlobalIndex ++) {
+    float kx = ck[kIndex].Kx;
+    float ky = ck[kIndex].Ky;
+    float kz = ck[kIndex].Kz;
+    float pm = ck[kIndex].PhiMag;
+
+    #pragma unroll
+    for (int tx = 0; tx < NC; tx++) {
+      float expArg = PIx2 *
+                   (kx * sX[tx] +
+                    ky * sY[tx] +
+                    kz * sZ[tx]);
+      sQr[tx] += pm * cos(expArg);  /* cos/sin here are the double math.h functions; the sum is narrowed back to float on assignment */
+      sQi[tx] += pm * sin(expArg);
+    }
+  }
+
+  #pragma unroll
+  for (int tx = 0; tx < NC; tx++) {
+    int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;  /* same index as in the load loop */
+    Qr[xIndex] = sQr[tx];  /* write the updated totals back */
+    Qi[xIndex] = sQi[tx];
+  }
+
+}
+
+void __attribute__ ((noinline)) computeQ (int numK,int numX,
+		   float* x, float* y, float* z,
+		   struct kValues* kVals,
+		   float* Qr, float* Qi
+		   )
+{ /* Walks kVals in tiles of KERNEL_Q_K_ELEMS_PER_GRID entries and calls computeQ_kernel once per tile, printing the grid/block sizes each time. */
+  int QGrids = numK / KERNEL_Q_K_ELEMS_PER_GRID;  /* number of tiles, rounded up by the test below */
+  if (numK % KERNEL_Q_K_ELEMS_PER_GRID)
+    QGrids++;
+  int QBlocks = numX / KERNEL_Q_THREADS_PER_BLOCK;  /* rounded up by the test below */
+  if (numX % KERNEL_Q_THREADS_PER_BLOCK)
+    QBlocks++;
+
+  size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC;
+  size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC;
+
+  // DimQGrid and DimQBlock are only printed below; the kernel is an ordinary call.
+  // Each tile is handed to the kernel in place, together with its size in bytes.
+
+  int QGrid;
+  for (QGrid = 0; QGrid < QGrids; QGrid++) {
+    // Select this iteration's tile of K values
+    int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
+    struct kValues* kValsTile = kVals + QGridBase;
+    int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);  /* the last tile may be shorter */
+    size_t bytes_x = numX * sizeof(float);  /* shared by x, y, z, Qr and Qi */
+    size_t bytes_kValTile = numElems*sizeof(struct kValues);
+
+    computeQ_kernel(numK, QGridBase, x, bytes_x, y, bytes_x, z, bytes_x, Qr, bytes_x, Qi, bytes_x, kValsTile, bytes_kValTile);
+    printf ("Grid: %zu, Block: %zu\n", DimQGrid, DimQBlock);  /* both values are size_t, whose printf conversion is %zu */
+  }
+}
+
+void createDataStructsCPU(int numK, int numX, float** phiMag,
+	 float** Qr, float** Qi)
+{ /* Allocates 16-byte-aligned *phiMag (numK floats, left uninitialised) and zero-filled *Qr / *Qi (numX floats each); a failed allocation leaves NULL in the corresponding output. */
+  *phiMag = (float* ) memalign(16, numK * sizeof(float));
+  *Qr = (float*) memalign(16, numX * sizeof (float));  *Qi = (float*) memalign(16, numX * sizeof (float));
+  if (*Qr != NULL && *Qi != NULL) for (int i = 0; i < numX; i++) (*Qr)[i] = (*Qi)[i] = 0.0f;  /* computeQ_kernel loads Qr/Qi before storing to them, so they must start at zero */
+}
+
 
 int
 main (int argc, char *argv[]) {
@@ -96,10 +278,10 @@ main (int argc, char *argv[]) {
   /* GPU section 1 (precompute PhiMag) */
   {
     pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    
+
     pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
-    computePhiMag_GPU(numK, phiR, phiI, phiMag);
+    computePhiMag(numK, phiR, phiI, phiMag);
 
     pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
@@ -118,14 +300,14 @@ main (int argc, char *argv[]) {
   }
 
   free(phiMag);
-  
+
   /* GPU section 2 */
   {
     pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
     pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
-    computeQ_GPU(numK, numX, x, y, z, kVals, Qr, Qi);
+    computeQ(numK, numX, x, y, z, kVals, Qr, Qi);
 
     pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
diff --git a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll
new file mode 100644
index 0000000000..0f56a342f1
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll
@@ -0,0 +1,740 @@
+; ModuleID = 'main.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-redhat-linux-gnu"
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+%struct.kValues = type { float, float, float, float }
+%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [8 x %struct.pb_Timer], [8 x %struct.pb_SubTimerList*] }
+%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* }
+%struct.pb_Timer = type { i32, i64, i64 }
+%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* }
+%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* }
+%struct.pb_Parameters = type { i8*, i8** }
+
+@.str = private unnamed_addr constant [2 x i8] c"r\00", align 1
+@stderr = external global %struct._IO_FILE*
+@.str1 = private unnamed_addr constant [24 x i8] c"Cannot open input file\0A\00", align 1
+@.str2 = private unnamed_addr constant [2 x i8] c"w\00", align 1
+@.str3 = private unnamed_addr constant [25 x i8] c"Cannot open output file\0A\00", align 1
+@.str4 = private unnamed_addr constant [23 x i8] c"Grid: %lu, Block: %lu\0A\00", align 1
+@.str5 = private unnamed_addr constant [30 x i8] c"Expecting one input filename\0A\00", align 1
+@.str6 = private unnamed_addr constant [32 x i8] c"Expecting an integer parameter\0A\00", align 1
+@.str7 = private unnamed_addr constant [65 x i8] c"%d pixels in output; %d samples in trajectory; using %d samples\0A\00", align 1
+
+; Function Attrs: nounwind uwtable
+define void @inputData(i8* nocapture readonly %fName, i32* nocapture %_numK, i32* nocapture %_numX, float** nocapture %kx, float** nocapture %ky, float** nocapture %kz, float** nocapture %x, float** nocapture %y, float** nocapture %z, float** nocapture %phiR, float** nocapture %phiI) #0 { ; reads two i32 counts and eight float arrays from the file named by %fName
+  %numK = alloca i32, align 4
+  %numX = alloca i32, align 4
+  %1 = tail call %struct._IO_FILE* @fopen(i8* %fName, i8* getelementptr inbounds ([2 x i8]* @.str, i64 0, i64 0)) #6 ; mode string @.str is "r"
+  %2 = icmp eq %struct._IO_FILE* %1, null
+  br i1 %2, label %3, label %6
+
+; <label>:3                                       ; preds = %0
+  %4 = load %struct._IO_FILE** @stderr, align 8, !tbaa !1
+  %5 = tail call i64 @fwrite(i8* getelementptr inbounds ([24 x i8]* @.str1, i64 0, i64 0), i64 23, i64 1, %struct._IO_FILE* %4) #7 ; the "Cannot open input file" message, emitted as a 23-byte fwrite to stderr
+  tail call void @exit(i32 -1) #8
+  unreachable
+
+; <label>:6                                       ; preds = %0
+  %7 = bitcast i32* %numK to i8*
+  %8 = call i64 @fread(i8* %7, i64 4, i64 1, %struct._IO_FILE* %1) #6 ; none of the fread results in this function are used afterwards
+  %9 = load i32* %numK, align 4, !tbaa !5
+  store i32 %9, i32* %_numK, align 4, !tbaa !5
+  %10 = bitcast i32* %numX to i8*
+  %11 = call i64 @fread(i8* %10, i64 4, i64 1, %struct._IO_FILE* %1) #6
+  %12 = load i32* %numX, align 4, !tbaa !5
+  store i32 %12, i32* %_numX, align 4, !tbaa !5
+  %13 = sext i32 %9 to i64
+  %14 = shl nsw i64 %13, 2 ; numK * 4 bytes
+  %15 = tail call noalias i8* @memalign(i64 16, i64 %14) #6
+  %16 = bitcast i8* %15 to float*
+  store float* %16, float** %kx, align 8, !tbaa !1
+  %17 = tail call i64 @fread(i8* %15, i64 4, i64 %13, %struct._IO_FILE* %1) #6
+  %18 = tail call noalias i8* @memalign(i64 16, i64 %14) #6
+  %19 = bitcast i8* %18 to float*
+  store float* %19, float** %ky, align 8, !tbaa !1
+  %20 = tail call i64 @fread(i8* %18, i64 4, i64 %13, %struct._IO_FILE* %1) #6
+  %21 = tail call noalias i8* @memalign(i64 16, i64 %14) #6
+  %22 = bitcast i8* %21 to float*
+  store float* %22, float** %kz, align 8, !tbaa !1
+  %23 = tail call i64 @fread(i8* %21, i64 4, i64 %13, %struct._IO_FILE* %1) #6
+  %24 = sext i32 %12 to i64
+  %25 = shl nsw i64 %24, 2 ; numX * 4 bytes
+  %26 = tail call noalias i8* @memalign(i64 16, i64 %25) #6
+  %27 = bitcast i8* %26 to float*
+  store float* %27, float** %x, align 8, !tbaa !1
+  %28 = tail call i64 @fread(i8* %26, i64 4, i64 %24, %struct._IO_FILE* %1) #6
+  %29 = tail call noalias i8* @memalign(i64 16, i64 %25) #6
+  %30 = bitcast i8* %29 to float*
+  store float* %30, float** %y, align 8, !tbaa !1
+  %31 = tail call i64 @fread(i8* %29, i64 4, i64 %24, %struct._IO_FILE* %1) #6
+  %32 = tail call noalias i8* @memalign(i64 16, i64 %25) #6
+  %33 = bitcast i8* %32 to float*
+  store float* %33, float** %z, align 8, !tbaa !1
+  %34 = tail call i64 @fread(i8* %32, i64 4, i64 %24, %struct._IO_FILE* %1) #6
+  %35 = tail call noalias i8* @memalign(i64 16, i64 %14) #6
+  %36 = bitcast i8* %35 to float*
+  store float* %36, float** %phiR, align 8, !tbaa !1
+  %37 = tail call i64 @fread(i8* %35, i64 4, i64 %13, %struct._IO_FILE* %1) #6
+  %38 = tail call noalias i8* @memalign(i64 16, i64 %14) #6
+  %39 = bitcast i8* %38 to float*
+  store float* %39, float** %phiI, align 8, !tbaa !1
+  %40 = tail call i64 @fread(i8* %38, i64 4, i64 %13, %struct._IO_FILE* %1) #6
+  %41 = tail call i32 @fclose(%struct._IO_FILE* %1) #6
+  ret void
+}
+
+; Function Attrs: nounwind
+declare noalias %struct._IO_FILE* @fopen(i8* nocapture readonly, i8* nocapture readonly) #1
+
+; Function Attrs: noreturn nounwind
+declare void @exit(i32) #2
+
+; Function Attrs: nounwind
+declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
+
+; Function Attrs: nounwind
+declare noalias i8* @memalign(i64, i64) #1
+
+; Function Attrs: nounwind
+declare i32 @fclose(%struct._IO_FILE* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define void @outputData(i8* nocapture readonly %fName, float* nocapture %outR, float* nocapture %outI, i32 %numX) #0 { ; writes %numX as 4 bytes, then %numX floats from %outR and from %outI, to the file named by %fName
+  %tmp32 = alloca i32, align 4
+  %1 = tail call %struct._IO_FILE* @fopen(i8* %fName, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)) #6 ; mode string @.str2 is "w"
+  %2 = icmp eq %struct._IO_FILE* %1, null
+  br i1 %2, label %3, label %6
+
+; <label>:3                                       ; preds = %0
+  %4 = load %struct._IO_FILE** @stderr, align 8, !tbaa !1
+  %5 = tail call i64 @fwrite(i8* getelementptr inbounds ([25 x i8]* @.str3, i64 0, i64 0), i64 24, i64 1, %struct._IO_FILE* %4) #7 ; the "Cannot open output file" message, emitted as a 24-byte fwrite to stderr
+  tail call void @exit(i32 -1) #8
+  unreachable
+
+; <label>:6                                       ; preds = %0
+  store i32 %numX, i32* %tmp32, align 4, !tbaa !5
+  %7 = bitcast i32* %tmp32 to i8*
+  %8 = call i64 @fwrite(i8* %7, i64 4, i64 1, %struct._IO_FILE* %1) #6 ; none of the fwrite results in this block are used afterwards
+  %9 = bitcast float* %outR to i8*
+  %10 = sext i32 %numX to i64
+  %11 = tail call i64 @fwrite(i8* %9, i64 4, i64 %10, %struct._IO_FILE* %1) #6
+  %12 = bitcast float* %outI to i8*
+  %13 = tail call i64 @fwrite(i8* %12, i64 4, i64 %10, %struct._IO_FILE* %1) #6
+  %14 = tail call i32 @fclose(%struct._IO_FILE* %1) #6
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
+
+; Function Attrs: noinline nounwind uwtable
+define void @computePhiMag_kernel(float* nocapture readonly %phiR, i64 %bytes_phiR, float* nocapture readonly %phiI, i64 %bytes_phiI, float* nocapture %phiMag, i64 %bytes_phiMag, i32 %numK) #3 { ; stores phiR[i]^2 + phiI[i]^2 into phiMag[i]; the three %bytes_* arguments are not referenced in the body
+  %1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #6 ; @get_global_id is only declared (as a varargs function) in this module and is called through a bitcast
+  %2 = icmp slt i32 %1, %numK
+  br i1 %2, label %3, label %13 ; an index at or beyond numK goes straight to the return
+
+; <label>:3                                       ; preds = %0
+  %4 = sext i32 %1 to i64
+  %5 = getelementptr inbounds float* %phiR, i64 %4
+  %6 = load float* %5, align 4, !tbaa !7
+  %7 = getelementptr inbounds float* %phiI, i64 %4
+  %8 = load float* %7, align 4, !tbaa !7
+  %9 = fmul fast float %6, %6
+  %10 = fmul fast float %8, %8
+  %11 = fadd fast float %9, %10
+  %12 = getelementptr inbounds float* %phiMag, i64 %4
+  store float %11, float* %12, align 4, !tbaa !7
+  br label %13
+
+; <label>:13                                      ; preds = %3, %0
+  ret void
+}
+
+declare i32 @get_global_id(...) #4
+
+; Function Attrs: noinline nounwind uwtable
+define void @computePhiMag(i32 %numK, float* nocapture readonly %phiR, float* nocapture readonly %phiI, float* nocapture %phiMag) #3 { ; wrapper whose whole body is one call to @computePhiMag_kernel
+  tail call void @computePhiMag_kernel(float* %phiR, i64 undef, float* %phiI, i64 undef, float* %phiMag, i64 undef, i32 %numK) ; the three byte-size arguments are passed as undef; no block/grid arithmetic appears in this function
+  ret void
+}
+
+; Function Attrs: noinline nounwind uwtable
+define void @computeQ_kernel(i32 %numK, i32 %kGlobalIndex, float* nocapture readonly %x, i64 %bytes_x, float* nocapture readonly %y, i64 %bytes_y, float* nocapture readonly %z, i64 %bytes_z, float* nocapture %Qr, i64 %bytes_Qr, float* nocapture %Qi, i64 %bytes_Qi, %struct.kValues* nocapture readonly %ck, i64 %bytes_ck) #3 {
+.preheader613:
+  %sX = alloca [4 x float], align 16
+  %sY = alloca [4 x float], align 16
+  %sZ = alloca [4 x float], align 16
+  %sQi = alloca [4 x float], align 16
+  %0 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6
+  %1 = shl i32 %0, 8
+  %2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6
+  %3 = shl i32 %2, 2
+  %4 = add i32 %1, %3
+  %5 = sext i32 %4 to i64
+  %6 = getelementptr inbounds float* %x, i64 %5
+  %7 = load float* %6, align 4, !tbaa !7
+  %8 = getelementptr inbounds [4 x float]* %sX, i64 0, i64 0
+  store float %7, float* %8, align 16, !tbaa !7
+  %9 = getelementptr inbounds float* %y, i64 %5
+  %10 = load float* %9, align 4, !tbaa !7
+  %11 = getelementptr inbounds [4 x float]* %sY, i64 0, i64 0
+  store float %10, float* %11, align 16, !tbaa !7
+  %12 = getelementptr inbounds float* %z, i64 %5
+  %13 = load float* %12, align 4, !tbaa !7
+  %14 = getelementptr inbounds [4 x float]* %sZ, i64 0, i64 0
+  store float %13, float* %14, align 16, !tbaa !7
+  %15 = getelementptr inbounds float* %Qr, i64 %5
+  %16 = load float* %15, align 4, !tbaa !7
+  %17 = getelementptr inbounds float* %Qi, i64 %5
+  %18 = load float* %17, align 4, !tbaa !7
+  %19 = getelementptr inbounds [4 x float]* %sQi, i64 0, i64 0
+  store float %18, float* %19, align 16, !tbaa !7
+  %20 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6
+  %21 = shl i32 %20, 8
+  %22 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6
+  %23 = shl i32 %22, 2
+  %24 = or i32 %21, 1
+  %25 = add i32 %24, %23
+  %26 = sext i32 %25 to i64
+  %27 = getelementptr inbounds float* %x, i64 %26
+  %28 = load float* %27, align 4, !tbaa !7
+  %29 = getelementptr inbounds [4 x float]* %sX, i64 0, i64 1
+  store float %28, float* %29, align 4, !tbaa !7
+  %30 = getelementptr inbounds float* %y, i64 %26
+  %31 = load float* %30, align 4, !tbaa !7
+  %32 = getelementptr inbounds [4 x float]* %sY, i64 0, i64 1
+  store float %31, float* %32, align 4, !tbaa !7
+  %33 = getelementptr inbounds float* %z, i64 %26
+  %34 = load float* %33, align 4, !tbaa !7
+  %35 = getelementptr inbounds [4 x float]* %sZ, i64 0, i64 1
+  store float %34, float* %35, align 4, !tbaa !7
+  %36 = getelementptr inbounds float* %Qr, i64 %26
+  %37 = load float* %36, align 4, !tbaa !7
+  %38 = getelementptr inbounds float* %Qi, i64 %26
+  %39 = load float* %38, align 4, !tbaa !7
+  %40 = getelementptr inbounds [4 x float]* %sQi, i64 0, i64 1
+  store float %39, float* %40, align 4, !tbaa !7
+  %41 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6
+  %42 = shl i32 %41, 8
+  %43 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6
+  %44 = shl i32 %43, 2
+  %45 = or i32 %42, 2
+  %46 = add i32 %45, %44
+  %47 = sext i32 %46 to i64
+  %48 = getelementptr inbounds float* %x, i64 %47
+  %49 = load float* %48, align 4, !tbaa !7
+  %50 = getelementptr inbounds [4 x float]* %sX, i64 0, i64 2
+  store float %49, float* %50, align 8, !tbaa !7
+  %51 = getelementptr inbounds float* %y, i64 %47
+  %52 = load float* %51, align 4, !tbaa !7
+  %53 = getelementptr inbounds [4 x float]* %sY, i64 0, i64 2
+  store float %52, float* %53, align 8, !tbaa !7
+  %54 = getelementptr inbounds float* %z, i64 %47
+  %55 = load float* %54, align 4, !tbaa !7
+  %56 = getelementptr inbounds [4 x float]* %sZ, i64 0, i64 2
+  store float %55, float* %56, align 8, !tbaa !7
+  %57 = getelementptr inbounds float* %Qr, i64 %47
+  %58 = load float* %57, align 4, !tbaa !7
+  %59 = getelementptr inbounds float* %Qi, i64 %47
+  %60 = load float* %59, align 4, !tbaa !7
+  %61 = getelementptr inbounds [4 x float]* %sQi, i64 0, i64 2
+  store float %60, float* %61, align 8, !tbaa !7
+  %62 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6
+  %63 = shl i32 %62, 8
+  %64 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6
+  %65 = shl i32 %64, 2
+  %66 = or i32 %63, 3
+  %67 = add i32 %66, %65
+  %68 = sext i32 %67 to i64
+  %69 = getelementptr inbounds float* %x, i64 %68
+  %70 = load float* %69, align 4, !tbaa !7
+  %71 = getelementptr inbounds [4 x float]* %sX, i64 0, i64 3
+  store float %70, float* %71, align 4, !tbaa !7
+  %72 = getelementptr inbounds float* %y, i64 %68
+  %73 = load float* %72, align 4, !tbaa !7
+  %74 = getelementptr inbounds [4 x float]* %sY, i64 0, i64 3
+  store float %73, float* %74, align 4, !tbaa !7
+  %75 = getelementptr inbounds float* %z, i64 %68
+  %76 = load float* %75, align 4, !tbaa !7
+  %77 = getelementptr inbounds [4 x float]* %sZ, i64 0, i64 3
+  store float %76, float* %77, align 4, !tbaa !7
+  %78 = getelementptr inbounds float* %Qr, i64 %68
+  %79 = load float* %78, align 4, !tbaa !7
+  %80 = getelementptr inbounds float* %Qi, i64 %68
+  %81 = load float* %80, align 4, !tbaa !7
+  %82 = getelementptr inbounds [4 x float]* %sQi, i64 0, i64 3
+  store float %81, float* %82, align 4, !tbaa !7
+  %83 = icmp slt i32 %kGlobalIndex, %numK
+  br i1 %83, label %.lr.ph, label %.preheader
+
+.lr.ph:                                           ; preds = %.preheader613
+  %84 = sub i32 %kGlobalIndex, %numK
+  %85 = icmp ugt i32 %84, -1024
+  %.op = sub i32 0, %84
+  %86 = select i1 %85, i32 %.op, i32 1024
+  br label %130
+
+.preheader:                                       ; preds = %130, %.preheader613
+  %87 = phi float [ %81, %.preheader613 ], [ %218, %130 ]
+  %88 = phi float [ %79, %.preheader613 ], [ %212, %130 ]
+  %89 = phi float [ %60, %.preheader613 ], [ %200, %130 ]
+  %90 = phi float [ %58, %.preheader613 ], [ %195, %130 ]
+  %91 = phi float [ %39, %.preheader613 ], [ %183, %130 ]
+  %92 = phi float [ %37, %.preheader613 ], [ %178, %130 ]
+  %93 = phi float [ %18, %.preheader613 ], [ %166, %130 ]
+  %94 = phi float [ %16, %.preheader613 ], [ %161, %130 ]
+  %95 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6
+  %96 = shl i32 %95, 8
+  %97 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6
+  %98 = shl i32 %97, 2
+  %99 = add i32 %96, %98
+  %100 = sext i32 %99 to i64
+  %101 = getelementptr inbounds float* %Qr, i64 %100
+  store float %94, float* %101, align 4, !tbaa !7
+  %102 = getelementptr inbounds float* %Qi, i64 %100
+  store float %93, float* %102, align 4, !tbaa !7
+  %103 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6
+  %104 = shl i32 %103, 8
+  %105 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6
+  %106 = shl i32 %105, 2
+  %107 = or i32 %104, 1
+  %108 = add i32 %107, %106
+  %109 = sext i32 %108 to i64
+  %110 = getelementptr inbounds float* %Qr, i64 %109
+  store float %92, float* %110, align 4, !tbaa !7
+  %111 = getelementptr inbounds float* %Qi, i64 %109
+  store float %91, float* %111, align 4, !tbaa !7
+  %112 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6
+  %113 = shl i32 %112, 8
+  %114 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6
+  %115 = shl i32 %114, 2
+  %116 = or i32 %113, 2
+  %117 = add i32 %116, %115
+  %118 = sext i32 %117 to i64
+  %119 = getelementptr inbounds float* %Qr, i64 %118
+  store float %90, float* %119, align 4, !tbaa !7
+  %120 = getelementptr inbounds float* %Qi, i64 %118
+  store float %89, float* %120, align 4, !tbaa !7
+  %121 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6
+  %122 = shl i32 %121, 8
+  %123 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6
+  %124 = shl i32 %123, 2
+  %125 = or i32 %122, 3
+  %126 = add i32 %125, %124
+  %127 = sext i32 %126 to i64
+  %128 = getelementptr inbounds float* %Qr, i64 %127
+  store float %88, float* %128, align 4, !tbaa !7
+  %129 = getelementptr inbounds float* %Qi, i64 %127
+  store float %87, float* %129, align 4, !tbaa !7
+  ret void
+
+; <label>:130                                     ; preds = %._crit_edge, %.lr.ph
+  %131 = phi float [ %79, %.lr.ph ], [ %212, %._crit_edge ]
+  %132 = phi float [ %60, %.lr.ph ], [ %200, %._crit_edge ]
+  %133 = phi float [ %58, %.lr.ph ], [ %195, %._crit_edge ]
+  %134 = phi float [ %39, %.lr.ph ], [ %183, %._crit_edge ]
+  %135 = phi float [ %37, %.lr.ph ], [ %178, %._crit_edge ]
+  %136 = phi float [ %18, %.lr.ph ], [ %166, %._crit_edge ]
+  %137 = phi float [ %16, %.lr.ph ], [ %161, %._crit_edge ]
+  %138 = phi float [ %13, %.lr.ph ], [ %.pre14, %._crit_edge ]
+  %139 = phi float [ %10, %.lr.ph ], [ %.pre, %._crit_edge ]
+  %indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %._crit_edge ]
+  %140 = getelementptr inbounds %struct.kValues* %ck, i64 %indvars.iv, i32 0
+  %141 = load float* %140, align 4, !tbaa !9
+  %142 = getelementptr inbounds %struct.kValues* %ck, i64 %indvars.iv, i32 1
+  %143 = load float* %142, align 4, !tbaa !11
+  %144 = getelementptr inbounds %struct.kValues* %ck, i64 %indvars.iv, i32 2
+  %145 = load float* %144, align 4, !tbaa !12
+  %146 = getelementptr inbounds %struct.kValues* %ck, i64 %indvars.iv, i32 3
+  %147 = load float* %146, align 4, !tbaa !13
+  %148 = fpext float %147 to double
+  %149 = load float* %8, align 16, !tbaa !7
+  %150 = fmul fast float %141, %149
+  %151 = fmul fast float %143, %139
+  %152 = fadd fast float %150, %151
+  %153 = fmul fast float %145, %138
+  %154 = fadd fast float %152, %153
+  %155 = fmul fast float %154, 0x401921FB60000000
+  %156 = fpext float %155 to double
+  %157 = tail call double @cos(double %156) #9
+  %158 = fmul fast double %148, %157
+  %159 = fpext float %137 to double
+  %160 = fadd fast double %158, %159
+  %161 = fptrunc double %160 to float
+  %162 = tail call double @sin(double %156) #9
+  %163 = fmul fast double %148, %162
+  %164 = fpext float %136 to double
+  %165 = fadd fast double %163, %164
+  %166 = fptrunc double %165 to float
+  store float %166, float* %19, align 16, !tbaa !7
+  %167 = fmul fast float %141, %28
+  %168 = fmul fast float %143, %31
+  %169 = fadd fast float %167, %168
+  %170 = fmul fast float %145, %34
+  %171 = fadd fast float %169, %170
+  %172 = fmul fast float %171, 0x401921FB60000000
+  %173 = fpext float %172 to double
+  %174 = tail call double @cos(double %173) #9
+  %175 = fmul fast double %148, %174
+  %176 = fpext float %135 to double
+  %177 = fadd fast double %175, %176
+  %178 = fptrunc double %177 to float
+  %179 = tail call double @sin(double %173) #9
+  %180 = fmul fast double %148, %179
+  %181 = fpext float %134 to double
+  %182 = fadd fast double %180, %181
+  %183 = fptrunc double %182 to float
+  store float %183, float* %40, align 4, !tbaa !7
+  %184 = fmul fast float %141, %49
+  %185 = fmul fast float %143, %52
+  %186 = fadd fast float %184, %185
+  %187 = fmul fast float %145, %55
+  %188 = fadd fast float %186, %187
+  %189 = fmul fast float %188, 0x401921FB60000000
+  %190 = fpext float %189 to double
+  %191 = tail call double @cos(double %190) #9
+  %192 = fmul fast double %148, %191
+  %193 = fpext float %133 to double
+  %194 = fadd fast double %192, %193
+  %195 = fptrunc double %194 to float
+  %196 = tail call double @sin(double %190) #9
+  %197 = fmul fast double %148, %196
+  %198 = fpext float %132 to double
+  %199 = fadd fast double %197, %198
+  %200 = fptrunc double %199 to float
+  store float %200, float* %61, align 8, !tbaa !7
+  %201 = fmul fast float %141, %70
+  %202 = fmul fast float %143, %73
+  %203 = fadd fast float %201, %202
+  %204 = fmul fast float %145, %76
+  %205 = fadd fast float %203, %204
+  %206 = fmul fast float %205, 0x401921FB60000000
+  %207 = fpext float %206 to double
+  %208 = tail call double @cos(double %207) #9
+  %209 = fmul fast double %148, %208
+  %210 = fpext float %131 to double
+  %211 = fadd fast double %209, %210
+  %212 = fptrunc double %211 to float
+  %213 = tail call double @sin(double %207) #9
+  %214 = fmul fast double %148, %213
+  %215 = load float* %82, align 4, !tbaa !7
+  %216 = fpext float %215 to double
+  %217 = fadd fast double %214, %216
+  %218 = fptrunc double %217 to float
+  store float %218, float* %82, align 4, !tbaa !7
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %86
+  br i1 %exitcond, label %.preheader, label %._crit_edge
+
+._crit_edge:                                      ; preds = %130
+  %.pre = load float* %11, align 16, !tbaa !7
+  %.pre14 = load float* %14, align 16, !tbaa !7
+  br label %130
+}
+
+declare i32 @get_group_id(...) #4
+
+declare i32 @get_local_id(...) #4
+
+; Function Attrs: nounwind readnone
+declare double @cos(double) #5
+
+; Function Attrs: nounwind readnone
+declare double @sin(double) #5
+
+; Function Attrs: noinline nounwind uwtable
+define void @computeQ(i32 %numK, i32 %numX, float* nocapture readonly %x, float* nocapture readonly %y, float* nocapture readonly %z, %struct.kValues* nocapture readonly %kVals, float* nocapture %Qr, float* nocapture %Qi) #3 { ; Driver: calls @computeQ_kernel once per 1024-element chunk of %kVals.
+  %1 = sdiv i32 %numK, 1024                       ; numK / 1024 (signed, truncating)
+  %2 = and i32 %numK, 1023                        ; low 10 bits of numK: nonzero iff numK is not a multiple of 1024
+  %not. = icmp ne i32 %2, 0
+  %3 = zext i1 %not. to i32
+  %. = add i32 %1, %3                             ; chunk count: quotient plus one when numK is not a multiple of 1024
+  %4 = sdiv i32 %numX, 256                        ; numX / 256 (signed, truncating)
+  %5 = and i32 %numX, 255                         ; low 8 bits of numX: nonzero iff numX is not a multiple of 256
+  %not.1 = icmp ne i32 %5, 0
+  %6 = zext i1 %not.1 to i32
+  %QBlocks.0 = add nsw i32 %4, %6                 ; quotient plus one when numX is not a multiple of 256
+  %7 = shl nsw i32 %QBlocks.0, 8                  ; QBlocks.0 * 256
+  %8 = sdiv i32 %7, 4                             ; (QBlocks.0 * 256) / 4
+  %9 = sext i32 %8 to i64                         ; loop-invariant; its only use in this function is the printf below
+  %10 = icmp sgt i32 %., 0
+  br i1 %10, label %.lr.ph, label %._crit_edge    ; skip the loop when the chunk count is not positive
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] ; chunk index, starting at 0
+  %11 = trunc i64 %indvars.iv to i32
+  %12 = shl nsw i32 %11, 10                       ; chunk index * 1024: element offset of this chunk
+  %13 = sext i32 %12 to i64
+  %14 = getelementptr inbounds %struct.kValues* %kVals, i64 %13 ; &kVals[offset]
+  tail call void @computeQ_kernel(i32 %numK, i32 %12, float* %x, i64 undef, float* %y, i64 undef, float* %z, i64 undef, float* %Qr, i64 undef, float* %Qi, i64 undef, %struct.kValues* %14, i64 undef) ; NOTE(review): every i64 argument is undef; presumably the callee never reads them - confirm
+  %15 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8]* @.str4, i64 0, i64 0), i64 %9, i64 64) #6 ; prints %9 and the constant 64 on every iteration; format string @.str4 is defined outside this view
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %.        ; stop once the next chunk index equals the chunk count
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture readonly, ...) #1
+
+; Function Attrs: nounwind uwtable
+define void @createDataStructsCPU(i32 %numK, i32 %numX, float** nocapture %phiMag, float** nocapture %Qr, float** nocapture %Qi) #0 { ; Allocates three 16-byte-aligned buffers and returns them through the three out-pointers.
+  %1 = sext i32 %numK to i64
+  %2 = shl nsw i64 %1, 2                          ; numK * 4 bytes
+  %3 = tail call noalias i8* @memalign(i64 16, i64 %2) #6 ; NOTE(review): result is stored without a null check
+  %4 = bitcast i8* %3 to float*
+  store float* %4, float** %phiMag, align 8, !tbaa !1 ; *phiMag = buffer of numK * 4 bytes
+  %5 = sext i32 %numX to i64
+  %6 = shl nsw i64 %5, 2                          ; numX * 4 bytes, the size used by both allocations below
+  %7 = tail call noalias i8* @memalign(i64 16, i64 %6) #6 ; NOTE(review): result is stored without a null check
+  %8 = bitcast i8* %7 to float*
+  store float* %8, float** %Qr, align 8, !tbaa !1 ; *Qr = buffer of numX * 4 bytes
+  %9 = tail call noalias i8* @memalign(i64 16, i64 %6) #6 ; NOTE(review): result is stored without a null check
+  %10 = bitcast i8* %9 to float*
+  store float* %10, float** %Qi, align 8, !tbaa !1 ; *Qi = buffer of numX * 4 bytes
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @main(i32 %argc, i8** %argv) #0 { ; Entry point: reads input, calls @computePhiMag and @computeQ, optionally calls @outputData, frees the buffers, prints timers.
+  %1 = alloca i32, align 4                        ; stack copy of argc; its address is handed to @pb_ReadParameters
+  %numX = alloca i32, align 4
+  %original_numK = alloca i32, align 4
+  %kx = alloca float*, align 8
+  %ky = alloca float*, align 8
+  %kz = alloca float*, align 8
+  %x = alloca float*, align 8
+  %y = alloca float*, align 8
+  %z = alloca float*, align 8
+  %phiR = alloca float*, align 8
+  %phiI = alloca float*, align 8
+  %timers = alloca %struct.pb_TimerSet, align 8
+  %end = alloca i8*, align 8                      ; end-pointer output of the strtol call below
+  store i32 %argc, i32* %1, align 4, !tbaa !5
+  %2 = bitcast %struct.pb_TimerSet* %timers to i8*
+  call void @llvm.lifetime.start(i64 288, i8* %2) #6
+  call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #6
+  %3 = call %struct.pb_Parameters* @pb_ReadParameters(i32* %1, i8** %argv) #6
+  %4 = getelementptr inbounds %struct.pb_Parameters* %3, i64 0, i32 1 ; field 1 of pb_Parameters: pointer to an array of i8* entries
+  %5 = load i8*** %4, align 8, !tbaa !14
+  %6 = load i8** %5, align 8, !tbaa !1             ; first array entry
+  %7 = icmp eq i8* %6, null
+  br i1 %7, label %12, label %8                   ; first entry null -> error block %12
+
+; <label>:8                                       ; preds = %0
+  %9 = getelementptr inbounds i8** %5, i64 1
+  %10 = load i8** %9, align 8, !tbaa !1            ; second array entry
+  %11 = icmp eq i8* %10, null
+  br i1 %11, label %15, label %12                 ; continue only when the second entry is null, i.e. exactly one entry is present
+
+; <label>:12                                      ; preds = %8, %0
+  %13 = load %struct._IO_FILE** @stderr, align 8, !tbaa !1
+  %14 = call i64 @fwrite(i8* getelementptr inbounds ([30 x i8]* @.str5, i64 0, i64 0), i64 29, i64 1, %struct._IO_FILE* %13) #7 ; writes the 29 bytes of @.str5 to stderr
+  call void @exit(i32 -1) #8
+  unreachable
+
+; <label>:15                                      ; preds = %8
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #6
+  %16 = load i8*** %4, align 8, !tbaa !14
+  %17 = load i8** %16, align 8, !tbaa !1
+  call void @inputData(i8* %17, i32* %original_numK, i32* %numX, float** %kx, float** %ky, float** %kz, float** %x, float** %y, float** %z, float** %phiR, float** %phiI) ; passes the first array entry plus the addresses of the locals above
+  %18 = load i32* %1, align 4, !tbaa !5            ; reload the argc copy (its address was passed to @pb_ReadParameters)
+  %19 = icmp slt i32 %18, 2
+  br i1 %19, label %20, label %22                 ; fewer than 2 arguments -> use original_numK unchanged
+
+; <label>:20                                      ; preds = %15
+  %21 = load i32* %original_numK, align 4, !tbaa !5
+  br label %37
+
+; <label>:22                                      ; preds = %15
+  %23 = getelementptr inbounds i8** %argv, i64 1
+  %24 = load i8** %23, align 8, !tbaa !1           ; argv[1]
+  %25 = call i64 @strtol(i8* %24, i8** %end, i32 10) #6 ; parse argv[1] as a base-10 integer
+  %26 = trunc i64 %25 to i32                      ; NOTE(review): strtol result is truncated to 32 bits with no range check
+  %27 = load i8** %end, align 8, !tbaa !1
+  %28 = load i8** %23, align 8, !tbaa !1
+  %29 = icmp eq i8* %27, %28                      ; end == argv[1]: strtol consumed no characters
+  br i1 %29, label %30, label %33
+
+; <label>:30                                      ; preds = %22
+  %31 = load %struct._IO_FILE** @stderr, align 8, !tbaa !1
+  %32 = call i64 @fwrite(i8* getelementptr inbounds ([32 x i8]* @.str6, i64 0, i64 0), i64 31, i64 1, %struct._IO_FILE* %31) #7 ; writes the 31 bytes of @.str6 to stderr
+  call void @exit(i32 -1) #8
+  unreachable
+
+; <label>:33                                      ; preds = %22
+  %34 = load i32* %original_numK, align 4, !tbaa !5
+  %35 = icmp slt i32 %26, %34
+  %36 = select i1 %35, i32 %26, i32 %34           ; signed min(parsed value, original_numK). NOTE(review): a negative parsed value passes through
+  br label %37
+
+; <label>:37                                      ; preds = %33, %20
+  %38 = phi i32 [ %21, %20 ], [ %34, %33 ]        ; original_numK on either path
+  %numK.0 = phi i32 [ %21, %20 ], [ %36, %33 ]    ; effective numK used for the rest of main
+  %39 = load i32* %numX, align 4, !tbaa !5
+  %40 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([65 x i8]* @.str7, i64 0, i64 0), i32 %39, i32 %38, i32 %numK.0) #6 ; prints numX, original_numK and numK; format string @.str7 is defined outside this view
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #6
+  %41 = sext i32 %numK.0 to i64
+  %42 = shl nsw i64 %41, 2                        ; numK * 4 bytes
+  %43 = call noalias i8* @memalign(i64 16, i64 %42) #6 ; buffer passed to @computePhiMag as its 4th argument. NOTE(review): no null check
+  %44 = bitcast i8* %43 to float*
+  %45 = sext i32 %39 to i64
+  %46 = shl nsw i64 %45, 2                        ; numX * 4 bytes
+  %47 = call noalias i8* @memalign(i64 16, i64 %46) #6 ; buffer passed to @computeQ as %Qr. NOTE(review): no null check
+  %48 = bitcast i8* %47 to float*
+  %49 = call noalias i8* @memalign(i64 16, i64 %46) #6 ; buffer passed to @computeQ as %Qi. NOTE(review): no null check
+  %50 = bitcast i8* %49 to float*
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #6
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 2) #6
+  %51 = load float** %phiR, align 8, !tbaa !1
+  %52 = load float** %phiI, align 8, !tbaa !1
+  call void @computePhiMag(i32 %numK.0, float* %51, float* %52, float* %44)
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #6
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #6
+  %53 = call noalias i8* @calloc(i64 %41, i64 16) #6 ; zeroed array of numK 16-byte elements, used as %struct.kValues. NOTE(review): no null check
+  %54 = bitcast i8* %53 to %struct.kValues*
+  %55 = icmp sgt i32 %numK.0, 0
+  br i1 %55, label %.lr.ph, label %._crit_edge    ; skip the copy loop when numK <= 0
+
+.lr.ph:                                           ; preds = %37
+  %56 = load float** %kx, align 8, !tbaa !1
+  %57 = load float** %ky, align 8, !tbaa !1
+  %58 = load float** %kz, align 8, !tbaa !1
+  br label %59
+
+; <label>:59                                      ; preds = %59, %.lr.ph
+  %indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %59 ] ; element index i
+  %60 = getelementptr inbounds float* %56, i64 %indvars.iv
+  %61 = load float* %60, align 4, !tbaa !7
+  %62 = getelementptr inbounds %struct.kValues* %54, i64 %indvars.iv, i32 0
+  store float %61, float* %62, align 4, !tbaa !9   ; kVals[i] field 0 = kx[i]
+  %63 = getelementptr inbounds float* %57, i64 %indvars.iv
+  %64 = load float* %63, align 4, !tbaa !7
+  %65 = getelementptr inbounds %struct.kValues* %54, i64 %indvars.iv, i32 1
+  store float %64, float* %65, align 4, !tbaa !11  ; kVals[i] field 1 = ky[i]
+  %66 = getelementptr inbounds float* %58, i64 %indvars.iv
+  %67 = load float* %66, align 4, !tbaa !7
+  %68 = getelementptr inbounds %struct.kValues* %54, i64 %indvars.iv, i32 2
+  store float %67, float* %68, align 4, !tbaa !12  ; kVals[i] field 2 = kz[i]
+  %69 = getelementptr inbounds float* %44, i64 %indvars.iv
+  %70 = load float* %69, align 4, !tbaa !7
+  %71 = getelementptr inbounds %struct.kValues* %54, i64 %indvars.iv, i32 3
+  store float %70, float* %71, align 4, !tbaa !13  ; kVals[i] field 3 = element i of the buffer given to @computePhiMag
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %numK.0
+  br i1 %exitcond, label %._crit_edge, label %59
+
+._crit_edge:                                      ; preds = %59, %37
+  call void @free(i8* %43) #6                     ; release the @computePhiMag output buffer; its values were copied into kVals above
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #6
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 2) #6
+  %72 = load float** %x, align 8, !tbaa !1
+  %73 = load float** %y, align 8, !tbaa !1
+  %74 = load float** %z, align 8, !tbaa !1
+  call void @computeQ(i32 %numK.0, i32 %39, float* %72, float* %73, float* %74, %struct.kValues* %54, float* %48, float* %50)
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #6
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #6
+  %75 = getelementptr inbounds %struct.pb_Parameters* %3, i64 0, i32 0 ; field 0 of pb_Parameters: an i8* string
+  %76 = load i8** %75, align 8, !tbaa !16
+  %77 = icmp eq i8* %76, null
+  br i1 %77, label %80, label %78                 ; field 0 null -> skip the @outputData call
+
+; <label>:78                                      ; preds = %._crit_edge
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #6
+  %79 = load i8** %75, align 8, !tbaa !16
+  call void @outputData(i8* %79, float* %48, float* %50, i32 %39) ; arguments: field 0 string, the Qr buffer, the Qi buffer, numX
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #6
+  br label %80
+
+; <label>:80                                      ; preds = %._crit_edge, %78
+  %81 = load float** %kx, align 8, !tbaa !1
+  %82 = bitcast float* %81 to i8*
+  call void @free(i8* %82) #6                     ; free kx
+  %83 = load float** %ky, align 8, !tbaa !1
+  %84 = bitcast float* %83 to i8*
+  call void @free(i8* %84) #6                     ; free ky
+  %85 = load float** %kz, align 8, !tbaa !1
+  %86 = bitcast float* %85 to i8*
+  call void @free(i8* %86) #6                     ; free kz
+  %87 = bitcast float* %72 to i8*
+  call void @free(i8* %87) #6                     ; free x (pointer value loaded before the @computeQ call)
+  %88 = bitcast float* %73 to i8*
+  call void @free(i8* %88) #6                     ; free y
+  %89 = bitcast float* %74 to i8*
+  call void @free(i8* %89) #6                     ; free z
+  %90 = bitcast float* %51 to i8*
+  call void @free(i8* %90) #6                     ; free phiR (pointer value loaded before the @computePhiMag call)
+  %91 = bitcast float* %52 to i8*
+  call void @free(i8* %91) #6                     ; free phiI
+  call void @free(i8* %53) #6                     ; free the kVals array from calloc
+  call void @free(i8* %47) #6                     ; free the Qr buffer
+  call void @free(i8* %49) #6                     ; free the Qi buffer
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #6
+  call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #6
+  call void @pb_FreeParameters(%struct.pb_Parameters* %3) #6
+  call void @llvm.lifetime.end(i64 288, i8* %2) #6
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #6
+
+declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #4
+
+declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #4
+
+declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #4
+
+; Function Attrs: nounwind
+declare i64 @strtol(i8* readonly, i8** nocapture, i32) #1
+
+; Function Attrs: nounwind
+declare noalias i8* @calloc(i64, i64) #1
+
+; Function Attrs: nounwind
+declare void @free(i8* nocapture) #1
+
+declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #4
+
+declare void @pb_FreeParameters(%struct.pb_Parameters*) #4
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #6
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #5 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+attributes #7 = { cold nounwind }
+attributes #8 = { noreturn nounwind }
+attributes #9 = { nounwind readnone }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.4.2 (tags/RELEASE_34/dot2-final)"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"any pointer", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !6, i64 0}
+!6 = metadata !{metadata !"int", metadata !3, i64 0}
+!7 = metadata !{metadata !8, metadata !8, i64 0}
+!8 = metadata !{metadata !"float", metadata !3, i64 0}
+!9 = metadata !{metadata !10, metadata !8, i64 0}
+!10 = metadata !{metadata !"kValues", metadata !8, i64 0, metadata !8, i64 4, metadata !8, i64 8, metadata !8, i64 12}
+!11 = metadata !{metadata !10, metadata !8, i64 4}
+!12 = metadata !{metadata !10, metadata !8, i64 8}
+!13 = metadata !{metadata !10, metadata !8, i64 12}
+!14 = metadata !{metadata !15, metadata !2, i64 8}
+!15 = metadata !{metadata !"pb_Parameters", metadata !2, i64 0, metadata !2, i64 8}
+!16 = metadata !{metadata !15, metadata !2, i64 0}
-- 
GitLab