From 2251d376559991b1f4bc60fa4372a938bbf2474f Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava <psrivas2@illinois.edu> Date: Sat, 29 Nov 2014 00:49:41 +0000 Subject: [PATCH] Creating a new test case visc_mri-q. Initial commit --- .../benchmarks/mri-q/src/visc/computeQ.c | 103 ++- .../parboil/benchmarks/mri-q/src/visc/main.c | 200 ++++- .../benchmarks/mri-q/src/visc/visc_mri-q.ll | 740 ++++++++++++++++++ 3 files changed, 1006 insertions(+), 37 deletions(-) create mode 100644 llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll diff --git a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c index fe58267d66..6b106f2515 100644 --- a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c +++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/computeQ.c @@ -8,23 +8,88 @@ #include <stdio.h> #include <malloc.h> -#include <CL/cl.h> -#include "ocl.h" +#include <math.h> + #include "macros.h" #include "computeQ.h" #define NC 4 +void __attribute__ ((noinline)) computePhiMag_GPU_kernel(float* phiR, size_t bytes_phiR, float* phiI, size_t bytes_phiI, float* phiMag, size_t bytes_phiMag, int numK) { + int indexK = get_global_id(0); + if (indexK < numK) { + float real = phiR[indexK]; + float imag = phiI[indexK]; + phiMag[indexK] = real*real + imag*imag; + } +} + + + void computePhiMag_GPU(int numK, float* phiR, float* phiI, float* phiMag) { int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK; if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK) phiMagBlocks++; - + size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK; size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK; - ComputePhiMag_GPU(phiR, phiI, phiMag, numK); + size_t bytes_phi = numK * sizeof(float); + + computePhiMag_GPU_kernel(phiR, bytes_phi, phiI, bytes_phi, phiMag, bytes_phi, numK); +} + +void __attribute__ ((noinline)) computeQ_GPU_kernel(int numK, int kGlobalIndex, + float* x, size_t bytes_x, float* y, size_t 
bytes_y, float* z, size_t bytes_z, + float* Qr, size_t bytes_Qr, float* Qi, size_t bytes_Qi, struct kValues* ck, size_t bytes_ck) +{ + + float sX[NC]; + float sY[NC]; + float sZ[NC]; + float sQr[NC]; + float sQi[NC]; + + #pragma unroll + for (int tx = 0; tx < NC; tx++) { + int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx; + + sX[tx] = x[xIndex]; + sY[tx] = y[xIndex]; + sZ[tx] = z[xIndex]; + sQr[tx] = Qr[xIndex]; + sQi[tx] = Qi[xIndex]; + } + + // Loop over all elements of K in constant mem to compute a partial value + // for X. + int kIndex = 0; + for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK); + kIndex ++, kGlobalIndex ++) { + float kx = ck[kIndex].Kx; + float ky = ck[kIndex].Ky; + float kz = ck[kIndex].Kz; + float pm = ck[kIndex].PhiMag; + + #pragma unroll + for (int tx = 0; tx < NC; tx++) { + float expArg = PIx2 * + (kx * sX[tx] + + ky * sY[tx] + + kz * sZ[tx]); + sQr[tx] += pm * cos(expArg); + sQi[tx] += pm * sin(expArg); + } + } + + #pragma unroll + for (int tx = 0; tx < NC; tx++) { + int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx; + Qr[xIndex] = sQr[tx]; + Qi[xIndex] = sQi[tx]; + } + } void computeQ_GPU (int numK,int numX, @@ -43,9 +108,8 @@ void computeQ_GPU (int numK,int numX, size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC; size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC; - cl_int clStatus; - cl_mem ck; - ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus); + //ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus); + // size in bytes = numElems*sizeof(struct kValues)) int QGrid; for (QGrid = 0; QGrid < QGrids; QGrid++) { @@ -53,28 +117,11 @@ void computeQ_GPU (int numK,int numX, int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID; struct kValues* kValsTile = kVals + QGridBase; int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, 
numK - QGridBase); + size_t bytes_x = numX * sizeof(float); + size_t bytes_kValTile = numElems*sizeof(struct kValues); - clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,ck,CL_TRUE,0,numElems*sizeof(struct kValues),kValsTile,0,NULL,NULL); - CHECK_ERROR("clEnqueueWriteBuffer") - - ComputeQ_GPU(numK, QGridBase, x, y, z, Qr, Qi, kValsTile); - - clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(int),&numK); - clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(int),&QGridBase); - clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&x_d); - clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(cl_mem),&y_d); - clStatus = clSetKernelArg(clPrm->clKernel,4,sizeof(cl_mem),&z_d); - clStatus = clSetKernelArg(clPrm->clKernel,5,sizeof(cl_mem),&Qr_d); - clStatus = clSetKernelArg(clPrm->clKernel,6,sizeof(cl_mem),&Qi_d); - clStatus = clSetKernelArg(clPrm->clKernel,7,sizeof(cl_mem),&ck); - CHECK_ERROR("clSetKernelArg") - - - - printf ("Grid: %d, Block: %d\n", DimQGrid, DimQBlock); - - clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,NULL); - CHECK_ERROR("clEnqueueNDRangeKernel") + computeQ_GPU_kernel(numK, QGridBase, x, bytes_x, y, bytes_x, z, bytes_x, Qr, bytes_x, Qi, bytes_x, kValsTile, bytes_kValTile); + printf ("Grid: %lu, Block: %lu\n", DimQGrid, DimQBlock); } } diff --git a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c index 03e388bb62..d3a5c71666 100644 --- a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c +++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/main.c @@ -25,14 +25,196 @@ */ #include <stdio.h> +#include <stdlib.h> +#include <malloc.h> +#include <math.h> #include <sys/time.h> #include <parboil.h> -#include <CL/cl.h> - -#include "ocl.h" -#include "file.h" +#include <endian.h> +#include <inttypes.h> #include "macros.h" -#include "computeQ.h" + +#define NC 4 +#if __BYTE_ORDER != __LITTLE_ENDIAN +# error "File I/O is not 
implemented for this system: wrong endianness." +#endif + +void inputData(char* fName, int* _numK, int* _numX, + float** kx, float** ky, float** kz, + float** x, float** y, float** z, + float** phiR, float** phiI) +{ + int numK, numX; + FILE* fid = fopen(fName, "r"); + + if (fid == NULL) + { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + fread (&numK, sizeof (int), 1, fid); + *_numK = numK; + fread (&numX, sizeof (int), 1, fid); + *_numX = numX; + *kx = (float *) memalign(16, numK * sizeof (float)); + fread (*kx, sizeof (float), numK, fid); + *ky = (float *) memalign(16, numK * sizeof (float)); + fread (*ky, sizeof (float), numK, fid); + *kz = (float *) memalign(16, numK * sizeof (float)); + fread (*kz, sizeof (float), numK, fid); + *x = (float *) memalign(16, numX * sizeof (float)); + fread (*x, sizeof (float), numX, fid); + *y = (float *) memalign(16, numX * sizeof (float)); + fread (*y, sizeof (float), numX, fid); + *z = (float *) memalign(16, numX * sizeof (float)); + fread (*z, sizeof (float), numX, fid); + *phiR = (float *) memalign(16, numK * sizeof (float)); + fread (*phiR, sizeof (float), numK, fid); + *phiI = (float *) memalign(16, numK * sizeof (float)); + fread (*phiI, sizeof (float), numK, fid); + fclose (fid); +} + +void outputData(char* fName, float* outR, float* outI, int numX) +{ + FILE* fid = fopen(fName, "w"); + uint32_t tmp32; + + if (fid == NULL) + { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + + /* Write the data size */ + tmp32 = numX; + fwrite(&tmp32, sizeof(uint32_t), 1, fid); + + /* Write the reconstructed data */ + fwrite (outR, sizeof (float), numX, fid); + fwrite (outI, sizeof (float), numX, fid); + fclose (fid); +} + + +void __attribute__ ((noinline)) computePhiMag_kernel(float* phiR, size_t bytes_phiR, float* phiI, size_t bytes_phiI, float* phiMag, size_t bytes_phiMag, int numK) { + int indexK = get_global_id(0); + if (indexK < numK) { + float real = phiR[indexK]; + float imag = phiI[indexK]; + 
phiMag[indexK] = real*real + imag*imag; + } +} + + + +void __attribute__ ((noinline)) computePhiMag(int numK, float* phiR, float* phiI, float* phiMag) +{ + int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK; + if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK) + phiMagBlocks++; + + size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK; + size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK; + + size_t bytes_phi = numK * sizeof(float); + + computePhiMag_kernel(phiR, bytes_phi, phiI, bytes_phi, phiMag, bytes_phi, numK); +} + +void __attribute__ ((noinline)) computeQ_kernel(int numK, int kGlobalIndex, + float* x, size_t bytes_x, float* y, size_t bytes_y, float* z, size_t bytes_z, + float* Qr, size_t bytes_Qr, float* Qi, size_t bytes_Qi, struct kValues* ck, size_t bytes_ck) +{ + + float sX[NC]; + float sY[NC]; + float sZ[NC]; + float sQr[NC]; + float sQi[NC]; + + #pragma unroll + for (int tx = 0; tx < NC; tx++) { + int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx; + + sX[tx] = x[xIndex]; + sY[tx] = y[xIndex]; + sZ[tx] = z[xIndex]; + sQr[tx] = Qr[xIndex]; + sQi[tx] = Qi[xIndex]; + } + + // Loop over all elements of K in constant mem to compute a partial value + // for X. 
+ int kIndex = 0; + for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK); + kIndex ++, kGlobalIndex ++) { + float kx = ck[kIndex].Kx; + float ky = ck[kIndex].Ky; + float kz = ck[kIndex].Kz; + float pm = ck[kIndex].PhiMag; + + #pragma unroll + for (int tx = 0; tx < NC; tx++) { + float expArg = PIx2 * + (kx * sX[tx] + + ky * sY[tx] + + kz * sZ[tx]); + sQr[tx] += pm * cos(expArg); + sQi[tx] += pm * sin(expArg); + } + } + + #pragma unroll + for (int tx = 0; tx < NC; tx++) { + int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx; + Qr[xIndex] = sQr[tx]; + Qi[xIndex] = sQi[tx]; + } + +} + +void __attribute__ ((noinline)) computeQ (int numK,int numX, + float* x, float* y, float* z, + struct kValues* kVals, + float* Qr, float* Qi + ) +{ + int QGrids = numK / KERNEL_Q_K_ELEMS_PER_GRID; + if (numK % KERNEL_Q_K_ELEMS_PER_GRID) + QGrids++; + int QBlocks = numX / KERNEL_Q_THREADS_PER_BLOCK; + if (numX % KERNEL_Q_THREADS_PER_BLOCK) + QBlocks++; + + size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC; + size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC; + + //ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus); + // size in bytes = numElems*sizeof(struct kValues)) + + int QGrid; + for (QGrid = 0; QGrid < QGrids; QGrid++) { + // Put the tile of K values into constant mem + int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID; + struct kValues* kValsTile = kVals + QGridBase; + int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase); + size_t bytes_x = numX * sizeof(float); + size_t bytes_kValTile = numElems*sizeof(struct kValues); + + computeQ_kernel(numK, QGridBase, x, bytes_x, y, bytes_x, z, bytes_x, Qr, bytes_x, Qi, bytes_x, kValsTile, bytes_kValTile); + printf ("Grid: %lu, Block: %lu\n", DimQGrid, DimQBlock); + } +} + +void createDataStructsCPU(int numK, int numX, float** phiMag, + float** Qr, float** Qi) +{ + *phiMag = (float* ) memalign(16, numK * 
sizeof(float)); + *Qr = (float*) memalign(16, numX * sizeof (float)); + *Qi = (float*) memalign(16, numX * sizeof (float)); +} + int main (int argc, char *argv[]) { @@ -96,10 +278,10 @@ main (int argc, char *argv[]) { /* GPU section 1 (precompute PhiMag) */ { pb_SwitchToTimer(&timers, pb_TimerID_COPY); - + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - computePhiMag_GPU(numK, phiR, phiI, phiMag); + computePhiMag(numK, phiR, phiI, phiMag); pb_SwitchToTimer(&timers, pb_TimerID_COPY); @@ -118,14 +300,14 @@ main (int argc, char *argv[]) { } free(phiMag); - + /* GPU section 2 */ { pb_SwitchToTimer(&timers, pb_TimerID_COPY); pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - computeQ_GPU(numK, numX, x, y, z, kVals, Qr, Qi); + computeQ(numK, numX, x, y, z, kVals, Qr, Qi); pb_SwitchToTimer(&timers, pb_TimerID_COPY); diff --git a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll new file mode 100644 index 0000000000..0f56a342f1 --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll @@ -0,0 +1,740 @@ +; ModuleID = 'main.c' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-redhat-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } +%struct.kValues = type { float, float, float, float } +%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [8 x %struct.pb_Timer], [8 x %struct.pb_SubTimerList*] } +%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* } +%struct.pb_Timer = type { i32, i64, i64 } +%struct.pb_SubTimerList = type 
{ %struct.pb_SubTimer*, %struct.pb_SubTimer* } +%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* } +%struct.pb_Parameters = type { i8*, i8** } + +@.str = private unnamed_addr constant [2 x i8] c"r\00", align 1 +@stderr = external global %struct._IO_FILE* +@.str1 = private unnamed_addr constant [24 x i8] c"Cannot open input file\0A\00", align 1 +@.str2 = private unnamed_addr constant [2 x i8] c"w\00", align 1 +@.str3 = private unnamed_addr constant [25 x i8] c"Cannot open output file\0A\00", align 1 +@.str4 = private unnamed_addr constant [23 x i8] c"Grid: %lu, Block: %lu\0A\00", align 1 +@.str5 = private unnamed_addr constant [30 x i8] c"Expecting one input filename\0A\00", align 1 +@.str6 = private unnamed_addr constant [32 x i8] c"Expecting an integer parameter\0A\00", align 1 +@.str7 = private unnamed_addr constant [65 x i8] c"%d pixels in output; %d samples in trajectory; using %d samples\0A\00", align 1 + +; Function Attrs: nounwind uwtable +define void @inputData(i8* nocapture readonly %fName, i32* nocapture %_numK, i32* nocapture %_numX, float** nocapture %kx, float** nocapture %ky, float** nocapture %kz, float** nocapture %x, float** nocapture %y, float** nocapture %z, float** nocapture %phiR, float** nocapture %phiI) #0 { + %numK = alloca i32, align 4 + %numX = alloca i32, align 4 + %1 = tail call %struct._IO_FILE* @fopen(i8* %fName, i8* getelementptr inbounds ([2 x i8]* @.str, i64 0, i64 0)) #6 + %2 = icmp eq %struct._IO_FILE* %1, null + br i1 %2, label %3, label %6 + +; <label>:3 ; preds = %0 + %4 = load %struct._IO_FILE** @stderr, align 8, !tbaa !1 + %5 = tail call i64 @fwrite(i8* getelementptr inbounds ([24 x i8]* @.str1, i64 0, i64 0), i64 23, i64 1, %struct._IO_FILE* %4) #7 + tail call void @exit(i32 -1) #8 + unreachable + +; <label>:6 ; preds = %0 + %7 = bitcast i32* %numK to i8* + %8 = call i64 @fread(i8* %7, i64 4, i64 1, %struct._IO_FILE* %1) #6 + %9 = load i32* %numK, align 4, !tbaa !5 + store i32 %9, i32* %_numK, align 
4, !tbaa !5 + %10 = bitcast i32* %numX to i8* + %11 = call i64 @fread(i8* %10, i64 4, i64 1, %struct._IO_FILE* %1) #6 + %12 = load i32* %numX, align 4, !tbaa !5 + store i32 %12, i32* %_numX, align 4, !tbaa !5 + %13 = sext i32 %9 to i64 + %14 = shl nsw i64 %13, 2 + %15 = tail call noalias i8* @memalign(i64 16, i64 %14) #6 + %16 = bitcast i8* %15 to float* + store float* %16, float** %kx, align 8, !tbaa !1 + %17 = tail call i64 @fread(i8* %15, i64 4, i64 %13, %struct._IO_FILE* %1) #6 + %18 = tail call noalias i8* @memalign(i64 16, i64 %14) #6 + %19 = bitcast i8* %18 to float* + store float* %19, float** %ky, align 8, !tbaa !1 + %20 = tail call i64 @fread(i8* %18, i64 4, i64 %13, %struct._IO_FILE* %1) #6 + %21 = tail call noalias i8* @memalign(i64 16, i64 %14) #6 + %22 = bitcast i8* %21 to float* + store float* %22, float** %kz, align 8, !tbaa !1 + %23 = tail call i64 @fread(i8* %21, i64 4, i64 %13, %struct._IO_FILE* %1) #6 + %24 = sext i32 %12 to i64 + %25 = shl nsw i64 %24, 2 + %26 = tail call noalias i8* @memalign(i64 16, i64 %25) #6 + %27 = bitcast i8* %26 to float* + store float* %27, float** %x, align 8, !tbaa !1 + %28 = tail call i64 @fread(i8* %26, i64 4, i64 %24, %struct._IO_FILE* %1) #6 + %29 = tail call noalias i8* @memalign(i64 16, i64 %25) #6 + %30 = bitcast i8* %29 to float* + store float* %30, float** %y, align 8, !tbaa !1 + %31 = tail call i64 @fread(i8* %29, i64 4, i64 %24, %struct._IO_FILE* %1) #6 + %32 = tail call noalias i8* @memalign(i64 16, i64 %25) #6 + %33 = bitcast i8* %32 to float* + store float* %33, float** %z, align 8, !tbaa !1 + %34 = tail call i64 @fread(i8* %32, i64 4, i64 %24, %struct._IO_FILE* %1) #6 + %35 = tail call noalias i8* @memalign(i64 16, i64 %14) #6 + %36 = bitcast i8* %35 to float* + store float* %36, float** %phiR, align 8, !tbaa !1 + %37 = tail call i64 @fread(i8* %35, i64 4, i64 %13, %struct._IO_FILE* %1) #6 + %38 = tail call noalias i8* @memalign(i64 16, i64 %14) #6 + %39 = bitcast i8* %38 to float* + store float* %39, 
float** %phiI, align 8, !tbaa !1 + %40 = tail call i64 @fread(i8* %38, i64 4, i64 %13, %struct._IO_FILE* %1) #6 + %41 = tail call i32 @fclose(%struct._IO_FILE* %1) #6 + ret void +} + +; Function Attrs: nounwind +declare noalias %struct._IO_FILE* @fopen(i8* nocapture readonly, i8* nocapture readonly) #1 + +; Function Attrs: noreturn nounwind +declare void @exit(i32) #2 + +; Function Attrs: nounwind +declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 + +; Function Attrs: nounwind +declare noalias i8* @memalign(i64, i64) #1 + +; Function Attrs: nounwind +declare i32 @fclose(%struct._IO_FILE* nocapture) #1 + +; Function Attrs: nounwind uwtable +define void @outputData(i8* nocapture readonly %fName, float* nocapture %outR, float* nocapture %outI, i32 %numX) #0 { + %tmp32 = alloca i32, align 4 + %1 = tail call %struct._IO_FILE* @fopen(i8* %fName, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)) #6 + %2 = icmp eq %struct._IO_FILE* %1, null + br i1 %2, label %3, label %6 + +; <label>:3 ; preds = %0 + %4 = load %struct._IO_FILE** @stderr, align 8, !tbaa !1 + %5 = tail call i64 @fwrite(i8* getelementptr inbounds ([25 x i8]* @.str3, i64 0, i64 0), i64 24, i64 1, %struct._IO_FILE* %4) #7 + tail call void @exit(i32 -1) #8 + unreachable + +; <label>:6 ; preds = %0 + store i32 %numX, i32* %tmp32, align 4, !tbaa !5 + %7 = bitcast i32* %tmp32 to i8* + %8 = call i64 @fwrite(i8* %7, i64 4, i64 1, %struct._IO_FILE* %1) #6 + %9 = bitcast float* %outR to i8* + %10 = sext i32 %numX to i64 + %11 = tail call i64 @fwrite(i8* %9, i64 4, i64 %10, %struct._IO_FILE* %1) #6 + %12 = bitcast float* %outI to i8* + %13 = tail call i64 @fwrite(i8* %12, i64 4, i64 %10, %struct._IO_FILE* %1) #6 + %14 = tail call i32 @fclose(%struct._IO_FILE* %1) #6 + ret void +} + +; Function Attrs: nounwind +declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1 + +; Function Attrs: noinline nounwind uwtable +define void @computePhiMag_kernel(float* nocapture 
readonly %phiR, i64 %bytes_phiR, float* nocapture readonly %phiI, i64 %bytes_phiI, float* nocapture %phiMag, i64 %bytes_phiMag, i32 %numK) #3 { + %1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #6 + %2 = icmp slt i32 %1, %numK + br i1 %2, label %3, label %13 + +; <label>:3 ; preds = %0 + %4 = sext i32 %1 to i64 + %5 = getelementptr inbounds float* %phiR, i64 %4 + %6 = load float* %5, align 4, !tbaa !7 + %7 = getelementptr inbounds float* %phiI, i64 %4 + %8 = load float* %7, align 4, !tbaa !7 + %9 = fmul fast float %6, %6 + %10 = fmul fast float %8, %8 + %11 = fadd fast float %9, %10 + %12 = getelementptr inbounds float* %phiMag, i64 %4 + store float %11, float* %12, align 4, !tbaa !7 + br label %13 + +; <label>:13 ; preds = %3, %0 + ret void +} + +declare i32 @get_global_id(...) #4 + +; Function Attrs: noinline nounwind uwtable +define void @computePhiMag(i32 %numK, float* nocapture readonly %phiR, float* nocapture readonly %phiI, float* nocapture %phiMag) #3 { + tail call void @computePhiMag_kernel(float* %phiR, i64 undef, float* %phiI, i64 undef, float* %phiMag, i64 undef, i32 %numK) + ret void +} + +; Function Attrs: noinline nounwind uwtable +define void @computeQ_kernel(i32 %numK, i32 %kGlobalIndex, float* nocapture readonly %x, i64 %bytes_x, float* nocapture readonly %y, i64 %bytes_y, float* nocapture readonly %z, i64 %bytes_z, float* nocapture %Qr, i64 %bytes_Qr, float* nocapture %Qi, i64 %bytes_Qi, %struct.kValues* nocapture readonly %ck, i64 %bytes_ck) #3 { +.preheader613: + %sX = alloca [4 x float], align 16 + %sY = alloca [4 x float], align 16 + %sZ = alloca [4 x float], align 16 + %sQi = alloca [4 x float], align 16 + %0 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6 + %1 = shl i32 %0, 8 + %2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6 + %3 = shl i32 %2, 2 + %4 = add i32 %1, %3 + %5 = sext i32 %4 to i64 + %6 = 
getelementptr inbounds float* %x, i64 %5 + %7 = load float* %6, align 4, !tbaa !7 + %8 = getelementptr inbounds [4 x float]* %sX, i64 0, i64 0 + store float %7, float* %8, align 16, !tbaa !7 + %9 = getelementptr inbounds float* %y, i64 %5 + %10 = load float* %9, align 4, !tbaa !7 + %11 = getelementptr inbounds [4 x float]* %sY, i64 0, i64 0 + store float %10, float* %11, align 16, !tbaa !7 + %12 = getelementptr inbounds float* %z, i64 %5 + %13 = load float* %12, align 4, !tbaa !7 + %14 = getelementptr inbounds [4 x float]* %sZ, i64 0, i64 0 + store float %13, float* %14, align 16, !tbaa !7 + %15 = getelementptr inbounds float* %Qr, i64 %5 + %16 = load float* %15, align 4, !tbaa !7 + %17 = getelementptr inbounds float* %Qi, i64 %5 + %18 = load float* %17, align 4, !tbaa !7 + %19 = getelementptr inbounds [4 x float]* %sQi, i64 0, i64 0 + store float %18, float* %19, align 16, !tbaa !7 + %20 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6 + %21 = shl i32 %20, 8 + %22 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6 + %23 = shl i32 %22, 2 + %24 = or i32 %21, 1 + %25 = add i32 %24, %23 + %26 = sext i32 %25 to i64 + %27 = getelementptr inbounds float* %x, i64 %26 + %28 = load float* %27, align 4, !tbaa !7 + %29 = getelementptr inbounds [4 x float]* %sX, i64 0, i64 1 + store float %28, float* %29, align 4, !tbaa !7 + %30 = getelementptr inbounds float* %y, i64 %26 + %31 = load float* %30, align 4, !tbaa !7 + %32 = getelementptr inbounds [4 x float]* %sY, i64 0, i64 1 + store float %31, float* %32, align 4, !tbaa !7 + %33 = getelementptr inbounds float* %z, i64 %26 + %34 = load float* %33, align 4, !tbaa !7 + %35 = getelementptr inbounds [4 x float]* %sZ, i64 0, i64 1 + store float %34, float* %35, align 4, !tbaa !7 + %36 = getelementptr inbounds float* %Qr, i64 %26 + %37 = load float* %36, align 4, !tbaa !7 + %38 = getelementptr inbounds float* %Qi, i64 %26 + %39 = load float* %38, 
align 4, !tbaa !7 + %40 = getelementptr inbounds [4 x float]* %sQi, i64 0, i64 1 + store float %39, float* %40, align 4, !tbaa !7 + %41 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6 + %42 = shl i32 %41, 8 + %43 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6 + %44 = shl i32 %43, 2 + %45 = or i32 %42, 2 + %46 = add i32 %45, %44 + %47 = sext i32 %46 to i64 + %48 = getelementptr inbounds float* %x, i64 %47 + %49 = load float* %48, align 4, !tbaa !7 + %50 = getelementptr inbounds [4 x float]* %sX, i64 0, i64 2 + store float %49, float* %50, align 8, !tbaa !7 + %51 = getelementptr inbounds float* %y, i64 %47 + %52 = load float* %51, align 4, !tbaa !7 + %53 = getelementptr inbounds [4 x float]* %sY, i64 0, i64 2 + store float %52, float* %53, align 8, !tbaa !7 + %54 = getelementptr inbounds float* %z, i64 %47 + %55 = load float* %54, align 4, !tbaa !7 + %56 = getelementptr inbounds [4 x float]* %sZ, i64 0, i64 2 + store float %55, float* %56, align 8, !tbaa !7 + %57 = getelementptr inbounds float* %Qr, i64 %47 + %58 = load float* %57, align 4, !tbaa !7 + %59 = getelementptr inbounds float* %Qi, i64 %47 + %60 = load float* %59, align 4, !tbaa !7 + %61 = getelementptr inbounds [4 x float]* %sQi, i64 0, i64 2 + store float %60, float* %61, align 8, !tbaa !7 + %62 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6 + %63 = shl i32 %62, 8 + %64 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6 + %65 = shl i32 %64, 2 + %66 = or i32 %63, 3 + %67 = add i32 %66, %65 + %68 = sext i32 %67 to i64 + %69 = getelementptr inbounds float* %x, i64 %68 + %70 = load float* %69, align 4, !tbaa !7 + %71 = getelementptr inbounds [4 x float]* %sX, i64 0, i64 3 + store float %70, float* %71, align 4, !tbaa !7 + %72 = getelementptr inbounds float* %y, i64 %68 + %73 = load float* %72, align 4, !tbaa !7 + %74 = getelementptr 
inbounds [4 x float]* %sY, i64 0, i64 3 + store float %73, float* %74, align 4, !tbaa !7 + %75 = getelementptr inbounds float* %z, i64 %68 + %76 = load float* %75, align 4, !tbaa !7 + %77 = getelementptr inbounds [4 x float]* %sZ, i64 0, i64 3 + store float %76, float* %77, align 4, !tbaa !7 + %78 = getelementptr inbounds float* %Qr, i64 %68 + %79 = load float* %78, align 4, !tbaa !7 + %80 = getelementptr inbounds float* %Qi, i64 %68 + %81 = load float* %80, align 4, !tbaa !7 + %82 = getelementptr inbounds [4 x float]* %sQi, i64 0, i64 3 + store float %81, float* %82, align 4, !tbaa !7 + %83 = icmp slt i32 %kGlobalIndex, %numK + br i1 %83, label %.lr.ph, label %.preheader + +.lr.ph: ; preds = %.preheader613 + %84 = sub i32 %kGlobalIndex, %numK + %85 = icmp ugt i32 %84, -1024 + %.op = sub i32 0, %84 + %86 = select i1 %85, i32 %.op, i32 1024 + br label %130 + +.preheader: ; preds = %130, %.preheader613 + %87 = phi float [ %81, %.preheader613 ], [ %218, %130 ] + %88 = phi float [ %79, %.preheader613 ], [ %212, %130 ] + %89 = phi float [ %60, %.preheader613 ], [ %200, %130 ] + %90 = phi float [ %58, %.preheader613 ], [ %195, %130 ] + %91 = phi float [ %39, %.preheader613 ], [ %183, %130 ] + %92 = phi float [ %37, %.preheader613 ], [ %178, %130 ] + %93 = phi float [ %18, %.preheader613 ], [ %166, %130 ] + %94 = phi float [ %16, %.preheader613 ], [ %161, %130 ] + %95 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6 + %96 = shl i32 %95, 8 + %97 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6 + %98 = shl i32 %97, 2 + %99 = add i32 %96, %98 + %100 = sext i32 %99 to i64 + %101 = getelementptr inbounds float* %Qr, i64 %100 + store float %94, float* %101, align 4, !tbaa !7 + %102 = getelementptr inbounds float* %Qi, i64 %100 + store float %93, float* %102, align 4, !tbaa !7 + %103 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6 + %104 = shl i32 
%103, 8 + %105 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6 + %106 = shl i32 %105, 2 + %107 = or i32 %104, 1 + %108 = add i32 %107, %106 + %109 = sext i32 %108 to i64 + %110 = getelementptr inbounds float* %Qr, i64 %109 + store float %92, float* %110, align 4, !tbaa !7 + %111 = getelementptr inbounds float* %Qi, i64 %109 + store float %91, float* %111, align 4, !tbaa !7 + %112 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6 + %113 = shl i32 %112, 8 + %114 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6 + %115 = shl i32 %114, 2 + %116 = or i32 %113, 2 + %117 = add i32 %116, %115 + %118 = sext i32 %117 to i64 + %119 = getelementptr inbounds float* %Qr, i64 %118 + store float %90, float* %119, align 4, !tbaa !7 + %120 = getelementptr inbounds float* %Qi, i64 %118 + store float %89, float* %120, align 4, !tbaa !7 + %121 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) #6 + %122 = shl i32 %121, 8 + %123 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) #6 + %124 = shl i32 %123, 2 + %125 = or i32 %122, 3 + %126 = add i32 %125, %124 + %127 = sext i32 %126 to i64 + %128 = getelementptr inbounds float* %Qr, i64 %127 + store float %88, float* %128, align 4, !tbaa !7 + %129 = getelementptr inbounds float* %Qi, i64 %127 + store float %87, float* %129, align 4, !tbaa !7 + ret void + +; <label>:130 ; preds = %._crit_edge, %.lr.ph + %131 = phi float [ %79, %.lr.ph ], [ %212, %._crit_edge ] + %132 = phi float [ %60, %.lr.ph ], [ %200, %._crit_edge ] + %133 = phi float [ %58, %.lr.ph ], [ %195, %._crit_edge ] + %134 = phi float [ %39, %.lr.ph ], [ %183, %._crit_edge ] + %135 = phi float [ %37, %.lr.ph ], [ %178, %._crit_edge ] + %136 = phi float [ %18, %.lr.ph ], [ %166, %._crit_edge ] + %137 = phi float [ %16, %.lr.ph ], [ %161, %._crit_edge ] + %138 = phi float [ %13, 
%.lr.ph ], [ %.pre14, %._crit_edge ] + %139 = phi float [ %10, %.lr.ph ], [ %.pre, %._crit_edge ] + %indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %._crit_edge ] + %140 = getelementptr inbounds %struct.kValues* %ck, i64 %indvars.iv, i32 0 + %141 = load float* %140, align 4, !tbaa !9 + %142 = getelementptr inbounds %struct.kValues* %ck, i64 %indvars.iv, i32 1 + %143 = load float* %142, align 4, !tbaa !11 + %144 = getelementptr inbounds %struct.kValues* %ck, i64 %indvars.iv, i32 2 + %145 = load float* %144, align 4, !tbaa !12 + %146 = getelementptr inbounds %struct.kValues* %ck, i64 %indvars.iv, i32 3 + %147 = load float* %146, align 4, !tbaa !13 + %148 = fpext float %147 to double + %149 = load float* %8, align 16, !tbaa !7 + %150 = fmul fast float %141, %149 + %151 = fmul fast float %143, %139 + %152 = fadd fast float %150, %151 + %153 = fmul fast float %145, %138 + %154 = fadd fast float %152, %153 + %155 = fmul fast float %154, 0x401921FB60000000 + %156 = fpext float %155 to double + %157 = tail call double @cos(double %156) #9 + %158 = fmul fast double %148, %157 + %159 = fpext float %137 to double + %160 = fadd fast double %158, %159 + %161 = fptrunc double %160 to float + %162 = tail call double @sin(double %156) #9 + %163 = fmul fast double %148, %162 + %164 = fpext float %136 to double + %165 = fadd fast double %163, %164 + %166 = fptrunc double %165 to float + store float %166, float* %19, align 16, !tbaa !7 + %167 = fmul fast float %141, %28 + %168 = fmul fast float %143, %31 + %169 = fadd fast float %167, %168 + %170 = fmul fast float %145, %34 + %171 = fadd fast float %169, %170 + %172 = fmul fast float %171, 0x401921FB60000000 + %173 = fpext float %172 to double + %174 = tail call double @cos(double %173) #9 + %175 = fmul fast double %148, %174 + %176 = fpext float %135 to double + %177 = fadd fast double %175, %176 + %178 = fptrunc double %177 to float + %179 = tail call double @sin(double %173) #9 + %180 = fmul fast double %148, %179 + %181 
= fpext float %134 to double + %182 = fadd fast double %180, %181 + %183 = fptrunc double %182 to float + store float %183, float* %40, align 4, !tbaa !7 + %184 = fmul fast float %141, %49 + %185 = fmul fast float %143, %52 + %186 = fadd fast float %184, %185 + %187 = fmul fast float %145, %55 + %188 = fadd fast float %186, %187 + %189 = fmul fast float %188, 0x401921FB60000000 + %190 = fpext float %189 to double + %191 = tail call double @cos(double %190) #9 + %192 = fmul fast double %148, %191 + %193 = fpext float %133 to double + %194 = fadd fast double %192, %193 + %195 = fptrunc double %194 to float + %196 = tail call double @sin(double %190) #9 + %197 = fmul fast double %148, %196 + %198 = fpext float %132 to double + %199 = fadd fast double %197, %198 + %200 = fptrunc double %199 to float + store float %200, float* %61, align 8, !tbaa !7 + %201 = fmul fast float %141, %70 + %202 = fmul fast float %143, %73 + %203 = fadd fast float %201, %202 + %204 = fmul fast float %145, %76 + %205 = fadd fast float %203, %204 + %206 = fmul fast float %205, 0x401921FB60000000 + %207 = fpext float %206 to double + %208 = tail call double @cos(double %207) #9 + %209 = fmul fast double %148, %208 + %210 = fpext float %131 to double + %211 = fadd fast double %209, %210 + %212 = fptrunc double %211 to float + %213 = tail call double @sin(double %207) #9 + %214 = fmul fast double %148, %213 + %215 = load float* %82, align 4, !tbaa !7 + %216 = fpext float %215 to double + %217 = fadd fast double %214, %216 + %218 = fptrunc double %217 to float + store float %218, float* %82, align 4, !tbaa !7 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %86 + br i1 %exitcond, label %.preheader, label %._crit_edge + +._crit_edge: ; preds = %130 + %.pre = load float* %11, align 16, !tbaa !7 + %.pre14 = load float* %14, align 16, !tbaa !7 + br label %130 +} + +declare i32 @get_group_id(...) 
#4
+
+declare i32 @get_local_id(...) #4
+
+; Function Attrs: nounwind readnone
+declare double @cos(double) #5
+
+; Function Attrs: nounwind readnone
+declare double @sin(double) #5
+
+; Function Attrs: noinline nounwind uwtable
+define void @computeQ(i32 %numK, i32 %numX, float* nocapture readonly %x, float* nocapture readonly %y, float* nocapture readonly %z, %struct.kValues* nocapture readonly %kVals, float* nocapture %Qr, float* nocapture %Qi) #3 { ; Host-side driver: walks %kVals in tiles of 1024 entries and makes one direct call to @computeQ_kernel per tile.
+  %1 = sdiv i32 %numK, 1024 ; whole 1024-entry tiles in numK
+  %2 = and i32 %numK, 1023 ; low 10 bits of numK (non-zero when a partial tile remains)
+  %not. = icmp ne i32 %2, 0
+  %3 = zext i1 %not. to i32
+  %. = add i32 %1, %3 ; tile count = whole tiles + 1 if a partial tile remains (loop trip count below)
+  %4 = sdiv i32 %numX, 256 ; whole 256-element blocks in numX
+  %5 = and i32 %numX, 255
+  %not.1 = icmp ne i32 %5, 0
+  %6 = zext i1 %not.1 to i32
+  %QBlocks.0 = add nsw i32 %4, %6 ; block count, rounded up the same way as the tile count
+  %7 = shl nsw i32 %QBlocks.0, 8 ; QBlocks * 256
+  %8 = sdiv i32 %7, 4 ; (QBlocks * 256) / 4
+  %9 = sext i32 %8 to i64 ; widened for printf; in this function the value is only ever passed to printf
+  %10 = icmp sgt i32 %., 0 ; skip the loop entirely when there are no tiles
+  br i1 %10, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] ; tile index, starts at 0
+  %11 = trunc i64 %indvars.iv to i32
+  %12 = shl nsw i32 %11, 10 ; index of the tile's first entry = tile index * 1024
+  %13 = sext i32 %12 to i64
+  %14 = getelementptr inbounds %struct.kValues* %kVals, i64 %13 ; pointer to the first kValues entry of this tile
+  tail call void @computeQ_kernel(i32 %numK, i32 %12, float* %x, i64 undef, float* %y, i64 undef, float* %z, i64 undef, float* %Qr, i64 undef, float* %Qi, i64 undef, %struct.kValues* %14, i64 undef) ; all six i64 operands following the pointers are undef. NOTE(review): presumably the byte-size parameters were dropped as unused by the kernel - confirm before anything starts reading them
+  %15 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8]* @.str4, i64 0, i64 0), i64 %9, i64 64) #6 ; prints %9 and the constant 64 once per tile; the text of @.str4 is not visible in this chunk
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %. ; leave once every tile has been dispatched
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture readonly, ...)
#1
+
+; Function Attrs: nounwind uwtable
+define void @createDataStructsCPU(i32 %numK, i32 %numX, float** nocapture %phiMag, float** nocapture %Qr, float** nocapture %Qi) #0 { ; Allocates three 16-byte-aligned float buffers and stores them through %phiMag, %Qr and %Qi; the memalign results are not checked for null here.
+  %1 = sext i32 %numK to i64
+  %2 = shl nsw i64 %1, 2 ; numK * 4 bytes
+  %3 = tail call noalias i8* @memalign(i64 16, i64 %2) #6
+  %4 = bitcast i8* %3 to float*
+  store float* %4, float** %phiMag, align 8, !tbaa !1 ; *phiMag = buffer of numK floats
+  %5 = sext i32 %numX to i64
+  %6 = shl nsw i64 %5, 2 ; numX * 4 bytes, shared by the next two allocations
+  %7 = tail call noalias i8* @memalign(i64 16, i64 %6) #6
+  %8 = bitcast i8* %7 to float*
+  store float* %8, float** %Qr, align 8, !tbaa !1 ; *Qr = buffer of numX floats (contents are not written here)
+  %9 = tail call noalias i8* @memalign(i64 16, i64 %6) #6
+  %10 = bitcast i8* %9 to float*
+  store float* %10, float** %Qi, align 8, !tbaa !1 ; *Qi = buffer of numX floats (contents are not written here)
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @main(i32 %argc, i8** %argv) #0 { ; Program entry: reads parameters and input data, picks numK, builds the kValues array, calls @computeQ, optionally writes the output, then frees everything and returns 0.
+  %1 = alloca i32, align 4 ; stack slot for argc so it can be passed by pointer
+  %numX = alloca i32, align 4
+  %original_numK = alloca i32, align 4
+  %kx = alloca float*, align 8
+  %ky = alloca float*, align 8
+  %kz = alloca float*, align 8
+  %x = alloca float*, align 8
+  %y = alloca float*, align 8
+  %z = alloca float*, align 8
+  %phiR = alloca float*, align 8
+  %phiI = alloca float*, align 8
+  %timers = alloca %struct.pb_TimerSet, align 8
+  %end = alloca i8*, align 8 ; strtol end-pointer slot
+  store i32 %argc, i32* %1, align 4, !tbaa !5
+  %2 = bitcast %struct.pb_TimerSet* %timers to i8*
+  call void @llvm.lifetime.start(i64 288, i8* %2) #6
+  call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #6
+  %3 = call %struct.pb_Parameters* @pb_ReadParameters(i32* %1, i8** %argv) #6 ; receives a pointer to the argc slot, so the callee is able to modify it
+  %4 = getelementptr inbounds %struct.pb_Parameters* %3, i64 0, i32 1 ; second field of pb_Parameters, loaded below as an array of strings (presumably the input-file list - confirm against the parboil header)
+  %5 = load i8*** %4, align 8, !tbaa !14
+  %6 = load i8** %5, align 8, !tbaa !1 ; first entry
+  %7 = icmp eq i8* %6, null
+  br i1 %7, label %12, label %8 ; no first entry -> error path
+
+; <label>:8                                       ; preds = %0
+  %9 = getelementptr inbounds i8** %5, i64 1
+  %10 = load i8** %9, align 8, !tbaa !1 ; second entry
+  %11 = icmp eq i8* %10, null
+  br i1 %11, label %15, label %12 ; continue only when the second entry is null, i.e. exactly one entry was supplied
+
+; <label>:12                                      ; preds = %8, %0
+  %13 = load %struct._IO_FILE** @stderr, align 8, !tbaa !1
+  %14 = call i64 @fwrite(i8* getelementptr inbounds ([30 x i8]* @.str5, i64 0, i64 0), i64 29, i64 1, %struct._IO_FILE* %13) #7 ; writes the 29-byte message @.str5 to stderr
+  call void @exit(i32 -1) #8
+  unreachable
+
+; <label>:15                                      ; preds = %8
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #6
+  %16 = load i8*** %4, align 8, !tbaa !14
+  %17 = load i8** %16, align 8, !tbaa !1
+  call void @inputData(i8* %17, i32* %original_numK, i32* %numX, float** %kx, float** %ky, float** %kz, float** %x, float** %y, float** %z, float** %phiR, float** %phiI) ; first entry is passed as the first argument; counts and array pointers come back through the stack slots
+  %18 = load i32* %1, align 4, !tbaa !5 ; argc re-read from its slot (pb_ReadParameters was given its address)
+  %19 = icmp slt i32 %18, 2
+  br i1 %19, label %20, label %22 ; fewer than two arguments -> use original_numK unchanged
+
+; <label>:20                                      ; preds = %15
+  %21 = load i32* %original_numK, align 4, !tbaa !5
+  br label %37
+
+; <label>:22                                      ; preds = %15
+  %23 = getelementptr inbounds i8** %argv, i64 1
+  %24 = load i8** %23, align 8, !tbaa !1
+  %25 = call i64 @strtol(i8* %24, i8** %end, i32 10) #6 ; parse argv[1] as a base-10 integer
+  %26 = trunc i64 %25 to i32 ; narrowed to 32 bits without a range check
+  %27 = load i8** %end, align 8, !tbaa !1
+  %28 = load i8** %23, align 8, !tbaa !1
+  %29 = icmp eq i8* %27, %28 ; end == argv[1] means strtol consumed no characters
+  br i1 %29, label %30, label %33
+
+; <label>:30                                      ; preds = %22
+  %31 = load %struct._IO_FILE** @stderr, align 8, !tbaa !1
+  %32 = call i64 @fwrite(i8* getelementptr inbounds ([32 x i8]* @.str6, i64 0, i64 0), i64 31, i64 1, %struct._IO_FILE* %31) #7 ; writes the 31-byte message @.str6 to stderr
+  call void @exit(i32 -1) #8
+  unreachable
+
+; <label>:33                                      ; preds = %22
+  %34 = load i32* %original_numK, align 4, !tbaa !5
+  %35 = icmp slt i32 %26, %34
+  %36 = select i1 %35, i32 %26, i32 %34 ; min(parsed value, original_numK). NOTE(review): no lower bound is applied, so a negative argv[1] yields a negative numK - confirm this is acceptable
+  br label %37
+
+; <label>:37                                      ; preds = %33, %20
+  %38 = phi i32 [ %21, %20 ], [ %34, %33 ] ; original_numK on either path
+  %numK.0 = phi i32 [ %21, %20 ], [ %36, %33 ] ; numK actually used from here on
+  %39 = load i32* %numX, align 4, !tbaa !5
+  %40 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([65 x i8]* @.str7, i64 0, i64 0), i32 %39, i32 %38, i32 %numK.0) #6 ; prints numX, original_numK and numK; the text of @.str7 is not visible in this chunk
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #6
+  %41 = sext i32 %numK.0 to i64
+  %42 = shl nsw i64 %41, 2 ; numK * 4 bytes
+  %43 = call noalias i8* @memalign(i64 16, i64 %42) #6 ; magnitude buffer filled by @computePhiMag below; result is not null-checked
+  %44 = bitcast i8* %43 to float*
+  %45 = sext i32 %39 to i64
+  %46 = shl nsw i64 %45, 2 ; numX * 4 bytes
+  %47 = call noalias i8* @memalign(i64 16, i64 %46) #6 ; buffer later passed to @computeQ as %Qr. NOTE(review): nothing in this function writes it before that call, and the kernel source in computeQ.c reads Qr before accumulating - confirm it is zeroed somewhere
+  %48 = bitcast i8* %47 to float*
+  %49 = call noalias i8* @memalign(i64 16, i64 %46) #6 ; buffer later passed to @computeQ as %Qi; same review note as for %47
+  %50 = bitcast i8* %49 to float*
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #6
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 2) #6
+  %51 = load float** %phiR, align 8, !tbaa !1
+  %52 = load float** %phiI, align 8, !tbaa !1
+  call void @computePhiMag(i32 %numK.0, float* %51, float* %52, float* %44) ; last argument is the numK-float buffer allocated above
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #6
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #6
+  %53 = call noalias i8* @calloc(i64 %41, i64 16) #6 ; numK zero-initialised 16-byte kValues records; result is not null-checked
+  %54 = bitcast i8* %53 to %struct.kValues*
+  %55 = icmp sgt i32 %numK.0, 0
+  br i1 %55, label %.lr.ph, label %._crit_edge ; skip the fill loop when numK <= 0
+
+.lr.ph:                                           ; preds = %37
+  %56 = load float** %kx, align 8, !tbaa !1
+  %57 = load float** %ky, align 8, !tbaa !1
+  %58 = load float** %kz, align 8, !tbaa !1
+  br label %59
+
+; <label>:59                                      ; preds = %59, %.lr.ph
+  %indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %59 ] ; element index i
+  %60 = getelementptr inbounds float* %56, i64 %indvars.iv
+  %61 = load float* %60, align 4, !tbaa !7
+  %62 = getelementptr inbounds %struct.kValues* %54, i64 %indvars.iv, i32 0
+  store float %61, float* %62, align 4, !tbaa !9 ; kVals[i] field 0 = kx[i]
+  %63 = getelementptr inbounds float* %57, i64 %indvars.iv
+  %64 = load float* %63, align 4, !tbaa !7
+  %65 = getelementptr inbounds %struct.kValues* %54, i64 %indvars.iv, i32 1
+  store float %64, float* %65, align 4, !tbaa !11 ; kVals[i] field 1 = ky[i]
+  %66 = getelementptr inbounds float* %58, i64 %indvars.iv
+  %67 = load float* %66, align 4, !tbaa !7
+  %68 = getelementptr inbounds %struct.kValues* %54, i64 %indvars.iv, i32 2
+  store float %67, float* %68, align 4, !tbaa !12 ; kVals[i] field 2 = kz[i]
+  %69 = getelementptr inbounds float* %44, i64 %indvars.iv
+  %70 = load float* %69, align 4, !tbaa !7
+  %71 = getelementptr inbounds %struct.kValues* %54, i64 %indvars.iv, i32 3
+  store float %70, float* %71, align 4, !tbaa !13 ; kVals[i] field 3 = value produced by @computePhiMag
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %numK.0 ; stop after numK elements
+  br i1 %exitcond, label %._crit_edge, label %59
+
+._crit_edge:                                      ; preds = %59, %37
+  call void @free(i8* %43) #6 ; magnitude buffer is no longer needed once copied into kVals
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #6
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 2) #6
+  %72 = load float** %x, align 8, !tbaa !1
+  %73 = load float** %y, align 8, !tbaa !1
+  %74 = load float** %z, align 8, !tbaa !1
+  call void @computeQ(i32 %numK.0, i32 %39, float* %72, float* %73, float* %74, %struct.kValues* %54, float* %48, float* %50) ; last two arguments are the numX-float buffers allocated above
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #6
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #6
+  %75 = getelementptr inbounds %struct.pb_Parameters* %3, i64 0, i32 0 ; first field of pb_Parameters (presumably the output file name - confirm)
+  %76 = load i8** %75, align 8, !tbaa !16
+  %77 = icmp eq i8* %76, null
+  br i1 %77, label %80, label %78 ; output is written only when that field is non-null
+
+; <label>:78                                      ; preds = %._crit_edge
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #6
+  %79 = load i8** %75, align 8, !tbaa !16
+  call void @outputData(i8* %79, float* %48, float* %50, i32 %39)
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #6
+  br label %80
+
+; <label>:80                                      ; preds = %._crit_edge, %78
+  %81 = load float** %kx, align 8, !tbaa !1
+  %82 = bitcast float* %81 to i8*
+  call void @free(i8* %82) #6 ; kx
+  %83 = load float** %ky, align 8, !tbaa !1
+  %84 = bitcast float* %83 to i8*
+  call void @free(i8* %84) #6 ; ky
+  %85 = load float** %kz, align 8, !tbaa !1
+  %86 = bitcast float* %85 to i8*
+  call void @free(i8* %86) #6 ; kz
+  %87 = bitcast float* %72 to i8*
+  call void @free(i8* %87) #6 ; x (pointer value loaded before the computeQ call)
+  %88 = bitcast float* %73 to i8*
+  call void @free(i8* %88) #6 ; y
+  %89 = bitcast float* %74 to i8*
+  call void @free(i8* %89) #6 ; z
+  %90 = bitcast float* %51 to i8*
+  call void @free(i8* %90) #6 ; phiR
+  %91 = bitcast float* %52 to i8*
+  call void @free(i8* %91) #6 ; phiI
+  call void @free(i8* %53) #6 ; kValues array from calloc
+  call void @free(i8* %47) #6 ; buffer passed as Qr
+  call void @free(i8* %49) #6 ; buffer passed as Qi
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #6
+  call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #6
+  call void @pb_FreeParameters(%struct.pb_Parameters* %3) #6
+  call void @llvm.lifetime.end(i64 288, i8* %2) #6
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #6
+
+declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #4
+
+declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #4
+
+declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #4
+
+; Function Attrs: nounwind
+declare i64 @strtol(i8* readonly, i8** nocapture, i32) #1
+
+; Function Attrs: nounwind
+declare noalias i8* @calloc(i64, i64) #1
+
+; Function Attrs: nounwind
+declare void @free(i8* nocapture) #1
+
+declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #4
+
+declare void @pb_FreeParameters(%struct.pb_Parameters*) #4
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #6
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8"
"unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #5 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #6 = { nounwind } +attributes #7 = { cold nounwind } +attributes #8 = { noreturn nounwind } +attributes #9 = { nounwind readnone } + +!llvm.ident = !{!0} + +!0 = metadata !{metadata !"clang version 3.4.2 (tags/RELEASE_34/dot2-final)"} +!1 = metadata !{metadata !2, metadata !2, i64 0} +!2 = metadata !{metadata !"any pointer", metadata !3, i64 0} +!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0} +!4 = metadata !{metadata !"Simple C/C++ TBAA"} +!5 = metadata !{metadata !6, metadata !6, i64 0} +!6 = metadata !{metadata !"int", metadata !3, i64 0} +!7 = metadata !{metadata !8, metadata !8, i64 0} +!8 = metadata !{metadata !"float", metadata !3, i64 0} +!9 = metadata !{metadata !10, metadata !8, i64 0} +!10 = metadata !{metadata !"kValues", metadata !8, i64 0, metadata !8, i64 4, metadata !8, i64 8, metadata !8, i64 12} +!11 = metadata !{metadata !10, metadata !8, i64 4} +!12 = metadata !{metadata !10, metadata !8, i64 8} +!13 = metadata !{metadata !10, metadata !8, i64 12} +!14 = metadata !{metadata !15, metadata !2, i64 8} +!15 = metadata !{metadata !"pb_Parameters", metadata !2, i64 0, metadata !2, i64 8} +!16 = metadata !{metadata !15, metadata !2, i64 0} -- GitLab