From a5ac493b473dd20419e72c0d6a08674d20cb534b Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Sun, 19 Oct 2014 18:24:00 +0000
Subject: [PATCH] Preparing test case for GPU

---
 llvm/test/VISC/gemm_opencl/matrixMul/Makefile |   2 +-
 .../VISC/gemm_opencl/matrixMul/gemm_opencl.c  | 116 +--
 .../gemm_opencl/matrixMul/visc_gemm_opencl.ll | 719 ++++++++++++++++++
 3 files changed, 781 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll

diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/Makefile b/llvm/test/VISC/gemm_opencl/matrixMul/Makefile
index 8426617b18..6e4d4a0521 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul/Makefile
+++ b/llvm/test/VISC/gemm_opencl/matrixMul/Makefile
@@ -27,4 +27,4 @@ $(HOST:%=%.bin):%.bin:%.c
 	$(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@
 
 clean :
-	rm -f *.ll *.bc *.s *.bin
+	rm -f $(HOST:%=%.ll) $(KERNELS:%=%.ll) *.bc *.s *.bin
diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c b/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c
index 89c535add0..a84f67e438 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c
+++ b/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c
@@ -109,52 +109,9 @@ int checkResults(float* A, float* B, float* C) {
   return 1; // Success
 }
 
-// Main
-int main(int argc, char** argv) {
-
-  // seed for rand()
-  srand(2006);
-
-  // Allocate host memory for matrices A and B
-  unsigned int size_A = WA * HA;
-  unsigned int bytes_A = sizeof(float) * size_A;
-  float* h_A = (float*) malloc(bytes_A);
-
-  unsigned int size_B = WB * HB;
-  unsigned int bytes_B = sizeof(float) * size_B;
-  float* h_B = (float*) malloc(bytes_B);
-
-   // Initialize host memory
-   randomInit(h_A, size_A);
-   randomInit(h_B, size_B);
-
-/*
-   // Print A and B
-   printf("\n\nMatrix A\n");
-   for(int i = 0; i < size_A; i++)
-   {
-      printf("%f ", h_A[i]);
-      if(((i + 1) % WA) == 0)
-      printf("\n");
-   }
-
-   printf("\n\nMatrix B\n");
-   for(int i = 0; i < size_B; i++)
-   {
-      printf("%f ", h_B[i]);
-      if(((i + 1) % WB) == 0)
-      printf("\n");
-   }
-*/
-
-  // Allocate host memory for the result matrix C
-  unsigned int size_C = WC * HC;
-  unsigned int bytes_C = sizeof(float) * size_C;
-  float* h_C = (float*) malloc(bytes_C);
-
-   // Initialize OpenCL
-
-   // OpenCL specific variables
+// GPU Computation of MatrixMul
+void computeMatrixMul(float* h_A, unsigned bytes_A, float* h_B, unsigned bytes_B, float* h_C, unsigned bytes_C) {
+ // OpenCL specific variables
   cl_context clGPUContext;
   cl_command_queue clCommandQue;
   cl_program clProgram;
@@ -291,16 +248,7 @@ int main(int argc, char** argv) {
   }
   printf("\n");
  */
-  if(checkResults(h_A, h_B, h_C))
-    printf("\nPass!\n");
-  else
-    printf("\nFailed!\n");
-  printf("\nDone!\n");
-
   // Deallocate memory
-  free(h_A);
-  free(h_B);
-  free(h_C);
 
   clReleaseMemObject(d_A);
   clReleaseMemObject(d_C);
@@ -315,3 +263,61 @@ int main(int argc, char** argv) {
 
 }
 
+/*
+ * Main: host driver for the matrix-multiply test.
+ *
+ * Fills A and B with random values, computes C with computeMatrixMul(),
+ * and validates C with checkResults(). Returns 0 once the verdict has
+ * been printed and 1 if a host buffer cannot be allocated.
+ */
+int main(int argc, char** argv) {
+
+  // seed for rand()
+  srand(2006);
+
+  // Allocate host memory for matrices A and B
+  unsigned int size_A = WA * HA;
+  unsigned int bytes_A = sizeof(float) * size_A;
+  float* h_A = (float*) malloc(bytes_A);
+
+  unsigned int size_B = WB * HB;
+  unsigned int bytes_B = sizeof(float) * size_B;
+  float* h_B = (float*) malloc(bytes_B);
+
+  // Allocate host memory for the result matrix C
+  unsigned int size_C = WC * HC;
+  unsigned int bytes_C = sizeof(float) * size_C;
+  float* h_C = (float*) malloc(bytes_C);
+
+  // Stop before any buffer is touched if an allocation failed;
+  // randomInit() would otherwise write through a NULL pointer.
+  if (h_A == NULL || h_B == NULL || h_C == NULL) {
+    fprintf(stderr, "ERROR: %s\n", "Failure to allocate host memory");
+    free(h_A);
+    free(h_B);
+    free(h_C);
+    return 1;
+  }
+
+  // Initialize host memory
+  randomInit(h_A, size_A);
+  randomInit(h_B, size_B);
+
+  // Compute using OpenCL
+  computeMatrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C);
+
+  // Validate the result and report the verdict
+  if(checkResults(h_A, h_B, h_C))
+    printf("\nPass!\n");
+  else
+    printf("\nFailed!\n");
+  printf("\nDone!\n");
+
+  // Deallocate memory
+  free(h_A);
+  free(h_B);
+  free(h_C);
+
+  return 0;
+}
+
diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
new file mode 100644
index 0000000000..da7bd5ec6a
--- /dev/null
+++ b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
@@ -0,0 +1,719 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
+; RUN: %t.bin 5
+; ModuleID = 'gemm_opencl.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+%struct._cl_mem = type opaque
+%struct._cl_platform_id = type opaque
+%struct._cl_context = type opaque
+%struct._cl_device_id = type opaque
+%struct._cl_command_queue = type opaque
+%struct._cl_program = type opaque
+%struct._cl_kernel = type opaque
+%struct._cl_event = type opaque
+
+; Return Type of VISC Compute Matrix Mul
+%rtype = type {float*, i32}
+%struct.arg = type { float*, i32, float*, i32, %rtype }
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.launch(i8*, i8*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.visc.wait(i8*) #0
+
+@.str = private unnamed_addr constant [3 x i8] c"rb\00", align 1
+@.str1 = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
+@.str2 = private unnamed_addr constant [35 x i8] c"Failure to get number of platforms\00", align 1
+@.str3 = private unnamed_addr constant [28 x i8] c"Failure to get platform IDs\00", align 1
+@.str4 = private unnamed_addr constant [12 x i8] c"  -- %d --\0A\00", align 1
+@.str5 = private unnamed_addr constant [16 x i8] c"  PROFILE = %s\0A\00", align 1
+@.str6 = private unnamed_addr constant [16 x i8] c"  VERSION = %s\0A\00", align 1
+@.str7 = private unnamed_addr constant [13 x i8] c"  NAME = %s\0A\00", align 1
+@.str8 = private unnamed_addr constant [15 x i8] c"  VENDOR = %s\0A\00", align 1
+@.str9 = private unnamed_addr constant [19 x i8] c"  EXTENSIONS = %s\0A\00", align 1
+@.str10 = private unnamed_addr constant [30 x i8] c"Failure to create GPU context\00", align 1
+@.str11 = private unnamed_addr constant [28 x i8] c"Failure to get context info\00", align 1
+@.str12 = private unnamed_addr constant [32 x i8] c"Failure to create command queue\00", align 1
+@.str13 = private unnamed_addr constant [18 x i8] c"matrixMul.nvptx.s\00", align 1
+@.str14 = private unnamed_addr constant [31 x i8] c"Failure to load Program Binary\00", align 1
+@.str15 = private unnamed_addr constant [38 x i8] c"Failure to create program from binary\00", align 1
+@.str16 = private unnamed_addr constant [25 x i8] c"Failure to build program\00", align 1
+@.str17 = private unnamed_addr constant [10 x i8] c"matrixMul\00", align 1
+@.str18 = private unnamed_addr constant [25 x i8] c"Failure to create kernel\00", align 1
+@.str19 = private unnamed_addr constant [32 x i8] c"Failure to set kernel arguments\00", align 1
+@.str20 = private unnamed_addr constant [26 x i8] c"Failure to enqueue kernel\00", align 1
+@.str21 = private unnamed_addr constant [23 x i8] c"Failure to read buffer\00", align 1
+@stderr = external global %struct._IO_FILE*
+@.str25 = private unnamed_addr constant [11 x i8] c"ERROR: %s\0A\00", align 1
+@str = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
+@str26 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
+@str27 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
+
+; Function Attrs: nounwind uwtable
+define void @randomInit(float* nocapture %data, i32 %size) #0 { ; Fills data[0 .. size-1] with rand() scaled by 2^-31; stores nothing when size <= 0.
+entry:
+  %cmp3 = icmp sgt i32 %size, 0 ; signed test: a zero or negative size skips the loop entirely
+  br i1 %cmp3, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %call = tail call i32 @rand() #4
+  %conv = sitofp i32 %call to float
+  %div = fmul float %conv, 0x3E00000000000000 ; the hex constant is 2^-31, so this is a multiply standing in for a divide
+  %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
+  store float %div, float* %arrayidx, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %size ; leave once the 32-bit view of the counter equals size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @rand() #1
+
+; Function Attrs: nounwind uwtable
+define noalias i8* @LoadProgSource(i8* nocapture %cFilename, i64* %szFinalLength) #0 { ; Reads the whole file cFilename into a malloc'd, NUL-terminated buffer; returns null if fopen or fread fails, and writes the byte count to *szFinalLength when that pointer is non-null.
+entry:
+  %call = tail call %struct._IO_FILE* @fopen(i8* %cFilename, i8* getelementptr inbounds ([3 x i8]* @.str, i64 0, i64 0)) #4 ; @.str is the mode string "rb"
+  %cmp = icmp eq %struct._IO_FILE* %call, null
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %call1 = tail call i32 @fseek(%struct._IO_FILE* %call, i64 0, i32 2) #4 ; whence 2: presumably SEEK_END, so ftell below yields the file size
+  %call2 = tail call i64 @ftell(%struct._IO_FILE* %call) #4 ; NOTE(review): the ftell result is used as a size without an error check
+  %call3 = tail call i32 @fseek(%struct._IO_FILE* %call, i64 0, i32 0) #4 ; whence 0: presumably SEEK_SET, rewinding before the read
+  %add = add i64 %call2, 1 ; one extra byte for the terminating NUL
+  %call4 = tail call noalias i8* @malloc(i64 %add) #4 ; NOTE(review): the malloc result is passed to fread without a null check
+  %call5 = tail call i64 @fread(i8* %call4, i64 %call2, i64 1, %struct._IO_FILE* %call) #4 ; the file is requested as a single item of %call2 bytes
+  %cmp6 = icmp eq i64 %call5, 1 ; success means exactly one item was read
+  %call8 = tail call i32 @fclose(%struct._IO_FILE* %call) #4 ; the stream is closed on both the success and the failure path
+  br i1 %cmp6, label %if.end9, label %if.then7
+
+if.then7:                                         ; preds = %if.end
+  tail call void @free(i8* %call4) #4
+  br label %return
+
+if.end9:                                          ; preds = %if.end
+  %cmp11 = icmp eq i64* %szFinalLength, null
+  br i1 %cmp11, label %if.end13, label %if.then12
+
+if.then12:                                        ; preds = %if.end9
+  store i64 %call2, i64* %szFinalLength, align 8, !tbaa !3
+  br label %if.end13
+
+if.end13:                                         ; preds = %if.end9, %if.then12
+  %arrayidx = getelementptr inbounds i8* %call4, i64 %call2
+  store i8 0, i8* %arrayidx, align 1, !tbaa !1 ; NUL-terminate just past the data that was read
+  br label %return
+
+return:                                           ; preds = %entry, %if.end13, %if.then7
+  %retval.0 = phi i8* [ null, %if.then7 ], [ %call4, %if.end13 ], [ null, %entry ]
+  ret i8* %retval.0
+}
+
+; Function Attrs: nounwind
+declare noalias %struct._IO_FILE* @fopen(i8* nocapture, i8* nocapture) #1
+
+; Function Attrs: nounwind
+declare i32 @fseek(%struct._IO_FILE* nocapture, i64, i32) #1
+
+; Function Attrs: nounwind
+declare i64 @ftell(%struct._IO_FILE* nocapture) #1
+
+; Function Attrs: nounwind
+declare noalias i8* @malloc(i64) #1
+
+; Function Attrs: nounwind
+declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
+
+; Function Attrs: nounwind
+declare i32 @fclose(%struct._IO_FILE* nocapture) #1
+
+; Function Attrs: nounwind
+declare void @free(i8* nocapture) #1
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @isEqual(float %a, float %b) #2 { ; Returns 1 when |a - b| < 1e-3 and 0 otherwise.
+entry:
+  %sub = fsub float %a, %b
+  %fabsf = tail call float @fabsf(float %sub) #6
+  %0 = fpext float %fabsf to double ; the comparison against the tolerance is done in double precision
+  %cmp = fcmp olt double %0, 1.000000e-03 ; ordered compare: a NaN difference yields 0
+  %conv1 = zext i1 %cmp to i32
+  ret i32 %conv1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #0 { ; Recomputes the 1024x1024 product of A and B element by element and compares it with C; returns 1 if every element is within 1e-3, otherwise prints the first mismatch and returns 0.
+entry:
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %entry, %for.inc50
+  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
+  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
+  %0 = shl nsw i64 %indvars.iv92, 10 ; i * 1024: element offset of row i
+  br label %for.body7
+
+for.cond4:                                        ; preds = %for.end
+  %inc48 = add nsw i32 %j.079, 1
+  %1 = trunc i64 %indvars.iv.next89 to i32
+  %cmp5 = icmp slt i32 %1, 1024
+  br i1 %cmp5, label %for.body7, label %for.inc50
+
+for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
+  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
+  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
+  %2 = add nsw i64 %indvars.iv88, %0 ; i * 1024 + j: index of the element of C under test
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body12, %for.body7
+  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
+  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
+  %4 = add nsw i64 %indvars.iv, %0 ; i * 1024 + k
+  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
+  %5 = load float* %arrayidx16, align 4, !tbaa !0
+  %6 = shl i64 %indvars.iv, 10 ; k * 1024
+  %7 = add nsw i64 %6, %indvars.iv88 ; k * 1024 + j
+  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
+  %8 = load float* %arrayidx20, align 4, !tbaa !0
+  %mul21 = fmul float %5, %8
+  %add26 = fadd float %3, %mul21 ; running dot product of row i of A and column j of B
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body12
+
+for.end:                                          ; preds = %for.body12
+  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
+  %9 = load float* %arrayidx34, align 4, !tbaa !0
+  %sub.i = fsub float %add26, %9
+  %fabsf.i = tail call float @fabsf(float %sub.i) #6
+  %10 = fpext float %fabsf.i to double
+  %cmp.i = fcmp olt double %10, 1.000000e-03 ; same tolerance test as @isEqual, inlined here
+  %indvars.iv.next89 = add i64 %indvars.iv88, 1
+  br i1 %cmp.i, label %for.cond4, label %if.then
+
+if.then:                                          ; preds = %for.end
+  %conv40 = fpext float %9 to double
+  %conv45 = fpext float %add26 to double
+  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str1, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #4 ; prints i, j, the value from C, then the recomputed value
+  br label %return
+
+for.inc50:                                        ; preds = %for.cond4
+  %indvars.iv.next93 = add i64 %indvars.iv92, 1
+  %inc51 = add nsw i32 %i.081, 1
+  %11 = trunc i64 %indvars.iv.next93 to i32
+  %cmp = icmp slt i32 %11, 1024
+  br i1 %cmp, label %for.cond4.preheader, label %return
+
+return:                                           ; preds = %for.inc50, %if.then
+  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] ; 0 after the first mismatch, 1 when every element passed
+  ret i32 %retval.0
+}
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture, ...) #1
+
+; Function Attrs: nounwind uwtable
+define void @computeMatrixMul(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C) #0 { ; Runs kernel "matrixMul" loaded from matrixMul.nvptx.s through the OpenCL host API: creates buffers for h_A/h_B/h_C, launches a 1024x1024 range with 16x16 work-groups, and reads the result into h_C; every checked failure prints "ERROR: ..." to stderr and calls exit(1).
+entry:
+  %dataBytes = alloca i64, align 8
+  %errcode = alloca i32, align 4
+  %d_A = alloca %struct._cl_mem*, align 8
+  %d_B = alloca %struct._cl_mem*, align 8
+  %d_C = alloca %struct._cl_mem*, align 8
+  %numPlatforms = alloca i32, align 4
+  %buffer = alloca [10240 x i8], align 16
+  %properties = alloca [3 x i64], align 16
+  %binaryLength = alloca i64, align 8
+  %clMatrixMul = alloca i8*, align 8
+  %binaryStatus = alloca i32, align 4
+  %localWorkSize = alloca [2 x i64], align 16
+  %globalWorkSize = alloca [2 x i64], align 16
+  %wA = alloca i32, align 4
+  %wC = alloca i32, align 4
+  %call = call i32 @clGetPlatformIDs(i32 0, %struct._cl_platform_id** null, i32* %numPlatforms) #4 ; first call only queries the platform count
+  store i32 %call, i32* %errcode, align 4, !tbaa !4
+  %cmp.i = icmp eq i32 %call, 0
+  br i1 %cmp.i, label %checkErr.exit, label %if.then.i
+
+if.then.i:                                        ; preds = %entry
+  %0 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([35 x i8]* @.str2, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit:                                    ; preds = %entry
+  %1 = load i32* %numPlatforms, align 4, !tbaa !4
+  %2 = zext i32 %1 to i64
+  %vla = alloca %struct._cl_platform_id*, i64 %2, align 16
+  %call1 = call i32 @clGetPlatformIDs(i32 %1, %struct._cl_platform_id** %vla, i32* null) #4
+  store i32 %call1, i32* %errcode, align 4, !tbaa !4
+  %cmp.i105 = icmp eq i32 %call1, 0
+  br i1 %cmp.i105, label %for.cond.preheader, label %if.then.i107
+
+for.cond.preheader:                               ; preds = %checkErr.exit
+  %3 = load i32* %numPlatforms, align 4, !tbaa !4
+  %cmp148 = icmp eq i32 %3, 0
+  br i1 %cmp148, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %for.cond.preheader
+  %4 = getelementptr inbounds [10240 x i8]* %buffer, i64 0, i64 0
+  br label %for.body
+
+if.then.i107:                                     ; preds = %checkErr.exit
+  %5 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i106 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %5, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([28 x i8]* @.str3, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.0149 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  call void @llvm.lifetime.start(i64 10240, i8* %4) #4
+  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str4, i64 0, i64 0), i32 %i.0149) #4
+  %idxprom = zext i32 %i.0149 to i64
+  %arrayidx = getelementptr inbounds %struct._cl_platform_id** %vla, i64 %idxprom
+  %6 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
+  %call3 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %6, i32 2304, i64 10240, i8* %4, i64* null) #4
+  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8]* @.str5, i64 0, i64 0), i8* %4) #4
+  %7 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
+  %call9 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %7, i32 2305, i64 10240, i8* %4, i64* null) #4
+  %call11 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8]* @.str6, i64 0, i64 0), i8* %4) #4
+  %8 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
+  %call15 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %8, i32 2306, i64 10240, i8* %4, i64* null) #4
+  %call17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str7, i64 0, i64 0), i8* %4) #4
+  %9 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
+  %call21 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %9, i32 2307, i64 10240, i8* %4, i64* null) #4
+  %call23 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([15 x i8]* @.str8, i64 0, i64 0), i8* %4) #4
+  %10 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
+  %call27 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %10, i32 2308, i64 10240, i8* %4, i64* null) #4
+  %call29 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([19 x i8]* @.str9, i64 0, i64 0), i8* %4) #4
+  call void @llvm.lifetime.end(i64 10240, i8* %4) #4
+  %inc = add i32 %i.0149, 1
+  %11 = load i32* %numPlatforms, align 4, !tbaa !4
+  %cmp = icmp ult i32 %inc, %11
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %for.cond.preheader
+  %arrayinit.begin = getelementptr inbounds [3 x i64]* %properties, i64 0, i64 0
+  store i64 4228, i64* %arrayinit.begin, align 16, !tbaa !3 ; 4228 = 0x1084, CL_CONTEXT_PLATFORM in the OpenCL headers
+  %arrayinit.element = getelementptr inbounds [3 x i64]* %properties, i64 0, i64 1
+  %12 = load %struct._cl_platform_id** %vla, align 16, !tbaa !5
+  %13 = ptrtoint %struct._cl_platform_id* %12 to i64
+  ; Store the platform handle at full pointer width. The previous shl/ashr-by-32
+  ; pair kept only its low 32 bits (sign-extended), corrupting 64-bit handles.
+  store i64 %13, i64* %arrayinit.element, align 8, !tbaa !3
+  %arrayinit.element31 = getelementptr inbounds [3 x i64]* %properties, i64 0, i64 2
+  store i64 0, i64* %arrayinit.element31, align 16, !tbaa !3
+  %call33 = call %struct._cl_context* @clCreateContextFromType(i64* %arrayinit.begin, i64 4, void (i8*, i8*, i64, i8*)* null, i8* null, i32* %errcode) #4 ; device type 4 = CL_DEVICE_TYPE_GPU
+  %14 = load i32* %errcode, align 4, !tbaa !4
+  %cmp.i109 = icmp eq i32 %14, 0
+  br i1 %cmp.i109, label %checkErr.exit112, label %if.then.i111
+
+if.then.i111:                                     ; preds = %for.end
+  %15 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i110 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %15, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([30 x i8]* @.str10, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit112:                                 ; preds = %for.end
+  %call34 = call i32 @clGetContextInfo(%struct._cl_context* %call33, i32 4225, i64 0, i8* null, i64* %dataBytes) #4 ; 4225 = 0x1081, CL_CONTEXT_DEVICES; this call only fetches the byte size of the list
+  store i32 %call34, i32* %errcode, align 4, !tbaa !4
+  %16 = load i64* %dataBytes, align 8, !tbaa !3
+  %call35 = call noalias i8* @malloc(i64 %16) #4
+  %17 = bitcast i8* %call35 to %struct._cl_device_id**
+  %call36 = call i32 @clGetContextInfo(%struct._cl_context* %call33, i32 4225, i64 %16, i8* %call35, i64* null) #4
+  %18 = load i32* %errcode, align 4, !tbaa !4
+  %or = or i32 %18, %call36
+  store i32 %or, i32* %errcode, align 4, !tbaa !4
+  %cmp.i113 = icmp eq i32 %or, 0
+  br i1 %cmp.i113, label %checkErr.exit116, label %if.then.i115
+
+if.then.i115:                                     ; preds = %checkErr.exit112
+  %19 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i114 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %19, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([28 x i8]* @.str11, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit116:                                 ; preds = %checkErr.exit112
+  %20 = load %struct._cl_device_id** %17, align 8, !tbaa !5
+  %call38 = call %struct._cl_command_queue* @clCreateCommandQueue(%struct._cl_context* %call33, %struct._cl_device_id* %20, i64 0, i32* %errcode) #4 ; the queue is created on the first device in the list
+  %21 = load i32* %errcode, align 4, !tbaa !4
+  %cmp.i117 = icmp eq i32 %21, 0
+  br i1 %cmp.i117, label %checkErr.exit120, label %if.then.i119
+
+if.then.i119:                                     ; preds = %checkErr.exit116
+  %22 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i118 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %22, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([32 x i8]* @.str12, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit120:                                 ; preds = %checkErr.exit116
+  %conv39 = zext i32 %bytes_C to i64
+  %call40 = call %struct._cl_mem* @clCreateBuffer(%struct._cl_context* %call33, i64 1, i64 %conv39, i8* null, i32* %errcode) #4 ; flags 1 = CL_MEM_READ_WRITE, no host pointer
+  store %struct._cl_mem* %call40, %struct._cl_mem** %d_C, align 8, !tbaa !5
+  %conv41 = zext i32 %bytes_A to i64
+  %23 = bitcast float* %h_A to i8*
+  %call42 = call %struct._cl_mem* @clCreateBuffer(%struct._cl_context* %call33, i64 33, i64 %conv41, i8* %23, i32* %errcode) #4 ; flags 33 = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR
+  store %struct._cl_mem* %call42, %struct._cl_mem** %d_A, align 8, !tbaa !5
+  %conv43 = zext i32 %bytes_B to i64
+  %24 = bitcast float* %h_B to i8*
+  %call44 = call %struct._cl_mem* @clCreateBuffer(%struct._cl_context* %call33, i64 33, i64 %conv43, i8* %24, i32* %errcode) #4 ; NOTE(review): %errcode from the three clCreateBuffer calls is never tested
+  store %struct._cl_mem* %call44, %struct._cl_mem** %d_B, align 8, !tbaa !5
+  %call45 = call i8* @LoadProgSource(i8* getelementptr inbounds ([18 x i8]* @.str13, i64 0, i64 0), i64* %binaryLength)
+  store i8* %call45, i8** %clMatrixMul, align 8, !tbaa !5
+  %cmp46 = icmp eq i8* %call45, null
+  br i1 %cmp46, label %if.then.i122, label %checkErr.exit123
+
+if.then.i122:                                     ; preds = %checkErr.exit120
+  %25 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i121 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %25, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([31 x i8]* @.str14, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit123:                                 ; preds = %checkErr.exit120
+  %call49 = call %struct._cl_program* @clCreateProgramWithBinary(%struct._cl_context* %call33, i32 1, %struct._cl_device_id** %17, i64* %binaryLength, i8** %clMatrixMul, i32* %binaryStatus, i32* %errcode) #4
+  %26 = load i32* %errcode, align 4, !tbaa !4
+  %cmp.i124 = icmp eq i32 %26, 0
+  br i1 %cmp.i124, label %checkErr.exit127, label %if.then.i126
+
+if.then.i126:                                     ; preds = %checkErr.exit123
+  %27 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i125 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %27, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([38 x i8]* @.str15, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit127:                                 ; preds = %checkErr.exit123
+  %call50 = call i32 @clBuildProgram(%struct._cl_program* %call49, i32 0, %struct._cl_device_id** null, i8* null, void (%struct._cl_program*, i8*)* null, i8* null) #4
+  store i32 %call50, i32* %errcode, align 4, !tbaa !4
+  %cmp.i128 = icmp eq i32 %call50, 0
+  br i1 %cmp.i128, label %checkErr.exit131, label %if.then.i130
+
+if.then.i130:                                     ; preds = %checkErr.exit127
+  %28 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i129 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %28, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([25 x i8]* @.str16, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit131:                                 ; preds = %checkErr.exit127
+  %call51 = call %struct._cl_kernel* @clCreateKernel(%struct._cl_program* %call49, i8* getelementptr inbounds ([10 x i8]* @.str17, i64 0, i64 0), i32* %errcode) #4
+  %29 = load i32* %errcode, align 4, !tbaa !4
+  %cmp.i132 = icmp eq i32 %29, 0
+  br i1 %cmp.i132, label %checkErr.exit135, label %if.then.i134
+
+if.then.i134:                                     ; preds = %checkErr.exit131
+  %30 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i133 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %30, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([25 x i8]* @.str18, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit135:                                 ; preds = %checkErr.exit131
+  store i32 1024, i32* %wA, align 4, !tbaa !4
+  store i32 1024, i32* %wC, align 4, !tbaa !4
+  %31 = bitcast %struct._cl_mem** %d_C to i8*
+  %call52 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 0, i64 8, i8* %31) #4 ; kernel args in order: d_C, d_A, d_B, wA, wC
+  store i32 %call52, i32* %errcode, align 4, !tbaa !4
+  %32 = bitcast %struct._cl_mem** %d_A to i8*
+  %call53 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 1, i64 8, i8* %32) #4
+  %33 = load i32* %errcode, align 4, !tbaa !4
+  %or54 = or i32 %33, %call53
+  store i32 %or54, i32* %errcode, align 4, !tbaa !4
+  %34 = bitcast %struct._cl_mem** %d_B to i8*
+  %call55 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 2, i64 8, i8* %34) #4
+  %35 = load i32* %errcode, align 4, !tbaa !4
+  %or56 = or i32 %35, %call55
+  store i32 %or56, i32* %errcode, align 4, !tbaa !4
+  %36 = bitcast i32* %wA to i8*
+  %call57 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 3, i64 4, i8* %36) #4
+  %37 = load i32* %errcode, align 4, !tbaa !4
+  %or58 = or i32 %37, %call57
+  store i32 %or58, i32* %errcode, align 4, !tbaa !4
+  %38 = bitcast i32* %wC to i8*
+  %call59 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 4, i64 4, i8* %38) #4
+  %39 = load i32* %errcode, align 4, !tbaa !4
+  %or60 = or i32 %39, %call59
+  store i32 %or60, i32* %errcode, align 4, !tbaa !4
+  %cmp.i136 = icmp eq i32 %or60, 0 ; the five clSetKernelArg results are OR-ed together and tested once
+  br i1 %cmp.i136, label %checkErr.exit139, label %if.then.i138
+
+if.then.i138:                                     ; preds = %checkErr.exit135
+  %40 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i137 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %40, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([32 x i8]* @.str19, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit139:                                 ; preds = %checkErr.exit135
+  %arrayidx61 = getelementptr inbounds [2 x i64]* %localWorkSize, i64 0, i64 0
+  store i64 16, i64* %arrayidx61, align 16, !tbaa !3
+  %arrayidx62 = getelementptr inbounds [2 x i64]* %localWorkSize, i64 0, i64 1
+  store i64 16, i64* %arrayidx62, align 8, !tbaa !3
+  %arrayidx63 = getelementptr inbounds [2 x i64]* %globalWorkSize, i64 0, i64 0
+  store i64 1024, i64* %arrayidx63, align 16, !tbaa !3
+  %arrayidx64 = getelementptr inbounds [2 x i64]* %globalWorkSize, i64 0, i64 1
+  store i64 1024, i64* %arrayidx64, align 8, !tbaa !3
+  %call67 = call i32 @clEnqueueNDRangeKernel(%struct._cl_command_queue* %call38, %struct._cl_kernel* %call51, i32 2, i64* null, i64* %arrayidx63, i64* %arrayidx61, i32 0, %struct._cl_event** null, %struct._cl_event** null) #4 ; 2-D launch: global size 1024x1024, local size 16x16
+  store i32 %call67, i32* %errcode, align 4, !tbaa !4
+  %cmp.i140 = icmp eq i32 %call67, 0
+  br i1 %cmp.i140, label %checkErr.exit143, label %if.then.i142
+
+if.then.i142:                                     ; preds = %checkErr.exit139
+  %41 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i141 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %41, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([26 x i8]* @.str20, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit143:                                 ; preds = %checkErr.exit139
+  %42 = load %struct._cl_mem** %d_C, align 8, !tbaa !5
+  %43 = bitcast float* %h_C to i8*
+  %call69 = call i32 @clEnqueueReadBuffer(%struct._cl_command_queue* %call38, %struct._cl_mem* %42, i32 1, i64 0, i64 %conv39, i8* %43, i32 0, %struct._cl_event** null, %struct._cl_event** null) #4 ; blocking read (third argument 1) of bytes_C bytes into h_C
+  store i32 %call69, i32* %errcode, align 4, !tbaa !4
+  %cmp.i144 = icmp eq i32 %call69, 0
+  br i1 %cmp.i144, label %checkErr.exit147, label %if.then.i146
+
+if.then.i146:                                     ; preds = %checkErr.exit143
+  %44 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
+  %call.i145 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %44, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([23 x i8]* @.str21, i64 0, i64 0)) #4
+  call void @exit(i32 1) #7
+  unreachable
+
+checkErr.exit147:                                 ; preds = %checkErr.exit143
+  %45 = load %struct._cl_mem** %d_A, align 8, !tbaa !5
+  %call70 = call i32 @clReleaseMemObject(%struct._cl_mem* %45) #4
+  %46 = load %struct._cl_mem** %d_C, align 8, !tbaa !5
+  %call71 = call i32 @clReleaseMemObject(%struct._cl_mem* %46) #4
+  %47 = load %struct._cl_mem** %d_B, align 8, !tbaa !5
+  %call72 = call i32 @clReleaseMemObject(%struct._cl_mem* %47) #4
+  call void @free(i8* %call35) #4 ; frees the device-ID list obtained from clGetContextInfo
+  %48 = load i8** %clMatrixMul, align 8, !tbaa !5
+  call void @free(i8* %48) #4 ; frees the program image returned by LoadProgSource
+  %call73 = call i32 @clReleaseContext(%struct._cl_context* %call33) #4
+  %call74 = call i32 @clReleaseKernel(%struct._cl_kernel* %call51) #4
+  %call75 = call i32 @clReleaseProgram(%struct._cl_program* %call49) #4
+  %call76 = call i32 @clReleaseCommandQueue(%struct._cl_command_queue* %call38) #4
+  ret void
+}
+; External declarations (no bodies in this module): OpenCL entry points and the llvm.lifetime.* intrinsics.
+declare i32 @clGetPlatformIDs(i32, %struct._cl_platform_id**, i32*) #3
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #4
+
+declare i32 @clGetPlatformInfo(%struct._cl_platform_id*, i32, i64, i8*, i64*) #3
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #4
+
+declare %struct._cl_context* @clCreateContextFromType(i64*, i64, void (i8*, i8*, i64, i8*)*, i8*, i32*) #3
+
+declare i32 @clGetContextInfo(%struct._cl_context*, i32, i64, i8*, i64*) #3
+
+declare %struct._cl_command_queue* @clCreateCommandQueue(%struct._cl_context*, %struct._cl_device_id*, i64, i32*) #3
+
+declare %struct._cl_mem* @clCreateBuffer(%struct._cl_context*, i64, i64, i8*, i32*) #3
+
+declare %struct._cl_program* @clCreateProgramWithBinary(%struct._cl_context*, i32, %struct._cl_device_id**, i64*, i8**, i32*, i32*) #3
+
+declare i32 @clBuildProgram(%struct._cl_program*, i32, %struct._cl_device_id**, i8*, void (%struct._cl_program*, i8*)*, i8*) #3
+
+declare %struct._cl_kernel* @clCreateKernel(%struct._cl_program*, i8*, i32*) #3
+
+declare i32 @clSetKernelArg(%struct._cl_kernel*, i32, i64, i8*) #3
+
+declare i32 @clEnqueueNDRangeKernel(%struct._cl_command_queue*, %struct._cl_kernel*, i32, i64*, i64*, i64*, i32, %struct._cl_event**, %struct._cl_event**) #3
+
+declare i32 @clEnqueueReadBuffer(%struct._cl_command_queue*, %struct._cl_mem*, i32, i64, i64, i8*, i32, %struct._cl_event**, %struct._cl_event**) #3
+
+declare i32 @clReleaseMemObject(%struct._cl_mem*) #3
+
+declare i32 @clReleaseContext(%struct._cl_context*) #3
+
+declare i32 @clReleaseKernel(%struct._cl_kernel*) #3
+
+declare i32 @clReleaseProgram(%struct._cl_program*) #3
+
+declare i32 @clReleaseCommandQueue(%struct._cl_command_queue*) #3
+
+; Function Attrs: nounwind uwtable
+define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
+entry:
+  tail call void @srand(i32 2006) #4
+  %call = tail call noalias i8* @malloc(i64 4194304) #4
+  %0 = bitcast i8* %call to float*
+  %call7 = tail call noalias i8* @malloc(i64 4194304) #4
+  br label %for.body.i
+
+for.body.i:                                       ; preds = %for.body.i, %entry
+  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
+  %call.i = tail call i32 @rand() #4
+  %conv.i = sitofp i32 %call.i to float
+  %div.i = fmul float %conv.i, 0x3E00000000000000
+  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
+  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
+  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
+  %lftr.wideiv59 = trunc i64 %indvars.iv.next.i to i32
+  %exitcond60 = icmp eq i32 %lftr.wideiv59, 1048576
+  br i1 %exitcond60, label %for.body.i40.preheader, label %for.body.i
+
+for.body.i40.preheader:                           ; preds = %for.body.i
+  %1 = bitcast i8* %call7 to float*
+  br label %for.body.i40
+
+for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
+  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
+  %call.i33 = tail call i32 @rand() #4
+  %conv.i34 = sitofp i32 %call.i33 to float
+  %div.i35 = fmul float %conv.i34, 0x3E00000000000000
+  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
+  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
+  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
+  %lftr.wideiv57 = trunc i64 %indvars.iv.next.i37 to i32
+  %exitcond58 = icmp eq i32 %lftr.wideiv57, 1048576
+  br i1 %exitcond58, label %randomInit.exit41, label %for.body.i40
+
+randomInit.exit41:                                ; preds = %for.body.i40
+  %call12 = tail call noalias i8* @malloc(i64 4194304) #4
+  %2 = bitcast i8* %call12 to float*
+
+  ; ----- Adding VISC Launch Call -----
+  ; Argument block for the launch: inputs are stored in fields 0-3, the %rtype result is read from field 4
+  %in.addr = alloca %struct.arg
+
+  ; Store the input arguments (h_A, bytes_A, h_B, bytes_B) into fields 0-3
+  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
+  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
+  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
+  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
+  store float* %0, float** %in.addr.h_A
+  store i32 4194304, i32* %in.addr.bytes_A
+  store float* %1, float** %in.addr.h_B
+  store i32 4194304, i32* %in.addr.bytes_B
+
+  ; Cast the block to i8* (operand type must be %struct.arg*, the type produced by the alloca) and launch @MatrixMulRoot
+  %args = bitcast %struct.arg* %in.addr to i8*
+  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32)* @MatrixMulRoot to i8*), i8* %args)
+  ; The launch above stands in for the former direct call to @computeMatrixMul on %0, %1 and %2
+
+  ; Wait for result
+  call void @llvm.visc.wait(i8* %graphID)
+
+  ; Load the %rtype result from field 4 of the argument block
+  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 4
+  %out = load %rtype* %out.addr
+  %out.h_C = extractvalue %rtype %out, 0
+  ; %out.h_C (not the local buffer %2) is the matrix verified by the loops below
+  %out.bytes_C = extractvalue %rtype %out, 1
+
+  ; ----- Completed VISC Launch Call Code -----
+
+  br label %for.cond4.preheader.i
+
+for.cond4.preheader.i:                            ; preds = %for.inc50.i, %randomInit.exit41
+  %indvars.iv92.i = phi i64 [ 0, %randomInit.exit41 ], [ %indvars.iv.next93.i, %for.inc50.i ]
+  %i.081.i = phi i32 [ 0, %randomInit.exit41 ], [ %inc51.i, %for.inc50.i ]
+  %3 = shl nsw i64 %indvars.iv92.i, 10
+  br label %for.body7.i
+
+for.cond4.i:                                      ; preds = %for.end.i
+  %inc48.i = add nsw i32 %j.079.i, 1
+  %4 = trunc i64 %indvars.iv.next89.i to i32
+  %cmp5.i = icmp slt i32 %4, 1024
+  br i1 %cmp5.i, label %for.body7.i, label %for.inc50.i
+
+for.body7.i:                                      ; preds = %for.cond4.i, %for.cond4.preheader.i
+  %indvars.iv88.i = phi i64 [ 0, %for.cond4.preheader.i ], [ %indvars.iv.next89.i, %for.cond4.i ]
+  %j.079.i = phi i32 [ 0, %for.cond4.preheader.i ], [ %inc48.i, %for.cond4.i ]
+  br label %for.body12.i
+
+for.body12.i:                                     ; preds = %for.body12.i, %for.body7.i
+  %indvars.iv.i42 = phi i64 [ 0, %for.body7.i ], [ %indvars.iv.next.i43, %for.body12.i ]
+  %5 = phi float [ 0.000000e+00, %for.body7.i ], [ %add26.i, %for.body12.i ]
+  %6 = add nsw i64 %indvars.iv.i42, %3
+  %arrayidx16.i = getelementptr inbounds float* %0, i64 %6
+  %7 = load float* %arrayidx16.i, align 4, !tbaa !0
+  %8 = shl i64 %indvars.iv.i42, 10
+  %9 = add nsw i64 %8, %indvars.iv88.i
+  %arrayidx20.i = getelementptr inbounds float* %1, i64 %9
+  %10 = load float* %arrayidx20.i, align 4, !tbaa !0
+  %mul21.i = fmul float %7, %10
+  %add26.i = fadd float %5, %mul21.i
+  %indvars.iv.next.i43 = add i64 %indvars.iv.i42, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next.i43 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end.i, label %for.body12.i
+
+for.end.i:                                        ; preds = %for.body12.i
+  %11 = add nsw i64 %indvars.iv88.i, %3
+  ; Index the launch output %out.h_C here; the local buffer %2 is no longer read
+  %arrayidx34.i = getelementptr inbounds float* %out.h_C, i64 %11
+  %12 = load float* %arrayidx34.i, align 4, !tbaa !0
+  %sub.i.i = fsub float %add26.i, %12
+  %fabsf.i.i = tail call float @fabsf(float %sub.i.i) #6
+  %13 = fpext float %fabsf.i.i to double
+  %cmp.i.i = fcmp olt double %13, 1.000000e-03
+  %indvars.iv.next89.i = add i64 %indvars.iv88.i, 1
+  br i1 %cmp.i.i, label %for.cond4.i, label %if.else
+
+for.inc50.i:                                      ; preds = %for.cond4.i
+  %indvars.iv.next93.i = add i64 %indvars.iv92.i, 1
+  %inc51.i = add nsw i32 %i.081.i, 1
+  %14 = trunc i64 %indvars.iv.next93.i to i32
+  %cmp.i = icmp slt i32 %14, 1024
+  br i1 %cmp.i, label %for.cond4.preheader.i, label %if.then
+
+if.then:                                          ; preds = %for.inc50.i
+  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str27, i64 0, i64 0))
+  br label %if.end
+
+if.else:                                          ; preds = %for.end.i
+  %conv40.i = fpext float %12 to double
+  %conv45.i = fpext float %add26.i to double
+  %call46.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str1, i64 0, i64 0), i32 %i.081.i, i32 %j.079.i, double %conv40.i, double %conv45.i) #4
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str, i64 0, i64 0))
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str26, i64 0, i64 0))
+  tail call void @free(i8* %call) #4
+  tail call void @free(i8* %call7) #4
+  tail call void @free(i8* %call12) #4
+  ret i32 0
+}
+; External routines (srand, fprintf, exit, fabsf, puts) declared without bodies in this module.
+; Function Attrs: nounwind
+declare void @srand(i32) #1
+
+; Function Attrs: nounwind
+declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture, ...) #1
+
+; Function Attrs: noreturn nounwind
+declare void @exit(i32) #5
+
+declare float @fabsf(float)
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture) #4
+; Attribute groups, referenced as #N by the declarations, definitions and call sites in this module.
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+attributes #5 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #7 = { noreturn nounwind }
+; TBAA type descriptors used via !tbaa on loads/stores: !2 is the root, !1 its child, and !0, !3, !4, !5 name !1 as parent.
+!0 = metadata !{metadata !"float", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"long", metadata !1}
+!4 = metadata !{metadata !"int", metadata !1}
+!5 = metadata !{metadata !"any pointer", metadata !1}
-- 
GitLab