diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm_2_level_param.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm_2_level_param.ll
new file mode 100644
index 0000000000000000000000000000000000000000..69db224a413206727cbecc8a542dc0b353dfe6ff
--- /dev/null
+++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm_2_level_param.ll
@@ -0,0 +1,455 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
+; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc
+; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s
+; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
+; RUN: %t.bin
+; ModuleID = 'gemm_opencl.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1 ; debug format; only referenced by commented-out printf calls in @matrixMul
+@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1 ; debug format; not referenced anywhere in this file
+@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1 ; debug format; not referenced anywhere in this file
+@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 ; printed by @checkResults on the first mismatching element
+@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 ; only referenced by a commented-out printf in @matrixMul
+@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1 ; only referenced by a commented-out printf in @matrixMul
+@str = private unnamed_addr constant [17 x i8] c"Entered function\00" ; only referenced by a commented-out puts in @matrixMul
+@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00" ; only referenced by a commented-out puts in @matrixMul
+@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00" ; only referenced by a commented-out puts in @matrixMul
+@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00" ; only referenced by a commented-out puts in @matrixMul
+@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00" ; printed by @main when @checkResults returns 0
+@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00" ; printed by @main on both the pass and the fail path
+@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00" ; printed by @main when @checkResults returns non-zero
+
+; Function Attrs: nounwind uwtable
+define void @randomInit(float* nocapture %data, i32 %size) #0 { ; stores rand() * 2^-31 into data[0 .. size-1]; does nothing when size <= 0
+start:
+  %nonempty = icmp sgt i32 %size, 0                 ; guard: the loop below always runs at least once
+  br i1 %nonempty, label %fill, label %done
+
+fill:                                             ; preds = %start, %fill
+  %idx = phi i64 [ %idx.next, %fill ], [ 0, %start ]
+  %sample = tail call i32 @rand() #5
+  %sample.f = sitofp i32 %sample to float
+  %scaled = fmul float %sample.f, 0x3E00000000000000 ; hex literal is the double bit pattern of 2^-31
+  %slot = getelementptr inbounds float* %data, i64 %idx
+  store float %scaled, float* %slot, align 4, !tbaa !0
+  %idx.next = add i64 %idx, 1
+  %idx.next.lo = trunc i64 %idx.next to i32         ; compare in 32 bits because %size is i32
+  %finished = icmp eq i32 %idx.next.lo, %size
+  br i1 %finished, label %done, label %fill
+
+done:                                             ; preds = %fill, %start
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @rand() #1
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @isEqual(float %a, float %b) #2 { ; returns 1 when |a - b| < 1e-3, otherwise 0 (ordered compare, so a NaN difference yields 0)
+top:
+  %delta = fsub float %a, %b
+  %magnitude = tail call float @fabsf(float %delta) #6
+  %magnitude.wide = fpext float %magnitude to double ; the threshold compare is performed in double precision
+  %within = fcmp olt double %magnitude.wide, 1.000000e-03
+  %verdict = zext i1 %within to i32
+  ret i32 %verdict
+}
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 { ; recomputes A*B with a plain triple loop over 1024x1024 row-major matrices and compares each element with C; returns 1 if all are within 1e-3, 0 at the first mismatch
+entry:
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %entry, %for.inc50
+  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ] ; row index i
+  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ] ; 32-bit copy of i, passed to the mismatch printf
+  %0 = shl nsw i64 %indvars.iv92, 10 ; i * 1024: offset of the first element of row i
+  br label %for.body7
+
+for.cond4:                                        ; preds = %for.end
+  %inc48 = add nsw i32 %j.079, 1
+  %1 = trunc i64 %indvars.iv.next89 to i32
+  %cmp5 = icmp slt i32 %1, 1024 ; continue while j + 1 < 1024
+  br i1 %cmp5, label %for.body7, label %for.inc50
+
+for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
+  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ] ; column index j
+  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ] ; 32-bit copy of j, passed to the mismatch printf
+  %2 = add nsw i64 %indvars.iv88, %0 ; i * 1024 + j: index of the element of C being checked
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body12, %for.body7
+  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ] ; reduction index k
+  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ] ; running dot product, starts at 0.0
+  %4 = add nsw i64 %indvars.iv, %0 ; i * 1024 + k
+  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
+  %5 = load float* %arrayidx16, align 4, !tbaa !0 ; A[i][k]
+  %6 = shl i64 %indvars.iv, 10 ; k * 1024
+  %7 = add nsw i64 %6, %indvars.iv88 ; k * 1024 + j
+  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
+  %8 = load float* %arrayidx20, align 4, !tbaa !0 ; B[k][j]
+  %mul21 = fmul float %5, %8
+  %add26 = fadd float %3, %mul21
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024 ; k loop runs exactly 1024 iterations
+  br i1 %exitcond, label %for.end, label %for.body12
+
+for.end:                                          ; preds = %for.body12
+  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
+  %9 = load float* %arrayidx34, align 4, !tbaa !0 ; value under test, C[i][j]
+  %sub.i = fsub float %add26, %9 ; same tolerance test as @isEqual, written out inline
+  %fabsf.i = tail call float @fabsf(float %sub.i) #6
+  %10 = fpext float %fabsf.i to double
+  %cmp.i = fcmp olt double %10, 1.000000e-03 ; ordered compare: a NaN difference counts as a mismatch
+  %indvars.iv.next89 = add i64 %indvars.iv88, 1
+  br i1 %cmp.i, label %for.cond4, label %if.then
+
+if.then:                                          ; preds = %for.end
+  %conv40 = fpext float %9 to double ; widened for the variadic printf call
+  %conv45 = fpext float %add26 to double
+  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5 ; reports i, j, the value read from C, then the recomputed value
+  br label %return
+
+for.inc50:                                        ; preds = %for.cond4
+  %indvars.iv.next93 = add i64 %indvars.iv92, 1
+  %inc51 = add nsw i32 %i.081, 1
+  %11 = trunc i64 %indvars.iv.next93 to i32
+  %cmp = icmp slt i32 %11, 1024 ; continue while i + 1 < 1024
+  br i1 %cmp, label %for.cond4.preheader, label %return
+
+return:                                           ; preds = %for.inc50, %if.then
+  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] ; 0 = mismatch found, 1 = every element matched
+  ret i32 %retval.0
+}
+
+; Function Attrs: nounwind
+declare noalias i8* @malloc(i64) #1
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture, ...) #1
+
+; --------------- VISC Intrinsics ---------------
+; Return Type of VISC Compute Matrix Mul
+%rtype = type {} ; empty struct: the node functions in this file return no data through it
+%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, i32, i32, %rtype }> ; packed launch record: fields 0-10 are filled by @main in @MatrixMulRoot's parameter order, field 11 is read back after the wait
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.launch(i8*, i8*) #0 ; called in @main with the bitcast root function and the packed argument record; the result is passed to llvm.visc.wait
+
+; Function Attrs: nounwind
+declare void @llvm.visc.wait(i8*) #0 ; called in @main on the handle returned by llvm.visc.launch
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode(i8*) #0 ; declared but not called in this file
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode1D(i8*, i32) #0 ; declared but not called in this file
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 ; used by @MatrixMulRoot and @MatrixMulInternal: bitcast node function plus two i32 values (presumably the x and y extents - confirm against the VISC docs)
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 ; declared but not called in this file
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; declared but not called in this file
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.getNode() #0 ; used by @matrixMul to obtain the handle it queries for instance IDs
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.getParentNode(i8*) #0 ; used by @matrixMul on the handle returned by getNode
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNumDims(i8*) #0 ; declared but not called in this file
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 ; used by @matrixMul on both the node and its parent
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 ; used by @matrixMul on both the node and its parent
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 ; used by @matrixMul on its own node only
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0 ; used by @matrixMul on its own node only
+
+; Function Attrs: nounwind
+declare void @llvm.visc.bind.input(i8*, i32, i32) ; NOTE(review): no attribute group is attached here despite the "nounwind" comment above
+
+; Function Attrs: nounwind
+declare void @llvm.visc.bind.output(i8*, i32, i32) ; declared but not called in this file; NOTE(review): no attribute group attached despite the comment above
+; ----------------- VISC intrinsics end ------------------
+
+; Function Attrs: nounwind uwtable
+define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { ; node function handed to createNode2D by @MatrixMulInternal: computes C[row*n + col] = sum over i in [0,k) of A[row*k + i] * B[i*n + col]; %m and the %bytes_* arguments are not read in the body
+entry:
+  ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
+  
+  ; ------------------------- VISC changes ------------------
+  ; Replace get_global_id calls: global id = parent's getNodeInstanceID * this node's getNumNodeInstances + this node's getNodeInstanceID
+  ; Replaced statement -- 
+  ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
+  ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
+  %this_node = call i8* @llvm.visc.getNode()
+  %Lx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
+  %Ly = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
+  %LLimitx = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
+  %LLimity = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node)
+
+  %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node)
+  %Gx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node)
+  %Gy = call i32 @llvm.visc.getNodeInstanceID.y(i8* %parent_node)
+
+  %tmpx = mul i32 %Gx, %LLimitx
+  %tmpy = mul i32 %Gy, %LLimity
+
+  %call1 = add i32 %tmpx, %Lx ; global x index; used below as the column of B and C
+  %call2 = add i32 %tmpy, %Ly ; global y index; used below as the row of A and C
+
+  ;%printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call1) #5
+  ;%printcall2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call2) #5
+
+  ; ---------------------- VISC changes End ------------------
+
+  ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5
+  %cmp44 = icmp eq i32 %k, 0 ; k == 0: skip the reduction so 0.0 is stored
+  br i1 %cmp44, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %mul = mul i32 %call2, %k ; row * k: start of this row in A (loop-invariant)
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] ; reduction index i
+  %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ] ; running sum
+  %0 = trunc i64 %indvars.iv to i32
+  %add = add i32 %0, %mul ; row * k + i
+  %mul4 = mul i32 %0, %n
+  %add5 = add i32 %mul4, %call1 ; i * n + col
+  ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5
+  %idxprom = zext i32 %add to i64 ; index arithmetic is 32-bit and zero-extended, i.e. treated as unsigned
+  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
+  %1 = load float* %arrayidx, align 4, !tbaa !0
+  %idxprom11 = zext i32 %add5 to i64
+  %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11
+  %2 = load float* %arrayidx12, align 4, !tbaa !0
+  %mul13 = fmul float %1, %2
+  %add14 = fadd float %res.046, %mul13
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %k
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ] ; 0.0 when k == 0, otherwise the finished sum
+  ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0))
+  %mul16 = mul i32 %call2, %n
+  %add17 = add i32 %mul16, %call1 ; row * n + col
+  %idxprom18 = zext i32 %add17 to i64
+  %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18
+  store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0 ; the result leaves through memory; no bounds check against %m or %n is performed
+  ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
+  ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
+  ret %rtype undef ; %rtype is an empty struct, so there is nothing to populate
+}
+
+; ----------------- VISC SGEMM internal node ----------------
+define %rtype @MatrixMulInternal(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA, i32 %blocksize) { ; middle level of the graph: instantiated by @MatrixMulRoot, creates the @matrixMul node and binds its own arguments 0-8 to it
+  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 %blocksize, i32 %blocksize) ; %blocksize is passed for both i32 arguments and is not forwarded to @matrixMul
+  ; Bind Inputs (the two indices are presumably source parameter, then destination parameter - compare the 10 -> 9 binding in @MatrixMulRoot)
+  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
+  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
+  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
+  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
+  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
+  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
+  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
+  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
+  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
+  ; Bind Outputs (none: no llvm.visc.bind.output call is made here)
+  ret %rtype undef
+}
+
+; ----------------- VISC SGEMM root node ----------------
+define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA, i32 %gridsize, i32 %blocksize) { ; top level of the graph: the function @main passes to llvm.visc.launch; creates the @MatrixMulInternal node and binds arguments to it
+  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @MatrixMulInternal to i8*),i32 %gridsize, i32 %gridsize) ; %gridsize is passed for both i32 arguments and is consumed here (it is not bound to the child)
+  ; Bind Inputs (arguments 0-8 keep their position in the child)
+  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
+  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
+  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
+  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
+  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
+  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
+  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
+  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
+  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
+  call void @llvm.visc.bind.input(i8* %kernel, i32 10, i32 9); blocksize (parameter 10 here, parameter 9 of @MatrixMulInternal)
+  ; Bind Outputs (none: no llvm.visc.bind.output call is made here)
+  ret %rtype undef
+}
+
+; Function Attrs: noinline nounwind uwtable (the function below is commented out; @main launches @MatrixMulRoot instead)
+;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
+;entry:
+;  %cmp18 = icmp eq i32 %m, 0
+;  %cmp215 = icmp eq i32 %n, 0
+;  %or.cond = or i1 %cmp18, %cmp215
+;  br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us
+;
+;for.inc4.us:                                      ; preds = %for.body3.us
+;  %0 = extractvalue %rtype %call.us, 0
+;  %1 = extractvalue %rtype %call.us, 1
+;  %inc5.us = add i32 %i.019.us, 1
+;  %exitcond24 = icmp eq i32 %inc5.us, %m
+;  br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us
+;
+;for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
+;  %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
+;  %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
+;  %inc.us = add i32 %j.016.us, 1
+;  %exitcond = icmp eq i32 %inc.us, %n
+;  br i1 %exitcond, label %for.inc4.us, label %for.body3.us
+;
+;for.body3.lr.ph.us:                               ; preds = %entry, %for.inc4.us
+;  %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ]
+;  br label %for.body3.us
+;
+;for.end6:                                         ; preds = %for.inc4.us, %entry
+;  %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ]
+;  %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ]
+;  %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0
+;  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1
+;  ret %rtype %.fca.1.insert
+;}
+
+; Function Attrs: nounwind uwtable
+define i32 @main(i32 %argc, i8** nocapture %argv) #0 { ; test driver: fills two 1024x1024 float matrices with random values, launches the @MatrixMulRoot graph, verifies the result with @checkResults, prints Pass!/Failed! then Done!, and always returns 0
+entry:
+  tail call void @srand(i32 2006) #5 ; fixed seed
+  %call = tail call noalias i8* @malloc(i64 4194304) #5 ; A: 1024 * 1024 floats * 4 bytes; the result is not null-checked
+  %0 = bitcast i8* %call to float*
+  %call7 = tail call noalias i8* @malloc(i64 4194304) #5 ; B: same size; the result is not null-checked
+  br label %for.body.i
+
+for.body.i:                                       ; preds = %for.body.i, %entry
+  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ] ; fills A; same loop body as @randomInit
+  %call.i = tail call i32 @rand() #5
+  %conv.i = sitofp i32 %call.i to float
+  %div.i = fmul float %conv.i, 0x3E00000000000000 ; hex literal is the double bit pattern of 2^-31
+  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
+  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
+  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
+  %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32
+  %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576 ; 1024 * 1024 elements
+  br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i
+
+for.body.i40.preheader:                           ; preds = %for.body.i
+  %1 = bitcast i8* %call7 to float*
+  br label %for.body.i40
+
+for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
+  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ] ; fills B; same loop body as @randomInit
+  %call.i33 = tail call i32 @rand() #5
+  %conv.i34 = sitofp i32 %call.i33 to float
+  %div.i35 = fmul float %conv.i34, 0x3E00000000000000
+  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
+  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
+  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1048576
+  br i1 %exitcond, label %randomInit.exit41, label %for.body.i40
+
+randomInit.exit41:                                ; preds = %for.body.i40
+  %call12 = tail call noalias i8* @malloc(i64 4194304) #5 ; C (output): left uninitialised; the result is not null-checked
+  %2 = bitcast i8* %call12 to float*
+
+  ; ---------------------------------- Adding VISC Launch Call --------------------------------
+  ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024)
+  ; Setting up launch input args
+  %in.addr = alloca %struct.arg ; alloca outside the entry block; this block executes once
+
+  ; Store arguments
+  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
+  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
+  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
+  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
+  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
+  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
+  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
+  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
+  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
+  %in.addr.gridsize = getelementptr %struct.arg* %in.addr, i32 0, i32 9
+  %in.addr.blocksize = getelementptr %struct.arg* %in.addr, i32 0, i32 10
+
+  store float* %0, float** %in.addr.h_A
+  store i64 4194304, i64* %in.addr.bytes_A
+  store float* %1, float** %in.addr.h_B
+  store i64 4194304, i64* %in.addr.bytes_B
+  store float* %2, float** %in.addr.h_C
+  store i64 4194304, i64* %in.addr.bytes_C
+  store i32 1024, i32* %in.addr.WA
+  store i32 1024, i32* %in.addr.WB
+  store i32 1024, i32* %in.addr.HA
+  store i32 64, i32* %in.addr.gridsize ; 64 * 16 = 1024, the matrix dimension
+  store i32 16, i32* %in.addr.blocksize
+
+  ; Change type to i8* and VISC Launch call
+  %args = bitcast %struct.arg* %in.addr to i8*
+  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
+
+  ; Wait for result
+  call void @llvm.visc.wait(i8* %graphID)
+
+  ; Get the result
+  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 11
+  %out = load %rtype* %out.addr ; %out is never used afterwards; the product is read from buffer %2 instead
+  ; -------------------------------- Completed VISC Launch Call --------------------------------
+
+  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2) ; 1 = all elements matched, 0 = mismatch
+  %tobool = icmp eq i32 %call14, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %randomInit.exit41
+  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0)) ; "Pass!"
+  br label %if.end
+
+if.else:                                          ; preds = %randomInit.exit41
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0)) ; "Failed!"
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0)) ; "Done!" on both paths
+  tail call void @free(i8* %call) #5
+  tail call void @free(i8* %call7) #5
+  tail call void @free(i8* %call12) #5
+  ret i32 0 ; the exit status is 0 even when verification failed
+}
+
+; Function Attrs: nounwind
+declare void @srand(i32) #1
+
+; Function Attrs: nounwind
+declare void @free(i8* nocapture) #1
+
+declare float @fabsf(float)
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture) #5
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { nounwind }
+attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"float", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}