From ead1071cf0ea48de134aa2e7b3e5d104d06a8b7f Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Fri, 31 Oct 2014 15:09:35 +0000
Subject: [PATCH] Matrix Multiply example added

---
 llvm/test/VISC/MatrixMultiplication/Makefile  |  30 ++
 llvm/test/VISC/MatrixMultiplication/gemm.c    | 168 +++++++
 .../VISC/MatrixMultiplication/visc_gemm.ll    | 412 ++++++++++++++++++
 .../gemm_opencl/matrixMul/visc_gemm_opencl.ll |   9 +-
 4 files changed, 615 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/VISC/MatrixMultiplication/Makefile
 create mode 100644 llvm/test/VISC/MatrixMultiplication/gemm.c
 create mode 100644 llvm/test/VISC/MatrixMultiplication/visc_gemm.ll

diff --git a/llvm/test/VISC/MatrixMultiplication/Makefile b/llvm/test/VISC/MatrixMultiplication/Makefile
new file mode 100644
index 0000000000..6e4d4a0521
--- /dev/null
+++ b/llvm/test/VISC/MatrixMultiplication/Makefile
@@ -0,0 +1,30 @@
+PASSES :=
+
+.PHONY: clean
+
+LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
+LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
+HOST:=gemm_opencl
+KERNELS:=matrixMul
+LLVM_CC:=$(LLVM_INSTALL)/bin/clang
+LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link
+
+all: $(KERNELS:%=%.nvptx.s) $(HOST:%=%.ll) $(HOST:%=%.bin)
+
+$(KERNELS:%=%.ll):%.ll:%.cl
+	$(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
+
+$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
+	$(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
+
+$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
+	$(LLVM_CC) -O3 -target nvptx $< -S -o $@
+
+$(HOST:%=%.ll):%.ll:%.c
+	$(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@
+
+$(HOST:%=%.bin):%.bin:%.c
+	$(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@
+
+clean :
+	rm -f $(HOST).ll $(KERNELS).ll *.bc *.s *.bin
diff --git a/llvm/test/VISC/MatrixMultiplication/gemm.c b/llvm/test/VISC/MatrixMultiplication/gemm.c
new file mode 100644
index 0000000000..7356b8293d
--- /dev/null
+++ b/llvm/test/VISC/MatrixMultiplication/gemm.c
@@ -0,0 +1,168 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+
+#define WA 1024
+#define HA 1024
+#define WB 1024
+#define HB WA
+#define WC WB
+#define HC HA
+
+
+
+// Thread block size
+#define BLOCK_SIZE 16
+
+// Allocates a matrix with random float entries.
+void randomInit(float* data, int size) {
+  for (int i = 0; i < size; ++i)
+    data[i] = rand() / (float)RAND_MAX;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+
+// Check bool
+int isEqual(float a, float b) {
+  return (fabs(a-b) < 0.001);
+}
+
+// Check Results
+
+__attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) {
+  unsigned int size_A = WA * HA;
+  unsigned int size_B = WB * HB;
+  unsigned int size_C = WC * HC;
+  unsigned int bytesC = sizeof(float) * size_C;
+  float* goldC = (float*) malloc(bytesC);
+  for (int i=0; i < HC; i++) {
+    for (int j=0; j < WC; j++) {
+      goldC[i*WC + j] = 0;
+      for (int k=0; k < HB; k++) {
+        goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j];
+      }
+      if(!isEqual(goldC[i*WC + j], C[i*WC + j])) {
+        printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]);
+        return 0;
+      }
+    }
+  }
+  return 1; // Success
+}
+
+
+typedef struct {
+  float* Out;
+  int bytes_Out;
+} rtype;
+
+rtype matrixMul(float* A, int bytes_A, float* B, int bytes_B, float* C, int bytes_C, unsigned k, unsigned n, unsigned m, int idx_x, int idx_y) {
+
+  printf("Entered function\n");
+  int tx = get_global_id(0); //2D Global Thread ID x
+  int ty = get_global_id(1); //2D Global Thread ID y
+  //int tx = get_global_id(0); //2D Global Thread ID x
+  //int ty = get_global_id(1); //2D Global Thread ID y
+
+  printf("Computing element (%d, %d)\n", tx, ty);
+  // Initialize accumulator
+  float res = 0.0f;
+
+  // Perform dot-product of row-column
+  for (int i = 0; i < k; i++) {
+    printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx);
+    res += A[ty*k+i] * B[i*n+tx];
+  }
+
+  printf("Result computed\n");
+  // Write in device memory
+  C[ty*n+tx] = res;
+
+  printf("Result written to C\n");
+  rtype Output;
+  Output.Out = C;
+  Output.bytes_Out = bytes_C;
+  printf("Output allocated\n");
+  return Output;
+
+}
+
+
+// CPU Computation of MatrixMul
+__attribute__ ((noinline)) rtype computeMatrixMul(float* h_A, unsigned bytes_A, float* h_B, unsigned bytes_B, float* h_C, unsigned bytes_C, unsigned k, unsigned n, unsigned m ) {
+
+  rtype Out;
+  for(unsigned i=0; i<m; i++) {
+    for(unsigned j=0; j < n; j++) {
+      Out = matrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, k, n, m, i, j);
+    }
+  }
+  return Out;
+
+}
+
+// Main
+int main(int argc, char** argv) {
+
+  // seed for rand()
+  srand(2006);
+
+  // Allocate host memory for matrices A and B
+  unsigned int size_A = WA * HA;
+  unsigned int bytes_A = sizeof(float) * size_A;
+  float* h_A = (float*) malloc(bytes_A);
+
+  unsigned int size_B = WB * HB;
+  unsigned int bytes_B = sizeof(float) * size_B;
+  float* h_B = (float*) malloc(bytes_B);
+
+   // Initialize host memory
+   randomInit(h_A, size_A);
+   randomInit(h_B, size_B);
+
+/*
+   // Print A and B
+   printf("\n\nMatrix A\n");
+   for(int i = 0; i < size_A; i++)
+   {
+      printf("%f ", h_A[i]);
+      if(((i + 1) % WA) == 0)
+      printf("\n");
+   }
+
+   printf("\n\nMatrix B\n");
+   for(int i = 0; i < size_B; i++)
+   {
+      printf("%f ", h_B[i]);
+      if(((i + 1) % WB) == 0)
+      printf("\n");
+   }
+*/
+
+  // Allocate host memory for the result matrix C
+  unsigned int size_C = WC * HC;
+  unsigned int bytes_C = sizeof(float) * size_C;
+  float* h_C = (float*) malloc(bytes_C);
+
+   // Compute using OpenCL
+  rtype Output = computeMatrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, HA);
+
+  if(checkResults(h_A, h_B, Output.Out))
+    printf("\nPass!\n");
+  else
+    printf("\nFailed!\n");
+  printf("\nDone!\n");
+
+  // Deallocate memory
+  free(h_A);
+  free(h_B);
+  free(h_C);
+}
+
diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll
new file mode 100644
index 0000000000..2fef62369a
--- /dev/null
+++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll
@@ -0,0 +1,412 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
+; RUN: %t.bin
+; ModuleID = 'gemm_opencl.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
+@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
+@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1
+@str = private unnamed_addr constant [17 x i8] c"Entered function\00"
+@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00"
+@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00"
+@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00"
+@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
+@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
+@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
+
+; Function Attrs: nounwind uwtable
+define void @randomInit(float* nocapture %data, i32 %size) #0 {
+entry:
+  %cmp3 = icmp sgt i32 %size, 0
+  br i1 %cmp3, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %call = tail call i32 @rand() #5
+  %conv = sitofp i32 %call to float
+  %div = fmul float %conv, 0x3E00000000000000
+  %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
+  store float %div, float* %arrayidx, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @rand() #1
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @isEqual(float %a, float %b) #2 {
+entry:
+  %sub = fsub float %a, %b
+  %fabsf = tail call float @fabsf(float %sub) #6
+  %0 = fpext float %fabsf to double
+  %cmp = fcmp olt double %0, 1.000000e-03
+  %conv1 = zext i1 %cmp to i32
+  ret i32 %conv1
+}
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 {
+entry:
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %entry, %for.inc50
+  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
+  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
+  %0 = shl nsw i64 %indvars.iv92, 10
+  br label %for.body7
+
+for.cond4:                                        ; preds = %for.end
+  %inc48 = add nsw i32 %j.079, 1
+  %1 = trunc i64 %indvars.iv.next89 to i32
+  %cmp5 = icmp slt i32 %1, 1024
+  br i1 %cmp5, label %for.body7, label %for.inc50
+
+for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
+  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
+  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
+  %2 = add nsw i64 %indvars.iv88, %0
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body12, %for.body7
+  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
+  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
+  %4 = add nsw i64 %indvars.iv, %0
+  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
+  %5 = load float* %arrayidx16, align 4, !tbaa !0
+  %6 = shl i64 %indvars.iv, 10
+  %7 = add nsw i64 %6, %indvars.iv88
+  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
+  %8 = load float* %arrayidx20, align 4, !tbaa !0
+  %mul21 = fmul float %5, %8
+  %add26 = fadd float %3, %mul21
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body12
+
+for.end:                                          ; preds = %for.body12
+  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
+  %9 = load float* %arrayidx34, align 4, !tbaa !0
+  %sub.i = fsub float %add26, %9
+  %fabsf.i = tail call float @fabsf(float %sub.i) #6
+  %10 = fpext float %fabsf.i to double
+  %cmp.i = fcmp olt double %10, 1.000000e-03
+  %indvars.iv.next89 = add i64 %indvars.iv88, 1
+  br i1 %cmp.i, label %for.cond4, label %if.then
+
+if.then:                                          ; preds = %for.end
+  %conv40 = fpext float %9 to double
+  %conv45 = fpext float %add26 to double
+  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5
+  br label %return
+
+for.inc50:                                        ; preds = %for.cond4
+  %indvars.iv.next93 = add i64 %indvars.iv92, 1
+  %inc51 = add nsw i32 %i.081, 1
+  %11 = trunc i64 %indvars.iv.next93 to i32
+  %cmp = icmp slt i32 %11, 1024
+  br i1 %cmp, label %for.cond4.preheader, label %return
+
+return:                                           ; preds = %for.inc50, %if.then
+  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ]
+  ret i32 %retval.0
+}
+
+; Function Attrs: nounwind
+declare noalias i8* @malloc(i64) #1
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture, ...) #1
+
+; --------------- VISC Intrinsics ---------------
+; Return Type of VISC Compute Matrix Mul
+%rtype = type {float*, i32}
+%struct.arg = type { float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype }
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.launch(i8*, i8*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.visc.wait(i8*) #0
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode(i8*) #0
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode1D(i8*, i32) #0
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.getNode() #0
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.getParentNode(i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNumDims(i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.visc.bind.input(i8*, i32, i32)
+
+; Function Attrs: nounwind
+declare void @llvm.visc.bind.output(i8*, i32, i32)
+; ----------------- VISC intrinsics end ------------------
+
+; Function Attrs: nounwind uwtable
+define %rtype @matrixMul(float* nocapture %A, i32 %bytes_A, float* nocapture %B, i32 %bytes_B, float* %C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
+entry:
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
+  
+  ; ------------------------- VISC changes ------------------
+  ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
+  ; Replaced statement -- 
+  ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
+  ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
+  %this_node = call i8* @llvm.visc.getNode()
+  %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
+  %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
+  ; ---------------------- VISC changes End ------------------
+
+  %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5
+  %cmp44 = icmp eq i32 %k, 0
+  br i1 %cmp44, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %mul = mul i32 %call2, %k
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ]
+  %0 = trunc i64 %indvars.iv to i32
+  %add = add i32 %0, %mul
+  %mul4 = mul i32 %0, %n
+  %add5 = add i32 %mul4, %call1
+  %call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5
+  %idxprom = zext i32 %add to i64
+  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
+  %1 = load float* %arrayidx, align 4, !tbaa !0
+  %idxprom11 = zext i32 %add5 to i64
+  %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11
+  %2 = load float* %arrayidx12, align 4, !tbaa !0
+  %mul13 = fmul float %1, %2
+  %add14 = fadd float %res.046, %mul13
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %k
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ]
+  %puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0))
+  %mul16 = mul i32 %call2, %n
+  %add17 = add i32 %mul16, %call1
+  %idxprom18 = zext i32 %add17 to i64
+  %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18
+  store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0
+  %puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
+  %puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
+  %.fca.0.insert = insertvalue %rtype undef, float* %C, 0
+  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %bytes_C, 1
+  ret %rtype %.fca.1.insert
+}
+
+; ----------------- VISC SGEMM root node ----------------
+define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
+  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA)
+  ; Bind Inputs
+  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
+  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
+  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
+  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
+  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
+  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
+  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
+  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
+  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
+  ; Bind Outputs
+  call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); d_C
+  call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1); bytes_C
+  ret %rtype zeroinitializer
+}
+
+; Function Attrs: noinline nounwind uwtable
+;define %rtype @computeMatrixMul(float* nocapture %h_A, i32 %bytes_A, float* nocapture %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
+;entry:
+;  %cmp18 = icmp eq i32 %m, 0
+;  %cmp215 = icmp eq i32 %n, 0
+;  %or.cond = or i1 %cmp18, %cmp215
+;  br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us
+;
+;for.inc4.us:                                      ; preds = %for.body3.us
+;  %0 = extractvalue %rtype %call.us, 0
+;  %1 = extractvalue %rtype %call.us, 1
+;  %inc5.us = add i32 %i.019.us, 1
+;  %exitcond24 = icmp eq i32 %inc5.us, %m
+;  br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us
+;
+;for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
+;  %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
+;  %call.us = tail call %rtype @matrixMul(float* %h_A, i32 undef, float* %h_B, i32 undef, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
+;  %inc.us = add i32 %j.016.us, 1
+;  %exitcond = icmp eq i32 %inc.us, %n
+;  br i1 %exitcond, label %for.inc4.us, label %for.body3.us
+;
+;for.body3.lr.ph.us:                               ; preds = %entry, %for.inc4.us
+;  %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ]
+;  br label %for.body3.us
+;
+;for.end6:                                         ; preds = %for.inc4.us, %entry
+;  %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ]
+;  %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ]
+;  %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0
+;  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1
+;  ret %rtype %.fca.1.insert
+;}
+
+; Function Attrs: nounwind uwtable
+define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
+entry:
+  tail call void @srand(i32 2006) #5
+  %call = tail call noalias i8* @malloc(i64 4194304) #5
+  %0 = bitcast i8* %call to float*
+  %call7 = tail call noalias i8* @malloc(i64 4194304) #5
+  br label %for.body.i
+
+for.body.i:                                       ; preds = %for.body.i, %entry
+  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
+  %call.i = tail call i32 @rand() #5
+  %conv.i = sitofp i32 %call.i to float
+  %div.i = fmul float %conv.i, 0x3E00000000000000
+  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
+  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
+  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
+  %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32
+  %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576
+  br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i
+
+for.body.i40.preheader:                           ; preds = %for.body.i
+  %1 = bitcast i8* %call7 to float*
+  br label %for.body.i40
+
+for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
+  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
+  %call.i33 = tail call i32 @rand() #5
+  %conv.i34 = sitofp i32 %call.i33 to float
+  %div.i35 = fmul float %conv.i34, 0x3E00000000000000
+  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
+  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
+  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1048576
+  br i1 %exitcond, label %randomInit.exit41, label %for.body.i40
+
+randomInit.exit41:                                ; preds = %for.body.i40
+  %call12 = tail call noalias i8* @malloc(i64 4194304) #5
+  %2 = bitcast i8* %call12 to float*
+
+  ; ---------------------------------- Adding VISC Launch Call --------------------------------
+  ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024)
+  ; Setting up launch input args
+  %in.addr = alloca %struct.arg
+
+  ; Store arguments
+  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
+  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
+  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
+  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
+  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
+  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
+  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
+  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
+  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
+
+  store float* %0, float** %in.addr.h_A
+  store i32 4194304, i32* %in.addr.bytes_A
+  store float* %1, float** %in.addr.h_B
+  store i32 4194304, i32* %in.addr.bytes_B
+  store float* %2, float** %in.addr.h_C
+  store i32 4194304, i32* %in.addr.bytes_C
+  store i32 1024, i32* %in.addr.WA
+  store i32 1024, i32* %in.addr.WB
+  store i32 1024, i32* %in.addr.HA
+
+  ; Change type to i8* and VISC Launch call
+  %args = bitcast %struct.arg* %in.addr to i8*
+  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
+  ;tail call void @computeMatrixMul(float* %0, i32 4194304, float* %1, i32 4194304, float* %2, i32 4194304)
+
+  ; Wait for result
+  call void @llvm.visc.wait(i8* %graphID)
+
+  ; Get the result
+  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
+  %out = load %rtype* %out.addr
+  ; -------------------------------- Completed VISC Launch Call --------------------------------  
+  
+  %3 = extractvalue %rtype %out, 0
+  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %3)
+  %tobool = icmp eq i32 %call14, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %randomInit.exit41
+  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0))
+  br label %if.end
+
+if.else:                                          ; preds = %randomInit.exit41
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0))
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0))
+  tail call void @free(i8* %call) #5
+  tail call void @free(i8* %call7) #5
+  tail call void @free(i8* %call12) #5
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @srand(i32) #1
+
+; Function Attrs: nounwind
+declare void @free(i8* nocapture) #1
+
+declare float @fabsf(float)
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture) #5
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { nounwind }
+attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"float", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
index dfdbb8310a..fe287e55ac 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
+++ b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
@@ -163,10 +163,10 @@ declare i8* @llvm.visc.getParentNode(i8*) #0
 declare i32 @llvm.visc.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
+declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0
+declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
 
 ; Function Attrs: nounwind
 declare void @llvm.visc.bind.input(i8*, i32, i32)
@@ -190,11 +190,11 @@ entry:
   ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
   ; Replaced statement -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #3
   %this_node = call i8* @llvm.visc.getNode()
-  %call1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
+  %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
   
   ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
   ; Replaced statement -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #3
-  %call2 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node)
+  %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
   ; ---------------------- VISC changes End ------------------
   %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i32 %call1, i32 %call2) #3
   %cmp32 = icmp sgt i32 %k, 0
@@ -208,6 +208,7 @@ for.body.lr.ph:                                   ; preds = %entry
 for.body:                                         ; preds = %for.body, %for.body.lr.ph
   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
   %res.034 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add9, %for.body ]
+  ;%calln = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i64 %indvars.iv, i32 %call2) #6
   %1 = add nsw i64 %indvars.iv, %0
   %arrayidx = getelementptr inbounds float* %A, i64 %1
   %2 = load float* %arrayidx, align 4, !tbaa !0
-- 
GitLab