Skip to content
Snippets Groups Projects
Commit ead1071c authored by Prakalp Srivastava's avatar Prakalp Srivastava
Browse files

Matrix Multiply example added

parent 7dcb7f01
No related branches found
No related tags found
No related merge requests found
PASSES :=
.PHONY: clean
LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
HOST:=gemm_opencl
KERNELS:=matrixMul
LLVM_CC:=$(LLVM_INSTALL)/bin/clang
LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link
all: $(KERNELS:%=%.nvptx.s) $(HOST:%=%.ll) $(HOST:%=%.bin)
$(KERNELS:%=%.ll):%.ll:%.cl
$(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
$(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
$(LLVM_CC) -O3 -target nvptx $< -S -o $@
$(HOST:%=%.ll):%.ll:%.c
$(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@
$(HOST:%=%.bin):%.bin:%.c
$(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@
clean :
rm -f $(HOST).ll $(KERNELS).ll *.bc *.s *.bin
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#define WA 1024
#define HA 1024
#define WB 1024
#define HB WA
#define WC WB
#define HC HA
// Thread block size
#define BLOCK_SIZE 16
// Allocates a matrix with random float entries.
void randomInit(float* data, int size) {
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
// Check bool
int isEqual(float a, float b) {
return (fabs(a-b) < 0.001);
}
// Check Results
__attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) {
unsigned int size_A = WA * HA;
unsigned int size_B = WB * HB;
unsigned int size_C = WC * HC;
unsigned int bytesC = sizeof(float) * size_C;
float* goldC = (float*) malloc(bytesC);
for (int i=0; i < HC; i++) {
for (int j=0; j < WC; j++) {
goldC[i*WC + j] = 0;
for (int k=0; k < HB; k++) {
goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j];
}
if(!isEqual(goldC[i*WC + j], C[i*WC + j])) {
printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]);
return 0;
}
}
}
return 1; // Success
}
typedef struct {
float* Out;
int bytes_Out;
} rtype;
rtype matrixMul(float* A, int bytes_A, float* B, int bytes_B, float* C, int bytes_C, unsigned k, unsigned n, unsigned m, int idx_x, int idx_y) {
printf("Entered function\n");
int tx = get_global_id(0); //2D Global Thread ID x
int ty = get_global_id(1); //2D Global Thread ID y
//int tx = get_global_id(0); //2D Global Thread ID x
//int ty = get_global_id(1); //2D Global Thread ID y
printf("Computing element (%d, %d)\n", tx, ty);
// Initialize accumulator
float res = 0.0f;
// Perform dot-product of row-column
for (int i = 0; i < k; i++) {
printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx);
res += A[ty*k+i] * B[i*n+tx];
}
printf("Result computed\n");
// Write in device memory
C[ty*n+tx] = res;
printf("Result written to C\n");
rtype Output;
Output.Out = C;
Output.bytes_Out = bytes_C;
printf("Output allocated\n");
return Output;
}
// CPU Computation of MatrixMul
__attribute__ ((noinline)) rtype computeMatrixMul(float* h_A, unsigned bytes_A, float* h_B, unsigned bytes_B, float* h_C, unsigned bytes_C, unsigned k, unsigned n, unsigned m ) {
rtype Out;
for(unsigned i=0; i<m; i++) {
for(unsigned j=0; j < n; j++) {
Out = matrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, k, n, m, i, j);
}
}
return Out;
}
// Main
int main(int argc, char** argv) {
// seed for rand()
srand(2006);
// Allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int bytes_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(bytes_A);
unsigned int size_B = WB * HB;
unsigned int bytes_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(bytes_B);
// Initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
/*
// Print A and B
printf("\n\nMatrix A\n");
for(int i = 0; i < size_A; i++)
{
printf("%f ", h_A[i]);
if(((i + 1) % WA) == 0)
printf("\n");
}
printf("\n\nMatrix B\n");
for(int i = 0; i < size_B; i++)
{
printf("%f ", h_B[i]);
if(((i + 1) % WB) == 0)
printf("\n");
}
*/
// Allocate host memory for the result matrix C
unsigned int size_C = WC * HC;
unsigned int bytes_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(bytes_C);
// Compute using OpenCL
rtype Output = computeMatrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, HA);
if(checkResults(h_A, h_B, Output.Out))
printf("\nPass!\n");
else
printf("\nFailed!\n");
printf("\nDone!\n");
// Deallocate memory
free(h_A);
free(h_B);
free(h_C);
}
; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
; RUN: %t.bin
; ModuleID = 'gemm_opencl.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1
@str = private unnamed_addr constant [17 x i8] c"Entered function\00"
@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00"
@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00"
@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00"
@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
; Function Attrs: nounwind uwtable
define void @randomInit(float* nocapture %data, i32 %size) #0 {
entry:
%cmp3 = icmp sgt i32 %size, 0
br i1 %cmp3, label %for.body, label %for.end
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%call = tail call i32 @rand() #5
%conv = sitofp i32 %call to float
%div = fmul float %conv, 0x3E00000000000000
%arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
store float %div, float* %arrayidx, align 4, !tbaa !0
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %size
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret void
}
; Function Attrs: nounwind
declare i32 @rand() #1
; Function Attrs: nounwind readnone uwtable
define i32 @isEqual(float %a, float %b) #2 {
entry:
%sub = fsub float %a, %b
%fabsf = tail call float @fabsf(float %sub) #6
%0 = fpext float %fabsf to double
%cmp = fcmp olt double %0, 1.000000e-03
%conv1 = zext i1 %cmp to i32
ret i32 %conv1
}
; Function Attrs: noinline nounwind uwtable
define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 {
entry:
br label %for.cond4.preheader
for.cond4.preheader: ; preds = %entry, %for.inc50
%indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
%i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
%0 = shl nsw i64 %indvars.iv92, 10
br label %for.body7
for.cond4: ; preds = %for.end
%inc48 = add nsw i32 %j.079, 1
%1 = trunc i64 %indvars.iv.next89 to i32
%cmp5 = icmp slt i32 %1, 1024
br i1 %cmp5, label %for.body7, label %for.inc50
for.body7: ; preds = %for.cond4.preheader, %for.cond4
%indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
%j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
%2 = add nsw i64 %indvars.iv88, %0
br label %for.body12
for.body12: ; preds = %for.body12, %for.body7
%indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
%3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
%4 = add nsw i64 %indvars.iv, %0
%arrayidx16 = getelementptr inbounds float* %A, i64 %4
%5 = load float* %arrayidx16, align 4, !tbaa !0
%6 = shl i64 %indvars.iv, 10
%7 = add nsw i64 %6, %indvars.iv88
%arrayidx20 = getelementptr inbounds float* %B, i64 %7
%8 = load float* %arrayidx20, align 4, !tbaa !0
%mul21 = fmul float %5, %8
%add26 = fadd float %3, %mul21
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 1024
br i1 %exitcond, label %for.end, label %for.body12
for.end: ; preds = %for.body12
%arrayidx34 = getelementptr inbounds float* %C, i64 %2
%9 = load float* %arrayidx34, align 4, !tbaa !0
%sub.i = fsub float %add26, %9
%fabsf.i = tail call float @fabsf(float %sub.i) #6
%10 = fpext float %fabsf.i to double
%cmp.i = fcmp olt double %10, 1.000000e-03
%indvars.iv.next89 = add i64 %indvars.iv88, 1
br i1 %cmp.i, label %for.cond4, label %if.then
if.then: ; preds = %for.end
%conv40 = fpext float %9 to double
%conv45 = fpext float %add26 to double
%call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5
br label %return
for.inc50: ; preds = %for.cond4
%indvars.iv.next93 = add i64 %indvars.iv92, 1
%inc51 = add nsw i32 %i.081, 1
%11 = trunc i64 %indvars.iv.next93 to i32
%cmp = icmp slt i32 %11, 1024
br i1 %cmp, label %for.cond4.preheader, label %return
return: ; preds = %for.inc50, %if.then
%retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ]
ret i32 %retval.0
}
; Function Attrs: nounwind
declare noalias i8* @malloc(i64) #1
; Function Attrs: nounwind
declare i32 @printf(i8* nocapture, ...) #1
; --------------- VISC Intrinsics ---------------
; Return Type of VISC Compute Matrix Mul
%rtype = type {float*, i32}
%struct.arg = type { float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype }
; Function Attrs: nounwind
declare i8* @llvm.visc.launch(i8*, i8*) #0
; Function Attrs: nounwind
declare void @llvm.visc.wait(i8*) #0
; Function Attrs: nounwind
declare i8* @llvm.visc.createNode(i8*) #0
; Function Attrs: nounwind
declare i8* @llvm.visc.createNode1D(i8*, i32) #0
; Function Attrs: nounwind
declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
; Function Attrs: nounwind
declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
; Function Attrs: nounwind
declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
; Function Attrs: nounwind
declare i8* @llvm.visc.getNode() #0
; Function Attrs: nounwind
declare i8* @llvm.visc.getParentNode(i8*) #0
; Function Attrs: nounwind
declare i32 @llvm.visc.getNumDims(i8*) #0
; Function Attrs: nounwind
declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
; Function Attrs: nounwind
declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
; Function Attrs: nounwind
declare void @llvm.visc.bind.input(i8*, i32, i32)
; Function Attrs: nounwind
declare void @llvm.visc.bind.output(i8*, i32, i32)
; ----------------- VISC intrinsics end ------------------
; Function Attrs: nounwind uwtable
define %rtype @matrixMul(float* nocapture %A, i32 %bytes_A, float* nocapture %B, i32 %bytes_B, float* %C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
entry:
%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
; ------------------------- VISC changes ------------------
; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
; Replaced statement --
; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
%this_node = call i8* @llvm.visc.getNode()
%call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
%call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
; ---------------------- VISC changes End ------------------
%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5
%cmp44 = icmp eq i32 %k, 0
br i1 %cmp44, label %for.end, label %for.body.lr.ph
for.body.lr.ph: ; preds = %entry
%mul = mul i32 %call2, %k
br label %for.body
for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ]
%0 = trunc i64 %indvars.iv to i32
%add = add i32 %0, %mul
%mul4 = mul i32 %0, %n
%add5 = add i32 %mul4, %call1
%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5
%idxprom = zext i32 %add to i64
%arrayidx = getelementptr inbounds float* %A, i64 %idxprom
%1 = load float* %arrayidx, align 4, !tbaa !0
%idxprom11 = zext i32 %add5 to i64
%arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11
%2 = load float* %arrayidx12, align 4, !tbaa !0
%mul13 = fmul float %1, %2
%add14 = fadd float %res.046, %mul13
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %k
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
%res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ]
%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0))
%mul16 = mul i32 %call2, %n
%add17 = add i32 %mul16, %call1
%idxprom18 = zext i32 %add17 to i64
%arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18
store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0
%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
%.fca.0.insert = insertvalue %rtype undef, float* %C, 0
%.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %bytes_C, 1
ret %rtype %.fca.1.insert
}
; ----------------- VISC SGEMM root node ----------------
define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
%kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA)
; Bind Inputs
call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
; Bind Outputs
call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); d_C
call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1); bytes_C
ret %rtype zeroinitializer
}
; Function Attrs: noinline nounwind uwtable
;define %rtype @computeMatrixMul(float* nocapture %h_A, i32 %bytes_A, float* nocapture %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
;entry:
; %cmp18 = icmp eq i32 %m, 0
; %cmp215 = icmp eq i32 %n, 0
; %or.cond = or i1 %cmp18, %cmp215
; br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us
;
;for.inc4.us: ; preds = %for.body3.us
; %0 = extractvalue %rtype %call.us, 0
; %1 = extractvalue %rtype %call.us, 1
; %inc5.us = add i32 %i.019.us, 1
; %exitcond24 = icmp eq i32 %inc5.us, %m
; br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us
;
;for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us
; %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
; %call.us = tail call %rtype @matrixMul(float* %h_A, i32 undef, float* %h_B, i32 undef, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
; %inc.us = add i32 %j.016.us, 1
; %exitcond = icmp eq i32 %inc.us, %n
; br i1 %exitcond, label %for.inc4.us, label %for.body3.us
;
;for.body3.lr.ph.us: ; preds = %entry, %for.inc4.us
; %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ]
; br label %for.body3.us
;
;for.end6: ; preds = %for.inc4.us, %entry
; %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ]
; %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ]
; %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0
; %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1
; ret %rtype %.fca.1.insert
;}
; Function Attrs: nounwind uwtable
define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
entry:
tail call void @srand(i32 2006) #5
%call = tail call noalias i8* @malloc(i64 4194304) #5
%0 = bitcast i8* %call to float*
%call7 = tail call noalias i8* @malloc(i64 4194304) #5
br label %for.body.i
for.body.i: ; preds = %for.body.i, %entry
%indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
%call.i = tail call i32 @rand() #5
%conv.i = sitofp i32 %call.i to float
%div.i = fmul float %conv.i, 0x3E00000000000000
%arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
%indvars.iv.next.i = add i64 %indvars.iv.i, 1
%lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32
%exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576
br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i
for.body.i40.preheader: ; preds = %for.body.i
%1 = bitcast i8* %call7 to float*
br label %for.body.i40
for.body.i40: ; preds = %for.body.i40.preheader, %for.body.i40
%indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
%call.i33 = tail call i32 @rand() #5
%conv.i34 = sitofp i32 %call.i33 to float
%div.i35 = fmul float %conv.i34, 0x3E00000000000000
%arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
%indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
%lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32
%exitcond = icmp eq i32 %lftr.wideiv, 1048576
br i1 %exitcond, label %randomInit.exit41, label %for.body.i40
randomInit.exit41: ; preds = %for.body.i40
%call12 = tail call noalias i8* @malloc(i64 4194304) #5
%2 = bitcast i8* %call12 to float*
; ---------------------------------- Adding VISC Launch Call --------------------------------
; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024)
; Setting up launch input args
%in.addr = alloca %struct.arg
; Store arguments
%in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
%in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
%in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
%in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
%in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
%in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
%in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
%in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
%in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
store float* %0, float** %in.addr.h_A
store i32 4194304, i32* %in.addr.bytes_A
store float* %1, float** %in.addr.h_B
store i32 4194304, i32* %in.addr.bytes_B
store float* %2, float** %in.addr.h_C
store i32 4194304, i32* %in.addr.bytes_C
store i32 1024, i32* %in.addr.WA
store i32 1024, i32* %in.addr.WB
store i32 1024, i32* %in.addr.HA
; Change type to i8* and VISC Launch call
%args = bitcast %struct.arg* %in.addr to i8*
%graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
;tail call void @computeMatrixMul(float* %0, i32 4194304, float* %1, i32 4194304, float* %2, i32 4194304)
; Wait for result
call void @llvm.visc.wait(i8* %graphID)
; Get the result
%out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
%out = load %rtype* %out.addr
; -------------------------------- Completed VISC Launch Call --------------------------------
%3 = extractvalue %rtype %out, 0
%call14 = tail call i32 @checkResults(float* %0, float* %1, float* %3)
%tobool = icmp eq i32 %call14, 0
br i1 %tobool, label %if.else, label %if.then
if.then: ; preds = %randomInit.exit41
%puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0))
br label %if.end
if.else: ; preds = %randomInit.exit41
%puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0))
br label %if.end
if.end: ; preds = %if.else, %if.then
%puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0))
tail call void @free(i8* %call) #5
tail call void @free(i8* %call7) #5
tail call void @free(i8* %call12) #5
ret i32 0
}
; Function Attrs: nounwind
declare void @srand(i32) #1
; Function Attrs: nounwind
declare void @free(i8* nocapture) #1
declare float @fabsf(float)
; Function Attrs: nounwind
declare i32 @puts(i8* nocapture) #5
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { nounwind }
attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
!0 = metadata !{metadata !"float", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
!2 = metadata !{metadata !"Simple C/C++ TBAA"}
...@@ -163,10 +163,10 @@ declare i8* @llvm.visc.getParentNode(i8*) #0 ...@@ -163,10 +163,10 @@ declare i8* @llvm.visc.getParentNode(i8*) #0
declare i32 @llvm.visc.getNumDims(i8*) #0 declare i32 @llvm.visc.getNumDims(i8*) #0
; Function Attrs: nounwind ; Function Attrs: nounwind
declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
; Function Attrs: nounwind ; Function Attrs: nounwind
declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0 declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
; Function Attrs: nounwind ; Function Attrs: nounwind
declare void @llvm.visc.bind.input(i8*, i32, i32) declare void @llvm.visc.bind.input(i8*, i32, i32)
...@@ -190,11 +190,11 @@ entry: ...@@ -190,11 +190,11 @@ entry:
; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
; Replaced statement -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #3 ; Replaced statement -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #3
%this_node = call i8* @llvm.visc.getNode() %this_node = call i8* @llvm.visc.getNode()
%call1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
; Replaced statement -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #3 ; Replaced statement -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #3
%call2 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node) %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
; ---------------------- VISC changes End ------------------ ; ---------------------- VISC changes End ------------------
%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i32 %call1, i32 %call2) #3 %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i32 %call1, i32 %call2) #3
%cmp32 = icmp sgt i32 %k, 0 %cmp32 = icmp sgt i32 %k, 0
...@@ -208,6 +208,7 @@ for.body.lr.ph: ; preds = %entry ...@@ -208,6 +208,7 @@ for.body.lr.ph: ; preds = %entry
for.body: ; preds = %for.body, %for.body.lr.ph for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%res.034 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add9, %for.body ] %res.034 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add9, %for.body ]
;%calln = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i64 %indvars.iv, i32 %call2) #6
%1 = add nsw i64 %indvars.iv, %0 %1 = add nsw i64 %indvars.iv, %0
%arrayidx = getelementptr inbounds float* %A, i64 %1 %arrayidx = getelementptr inbounds float* %A, i64 %1
%2 = load float* %arrayidx, align 4, !tbaa !0 %2 = load float* %arrayidx, align 4, !tbaa !0
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment