diff --git a/llvm/test/VISC/MatrixMultiplication/Makefile b/llvm/test/VISC/MatrixMultiplication/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..6e4d4a0521cf3336df24931c14d518647ac8a7fc --- /dev/null +++ b/llvm/test/VISC/MatrixMultiplication/Makefile @@ -0,0 +1,30 @@ +PASSES := + +.PHONY: clean + +LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install +LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc +HOST:=gemm_opencl +KERNELS:=matrixMul +LLVM_CC:=$(LLVM_INSTALL)/bin/clang +LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link + +all: $(KERNELS:%=%.nvptx.s) $(HOST:%=%.ll) $(HOST:%=%.bin) + +$(KERNELS:%=%.ll):%.ll:%.cl + $(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ + +$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll + $(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ + +$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc + $(LLVM_CC) -O3 -target nvptx $< -S -o $@ + +$(HOST:%=%.ll):%.ll:%.c + $(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@ + +$(HOST:%=%.bin):%.bin:%.c + $(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@ + +clean : + rm -f $(HOST).ll $(KERNELS).ll *.bc *.s *.bin diff --git a/llvm/test/VISC/MatrixMultiplication/gemm.c b/llvm/test/VISC/MatrixMultiplication/gemm.c new file mode 100644 index 0000000000000000000000000000000000000000..7356b8293ddba0c4cd8101649dc10fcd41c2a600 --- /dev/null +++ b/llvm/test/VISC/MatrixMultiplication/gemm.c @@ -0,0 +1,168 @@ +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include <string.h> + +#define WA 1024 +#define HA 1024 +#define WB 1024 +#define HB WA +#define WC WB +#define HC HA + + + +// Thread block size +#define BLOCK_SIZE 16 + +// Allocates a matrix with random float entries. +void randomInit(float* data, int size) { + for (int i = 0; i < size; ++i) + data[i] = rand() / (float)RAND_MAX; +} + +////////////////////////////////////////////////////////////////////////////// +//! Loads a Program file. +//! +//! @return the source string if succeeded, 0 otherwise +//! @param cFilename program filename +//! @param szFinalLength returned length of the code string +////////////////////////////////////////////////////////////////////////////// + +// Check bool +int isEqual(float a, float b) { + return (fabs(a-b) < 0.001); +} + +// Check Results + +__attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) { + unsigned int size_A = WA * HA; + unsigned int size_B = WB * HB; + unsigned int size_C = WC * HC; + unsigned int bytesC = sizeof(float) * size_C; + float* goldC = (float*) malloc(bytesC); + for (int i=0; i < HC; i++) { + for (int j=0; j < WC; j++) { + goldC[i*WC + j] = 0; + for (int k=0; k < HB; k++) { + goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j]; + } + if(!isEqual(goldC[i*WC + j], C[i*WC + j])) { + printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]); + return 0; + } + } + } + return 1; // Success +} + + +typedef struct { + float* Out; + int bytes_Out; +} rtype; + +rtype matrixMul(float* A, int bytes_A, float* B, int bytes_B, float* C, int bytes_C, unsigned k, unsigned n, unsigned m, int idx_x, int idx_y) { + + printf("Entered function\n"); + int tx = get_global_id(0); //2D Global Thread ID x + int ty = get_global_id(1); //2D Global Thread ID y + //int tx = get_global_id(0); //2D Global Thread ID x + //int ty = get_global_id(1); //2D Global Thread ID y + + printf("Computing element (%d, %d)\n", tx, ty); + // Initialize accumulator + float res = 0.0f; + + // Perform dot-product of row-column + for (int i = 0; i < k; i++) { + printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx); + res += A[ty*k+i] * B[i*n+tx]; + } + + printf("Result computed\n"); + // Write in device memory + C[ty*n+tx] = res; + + printf("Result written to C\n"); + rtype Output; + Output.Out = C; + Output.bytes_Out = bytes_C; + printf("Output allocated\n"); + return Output; + +} + + +// CPU Computation of MatrixMul +__attribute__ ((noinline)) rtype computeMatrixMul(float* h_A, unsigned bytes_A, float* h_B, unsigned bytes_B, float* h_C, unsigned bytes_C, unsigned k, unsigned n, unsigned m ) { + + rtype Out; + for(unsigned i=0; i<m; i++) { + for(unsigned j=0; j < n; j++) { + Out = matrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, k, n, m, i, j); + } + } + return Out; + +} + +// Main +int main(int argc, char** argv) { + + // seed for rand() + srand(2006); + + // Allocate host memory for matrices A and B + unsigned int size_A = WA * HA; + unsigned int bytes_A = sizeof(float) * size_A; + float* h_A = (float*) malloc(bytes_A); + + unsigned int size_B = WB * HB; + unsigned int bytes_B = sizeof(float) * size_B; + float* h_B = (float*) malloc(bytes_B); + + // Initialize host memory + randomInit(h_A, size_A); + randomInit(h_B, size_B); + +/* + // Print A and B + printf("\n\nMatrix A\n"); + for(int i = 0; i < size_A; i++) + { + printf("%f ", h_A[i]); + if(((i + 1) % WA) == 0) + printf("\n"); + } + + printf("\n\nMatrix B\n"); + for(int i = 0; i < size_B; i++) + { + printf("%f ", h_B[i]); + if(((i + 1) % WB) == 0) + printf("\n"); + } +*/ + + // Allocate host memory for the result matrix C + unsigned int size_C = WC * HC; + unsigned int bytes_C = sizeof(float) * size_C; + float* h_C = (float*) malloc(bytes_C); + + // Compute using OpenCL + rtype Output = computeMatrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, HA); + + if(checkResults(h_A, h_B, Output.Out)) + printf("\nPass!\n"); + else + printf("\nFailed!\n"); + printf("\nDone!\n"); + + // Deallocate memory + free(h_A); + free(h_B); + free(h_C); +} + diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll new file mode 100644 index 0000000000000000000000000000000000000000..2fef62369aa635a08e8ca6208633e6bdd5c6d37e --- /dev/null +++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm.ll @@ -0,0 +1,412 @@ +; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s +; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll +; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin +; RUN: %t.bin +; ModuleID = 'gemm_opencl.c' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 +@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 +@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1 +@str = private unnamed_addr constant [17 x i8] c"Entered function\00" +@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00" +@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00" +@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00" +@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00" +@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00" +@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00" + +; Function Attrs: nounwind uwtable +define void @randomInit(float* nocapture %data, i32 %size) #0 { +entry: + %cmp3 = icmp sgt i32 %size, 0 + br i1 %cmp3, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %call = tail call i32 @rand() #5 + %conv = sitofp i32 %call to float + %div = fmul float %conv, 0x3E00000000000000 + %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv + store float %div, float* %arrayidx, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; Function Attrs: nounwind +declare i32 @rand() #1 + +; Function Attrs: nounwind readnone uwtable +define i32 @isEqual(float %a, float %b) #2 { +entry: + %sub = fsub float %a, %b + %fabsf = tail call float @fabsf(float %sub) #6 + %0 = fpext float %fabsf to double + %cmp = fcmp olt double %0, 1.000000e-03 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: noinline nounwind uwtable +define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 { +entry: + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %entry, %for.inc50 + %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ] + %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ] + %0 = shl nsw i64 %indvars.iv92, 10 + br label %for.body7 + +for.cond4: ; preds = %for.end + %inc48 = add nsw i32 %j.079, 1 + %1 = trunc i64 %indvars.iv.next89 to i32 + %cmp5 = icmp slt i32 %1, 1024 + br i1 %cmp5, label %for.body7, label %for.inc50 + +for.body7: ; preds = %for.cond4.preheader, %for.cond4 + %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ] + %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ] + %2 = add nsw i64 %indvars.iv88, %0 + br label %for.body12 + +for.body12: ; preds = %for.body12, %for.body7 + %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ] + %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ] + %4 = add nsw i64 %indvars.iv, %0 + %arrayidx16 = getelementptr inbounds float* %A, i64 %4 + %5 = load float* %arrayidx16, align 4, !tbaa !0 + %6 = shl i64 %indvars.iv, 10 + %7 = add nsw i64 %6, %indvars.iv88 + %arrayidx20 = getelementptr inbounds float* %B, i64 %7 + %8 = load float* %arrayidx20, align 4, !tbaa !0 + %mul21 = fmul float %5, %8 + %add26 = fadd float %3, %mul21 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %for.end, label %for.body12 + +for.end: ; preds = %for.body12 + %arrayidx34 = getelementptr inbounds float* %C, i64 %2 + %9 = load float* %arrayidx34, align 4, !tbaa !0 + %sub.i = fsub float %add26, %9 + %fabsf.i = tail call float @fabsf(float %sub.i) #6 + %10 = fpext float %fabsf.i to double + %cmp.i = fcmp olt double %10, 1.000000e-03 + %indvars.iv.next89 = add i64 %indvars.iv88, 1 + br i1 %cmp.i, label %for.cond4, label %if.then + +if.then: ; preds = %for.end + %conv40 = fpext float %9 to double + %conv45 = fpext float %add26 to double + %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5 + br label %return + +for.inc50: ; preds = %for.cond4 + %indvars.iv.next93 = add i64 %indvars.iv92, 1 + %inc51 = add nsw i32 %i.081, 1 + %11 = trunc i64 %indvars.iv.next93 to i32 + %cmp = icmp slt i32 %11, 1024 + br i1 %cmp, label %for.cond4.preheader, label %return + +return: ; preds = %for.inc50, %if.then + %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] + ret i32 %retval.0 +} + +; Function Attrs: nounwind +declare noalias i8* @malloc(i64) #1 + +; Function Attrs: nounwind +declare i32 @printf(i8* nocapture, ...) #1 + +; --------------- VISC Intrinsics --------------- +; Return Type of VISC Compute Matrix Mul +%rtype = type {float*, i32} +%struct.arg = type { float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype } + +; Function Attrs: nounwind +declare i8* @llvm.visc.launch(i8*, i8*) #0 + +; Function Attrs: nounwind +declare void @llvm.visc.wait(i8*) #0 + +; Function Attrs: nounwind +declare i8* @llvm.visc.createNode(i8*) #0 + +; Function Attrs: nounwind +declare i8* @llvm.visc.createNode1D(i8*, i32) #0 + +; Function Attrs: nounwind +declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 + +; Function Attrs: nounwind +declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 + +; Function Attrs: nounwind +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 + +; Function Attrs: nounwind +declare i8* @llvm.visc.getNode() #0 + +; Function Attrs: nounwind +declare i8* @llvm.visc.getParentNode(i8*) #0 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNumDims(i8*) #0 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 + +; Function Attrs: nounwind +declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 + +; Function Attrs: nounwind +declare void @llvm.visc.bind.input(i8*, i32, i32) + +; Function Attrs: nounwind +declare void @llvm.visc.bind.output(i8*, i32, i32) +; ----------------- VISC intrinsics end ------------------ + +; Function Attrs: nounwind uwtable +define %rtype @matrixMul(float* nocapture %A, i32 %bytes_A, float* nocapture %B, i32 %bytes_B, float* %C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { +entry: + %puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0)) + + ; ------------------------- VISC changes ------------------ + ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x + ; Replaced statement -- + ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5 + ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5 + %this_node = call i8* @llvm.visc.getNode() + %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) + %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) + ; ---------------------- VISC changes End ------------------ + + %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5 + %cmp44 = icmp eq i32 %k, 0 + br i1 %cmp44, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %mul = mul i32 %call2, %k + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ] + %0 = trunc i64 %indvars.iv to i32 + %add = add i32 %0, %mul + %mul4 = mul i32 %0, %n + %add5 = add i32 %mul4, %call1 + %call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5 + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr inbounds float* %A, i64 %idxprom + %1 = load float* %arrayidx, align 4, !tbaa !0 + %idxprom11 = zext i32 %add5 to i64 + %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11 + %2 = load float* %arrayidx12, align 4, !tbaa !0 + %mul13 = fmul float %1, %2 + %add14 = fadd float %res.046, %mul13 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %k + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ] + %puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0)) + %mul16 = mul i32 %call2, %n + %add17 = add i32 %mul16, %call1 + %idxprom18 = zext i32 %add17 to i64 + %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18 + store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0 + %puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0)) + %puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0)) + %.fca.0.insert = insertvalue %rtype undef, float* %C, 0 + %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %bytes_C, 1 + ret %rtype %.fca.1.insert +} + +; ----------------- VISC SGEMM root node ---------------- +define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { + %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA) + ; Bind Inputs + call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A + call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A + call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B + call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B + call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C + call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C + call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k + call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n + call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m + ; Bind Outputs + call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); d_C + call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1); bytes_C + ret %rtype zeroinitializer +} + +; Function Attrs: noinline nounwind uwtable +;define %rtype @computeMatrixMul(float* nocapture %h_A, i32 %bytes_A, float* nocapture %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #3 { +;entry: +; %cmp18 = icmp eq i32 %m, 0 +; %cmp215 = icmp eq i32 %n, 0 +; %or.cond = or i1 %cmp18, %cmp215 +; br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us +; +;for.inc4.us: ; preds = %for.body3.us +; %0 = extractvalue %rtype %call.us, 0 +; %1 = extractvalue %rtype %call.us, 1 +; %inc5.us = add i32 %i.019.us, 1 +; %exitcond24 = icmp eq i32 %inc5.us, %m +; br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us +; +;for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us +; %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ] +; %call.us = tail call %rtype @matrixMul(float* %h_A, i32 undef, float* %h_B, i32 undef, float* %h_C, i32 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef) +; %inc.us = add i32 %j.016.us, 1 +; %exitcond = icmp eq i32 %inc.us, %n +; br i1 %exitcond, label %for.inc4.us, label %for.body3.us +; +;for.body3.lr.ph.us: ; preds = %entry, %for.inc4.us +; %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ] +; br label %for.body3.us +; +;for.end6: ; preds = %for.inc4.us, %entry +; %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ] +; %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ] +; %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0 +; %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1 +; ret %rtype %.fca.1.insert +;} + +; Function Attrs: nounwind uwtable +define i32 @main(i32 %argc, i8** nocapture %argv) #0 { +entry: + tail call void @srand(i32 2006) #5 + %call = tail call noalias i8* @malloc(i64 4194304) #5 + %0 = bitcast i8* %call to float* + %call7 = tail call noalias i8* @malloc(i64 4194304) #5 + br label %for.body.i + +for.body.i: ; preds = %for.body.i, %entry + %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ] + %call.i = tail call i32 @rand() #5 + %conv.i = sitofp i32 %call.i to float + %div.i = fmul float %conv.i, 0x3E00000000000000 + %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i + store float %div.i, float* %arrayidx.i, align 4, !tbaa !0 + %indvars.iv.next.i = add i64 %indvars.iv.i, 1 + %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32 + %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576 + br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i + +for.body.i40.preheader: ; preds = %for.body.i + %1 = bitcast i8* %call7 to float* + br label %for.body.i40 + +for.body.i40: ; preds = %for.body.i40.preheader, %for.body.i40 + %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ] + %call.i33 = tail call i32 @rand() #5 + %conv.i34 = sitofp i32 %call.i33 to float + %div.i35 = fmul float %conv.i34, 0x3E00000000000000 + %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32 + store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0 + %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1048576 + br i1 %exitcond, label %randomInit.exit41, label %for.body.i40 + +randomInit.exit41: ; preds = %for.body.i40 + %call12 = tail call noalias i8* @malloc(i64 4194304) #5 + %2 = bitcast i8* %call12 to float* + + ; ---------------------------------- Adding VISC Launch Call -------------------------------- + ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024) + ; Setting up launch input args + %in.addr = alloca %struct.arg + + ; Store arguments + %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 + %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 + %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2 + %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 + %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4 + %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5 + %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6 + %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7 + %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8 + + store float* %0, float** %in.addr.h_A + store i32 4194304, i32* %in.addr.bytes_A + store float* %1, float** %in.addr.h_B + store i32 4194304, i32* %in.addr.bytes_B + store float* %2, float** %in.addr.h_C + store i32 4194304, i32* %in.addr.bytes_C + store i32 1024, i32* %in.addr.WA + store i32 1024, i32* %in.addr.WB + store i32 1024, i32* %in.addr.HA + + ; Change type to i8* and VISC Launch call + %args = bitcast %struct.arg* %in.addr to i8* + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) + ;tail call void @computeMatrixMul(float* %0, i32 4194304, float* %1, i32 4194304, float* %2, i32 4194304) + + ; Wait for result + call void @llvm.visc.wait(i8* %graphID) + + ; Get the result + %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9 + %out = load %rtype* %out.addr + ; -------------------------------- Completed VISC Launch Call -------------------------------- + + %3 = extractvalue %rtype %out, 0 + %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %3) + %tobool = icmp eq i32 %call14, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %randomInit.exit41 + %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0)) + br label %if.end + +if.else: ; preds = %randomInit.exit41 + %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0)) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0)) + tail call void @free(i8* %call) #5 + tail call void @free(i8* %call7) #5 + tail call void @free(i8* %call12) #5 + ret i32 0 +} + +; Function Attrs: nounwind +declare void @srand(i32) #1 + +; Function Attrs: nounwind +declare void @free(i8* nocapture) #1 + +declare float @fabsf(float) + +; Function Attrs: nounwind +declare i32 @puts(i8* nocapture) #5 + +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } +attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"float", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll index dfdbb8310a25fdb318ab98e9a0949b96c10ecb22..fe287e55ac68f3677e6a2eb528657f2b4e791672 100644 --- a/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll +++ b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll @@ -163,10 +163,10 @@ declare i8* @llvm.visc.getParentNode(i8*) #0 declare i32 @llvm.visc.getNumDims(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 +declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0 +declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 ; Function Attrs: nounwind declare void @llvm.visc.bind.input(i8*, i32, i32) @@ -190,11 +190,11 @@ entry: ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x ; Replaced statement -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #3 %this_node = call i8* @llvm.visc.getNode() - %call1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) + %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x ; Replaced statement -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #3 - %call2 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node) + %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) ; ---------------------- VISC changes End ------------------ %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i32 %call1, i32 %call2) #3 %cmp32 = icmp sgt i32 %k, 0 @@ -208,6 +208,7 @@ for.body.lr.ph: ; preds = %entry for.body: ; preds = %for.body, %for.body.lr.ph %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] %res.034 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add9, %for.body ] + ;%calln = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i64 %indvars.iv, i32 %call2) #6 %1 = add nsw i64 %indvars.iv, %0 %arrayidx = getelementptr inbounds float* %A, i64 %1 %2 = load float* %arrayidx, align 4, !tbaa !0