diff --git a/hpvm/test/MatrixMultiplication/Makefile b/hpvm/test/MatrixMultiplication/Makefile deleted file mode 100644 index 94f22ed717fdf89427cc36ae80c053ba0a19c790..0000000000000000000000000000000000000000 --- a/hpvm/test/MatrixMultiplication/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -PASSES := - -.PHONY: clean - -LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install -LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc -HOST:=gemm_opencl -KERNELS:=matrixMul -LLVM_CC:=$(LLVM_INSTALL)/bin/clang -LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link - -all: $(KERNELS:%=%.nvptx.s) $(HOST:%=%.ll) $(HOST:%=%.bin) - -auto_gemm.ll: auto_gemm.c - ~/current-bin/clang -S -emit-llvm auto_gemm.c -O3 -o auto_gemm.ll -gen: auto_gemm.ll - ~/current-src/Release+Asserts/bin/opt -load ~/current-src/Release+Asserts/lib/LLVMGenVISC.so -genvisc auto_gemm.ll -S -o auto_gemm_visc.ll -$(KERNELS:%=%.ll):%.ll:%.cl - $(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ - -$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll - $(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ - -$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc - $(LLVM_CC) -O3 -target nvptx $< -S -o $@ - -$(HOST:%=%.ll):%.ll:%.c - $(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@ - -$(HOST:%=%.bin):%.bin:%.c - $(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@ - -clean : - rm -f $(HOST).ll $(KERNELS).ll *.bc *.s nvptx.s* *.bin *.kernels.ll DataflowGraph.dot* diff --git a/hpvm/test/MatrixMultiplication/gemm.c b/hpvm/test/MatrixMultiplication/gemm.c deleted file mode 100644 index 7356b8293ddba0c4cd8101649dc10fcd41c2a600..0000000000000000000000000000000000000000 --- a/hpvm/test/MatrixMultiplication/gemm.c +++ /dev/null @@ -1,168 +0,0 @@ -#include <stdlib.h> -#include <stdio.h> -#include <math.h> -#include <string.h> - -#define WA 1024 -#define HA 1024 -#define WB 1024 -#define HB WA -#define WC WB -#define HC HA - - - -// Thread block size -#define BLOCK_SIZE 16 - -// Allocates a matrix with random float entries. -void randomInit(float* data, int size) { - for (int i = 0; i < size; ++i) - data[i] = rand() / (float)RAND_MAX; -} - -////////////////////////////////////////////////////////////////////////////// -//! Loads a Program file. -//! -//! @return the source string if succeeded, 0 otherwise -//! @param cFilename program filename -//! @param szFinalLength returned length of the code string -////////////////////////////////////////////////////////////////////////////// - -// Check bool -int isEqual(float a, float b) { - return (fabs(a-b) < 0.001); -} - -// Check Results - -__attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) { - unsigned int size_A = WA * HA; - unsigned int size_B = WB * HB; - unsigned int size_C = WC * HC; - unsigned int bytesC = sizeof(float) * size_C; - float* goldC = (float*) malloc(bytesC); - for (int i=0; i < HC; i++) { - for (int j=0; j < WC; j++) { - goldC[i*WC + j] = 0; - for (int k=0; k < HB; k++) { - goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j]; - } - if(!isEqual(goldC[i*WC + j], C[i*WC + j])) { - printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]); - return 0; - } - } - } - return 1; // Success -} - - -typedef struct { - float* Out; - int bytes_Out; -} rtype; - -rtype matrixMul(float* A, int bytes_A, float* B, int bytes_B, float* C, int bytes_C, unsigned k, unsigned n, unsigned m, int idx_x, int idx_y) { - - printf("Entered function\n"); - int tx = get_global_id(0); //2D Global Thread ID x - int ty = get_global_id(1); //2D Global Thread ID y - //int tx = get_global_id(0); //2D Global Thread ID x - //int ty = get_global_id(1); //2D Global Thread ID y - - printf("Computing element (%d, %d)\n", tx, ty); - // Initialize accumulator - float res = 0.0f; - - // Perform dot-product of row-column - for (int i = 0; i < k; i++) { - printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx); - res += A[ty*k+i] * B[i*n+tx]; - } - - printf("Result computed\n"); - // Write in device memory - C[ty*n+tx] = res; - - printf("Result written to C\n"); - rtype Output; - Output.Out = C; - Output.bytes_Out = bytes_C; - printf("Output allocated\n"); - return Output; - -} - - -// CPU Computation of MatrixMul -__attribute__ ((noinline)) rtype computeMatrixMul(float* h_A, unsigned bytes_A, float* h_B, unsigned bytes_B, float* h_C, unsigned bytes_C, unsigned k, unsigned n, unsigned m ) { - - rtype Out; - for(unsigned i=0; i<m; i++) { - for(unsigned j=0; j < n; j++) { - Out = matrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, k, n, m, i, j); - } - } - return Out; - -} - -// Main -int main(int argc, char** argv) { - - // seed for rand() - srand(2006); - - // Allocate host memory for matrices A and B - unsigned int size_A = WA * HA; - unsigned int bytes_A = sizeof(float) * size_A; - float* h_A = (float*) malloc(bytes_A); - - unsigned int size_B = WB * HB; - unsigned int bytes_B = sizeof(float) * size_B; - float* h_B = (float*) malloc(bytes_B); - - // Initialize host memory - randomInit(h_A, size_A); - randomInit(h_B, size_B); - -/* - // Print A and B - printf("\n\nMatrix A\n"); - for(int i = 0; i < size_A; i++) - { - printf("%f ", h_A[i]); - if(((i + 1) % WA) == 0) - printf("\n"); - } - - printf("\n\nMatrix B\n"); - for(int i = 0; i < size_B; i++) - { - printf("%f ", h_B[i]); - if(((i + 1) % WB) == 0) - printf("\n"); - } -*/ - - // Allocate host memory for the result matrix C - unsigned int size_C = WC * HC; - unsigned int bytes_C = sizeof(float) * size_C; - float* h_C = (float*) malloc(bytes_C); - - // Compute using OpenCL - rtype Output = computeMatrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, HA); - - if(checkResults(h_A, h_B, Output.Out)) - printf("\nPass!\n"); - else - printf("\nFailed!\n"); - printf("\nDone!\n"); - - // Deallocate memory - free(h_A); - free(h_B); - free(h_C); -} - diff --git a/hpvm/test/MatrixMultiplication/visc_gemm.ll b/hpvm/test/MatrixMultiplication/visc_gemm.ll deleted file mode 100644 index 033b481af786dd936b0fbe383f3723d5faf237c6..0000000000000000000000000000000000000000 --- a/hpvm/test/MatrixMultiplication/visc_gemm.ll +++ /dev/null @@ -1,416 +0,0 @@ -; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S %s -; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll -; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin -; RUN: %t.bin -; ModuleID = 'gemm_opencl.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1 -@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1 -@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 -@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 -@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1 -@str = private unnamed_addr constant [17 x i8] c"Entered function\00" -@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00" -@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00" -@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00" -@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00" -@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00" -@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00" - -; Function Attrs: nounwind uwtable -define void @randomInit(float* nocapture %data, i32 %size) #0 { -entry: - %cmp3 = icmp sgt i32 %size, 0 - br i1 %cmp3, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - %call = tail call i32 @rand() #5 - %conv = sitofp i32 %call to float - %div = fmul float %conv, 0x3E00000000000000 - %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv - store float %div, float* %arrayidx, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %size - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; Function Attrs: nounwind -declare i32 @rand() #1 - -; Function Attrs: nounwind readnone uwtable -define i32 @isEqual(float %a, float %b) #2 { -entry: - %sub = fsub float %a, %b - %fabsf = tail call float @fabsf(float %sub) #6 - %0 = fpext float %fabsf to double - %cmp = fcmp olt double %0, 1.000000e-03 - %conv1 = zext i1 %cmp to i32 - ret i32 %conv1 -} - -; Function Attrs: noinline nounwind uwtable -define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 { -entry: - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry, %for.inc50 - %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ] - %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ] - %0 = shl nsw i64 %indvars.iv92, 10 - br label %for.body7 - -for.cond4: ; preds = %for.end - %inc48 = add nsw i32 %j.079, 1 - %1 = trunc i64 %indvars.iv.next89 to i32 - %cmp5 = icmp slt i32 %1, 1024 - br i1 %cmp5, label %for.body7, label %for.inc50 - -for.body7: ; preds = %for.cond4.preheader, %for.cond4 - %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ] - %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ] - %2 = add nsw i64 %indvars.iv88, %0 - br label %for.body12 - -for.body12: ; preds = %for.body12, %for.body7 - %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ] - %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ] - %4 = add nsw i64 %indvars.iv, %0 - %arrayidx16 = getelementptr inbounds float* %A, i64 %4 - %5 = load float* %arrayidx16, align 4, !tbaa !0 - %6 = shl i64 %indvars.iv, 10 - %7 = add nsw i64 %6, %indvars.iv88 - %arrayidx20 = getelementptr inbounds float* %B, i64 %7 - %8 = load float* %arrayidx20, align 4, !tbaa !0 - %mul21 = fmul float %5, %8 - %add26 = fadd float %3, %mul21 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1024 - br i1 %exitcond, label %for.end, label %for.body12 - -for.end: ; preds = %for.body12 - %arrayidx34 = getelementptr inbounds float* %C, i64 %2 - %9 = load float* %arrayidx34, align 4, !tbaa !0 - %sub.i = fsub float %add26, %9 - %fabsf.i = tail call float @fabsf(float %sub.i) #6 - %10 = fpext float %fabsf.i to double - %cmp.i = fcmp olt double %10, 1.000000e-03 - %indvars.iv.next89 = add i64 %indvars.iv88, 1 - br i1 %cmp.i, label %for.cond4, label %if.then - -if.then: ; preds = %for.end - %conv40 = fpext float %9 to double - %conv45 = fpext float %add26 to double - %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5 - br label %return - -for.inc50: ; preds = %for.cond4 - %indvars.iv.next93 = add i64 %indvars.iv92, 1 - %inc51 = add nsw i32 %i.081, 1 - %11 = trunc i64 %indvars.iv.next93 to i32 - %cmp = icmp slt i32 %11, 1024 - br i1 %cmp, label %for.cond4.preheader, label %return - -return: ; preds = %for.inc50, %if.then - %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] - ret i32 %retval.0 -} - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind -declare i32 @printf(i8* nocapture, ...) #1 - -; --------------- VISC Intrinsics --------------- -; Return Type of VISC Compute Matrix Mul -%rtype = type {} -%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }> - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) - -; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) -; ----------------- VISC intrinsics end ------------------ - -; Function Attrs: nounwind uwtable -define %rtype @matrixMul(float* nocapture in %A, i64 %bytes_A, float* nocapture in %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { -entry: - ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0)) - - ; ------------------------- VISC changes ------------------ - ; Replace get_global_id calls with calls to getNode followed by getNumNodeInstances.x - ; Replaced statement -- - ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5 - ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5 - %this_node = call i8* @llvm.visc.getNode() - %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) - %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) - ; ---------------------- VISC changes End ------------------ - - ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5 - %cmp44 = icmp eq i32 %k, 0 - br i1 %cmp44, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %mul = mul i32 %call2, %k - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ] - %0 = trunc i64 %indvars.iv to i32 - %add = add i32 %0, %mul - %mul4 = mul i32 %0, %n - %add5 = add i32 %mul4, %call1 - ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5 - %idxprom = zext i32 %add to i64 - %arrayidx = getelementptr inbounds float* %A, i64 %idxprom - %1 = load float* %arrayidx, align 4, !tbaa !0 - %idxprom11 = zext i32 %add5 to i64 - %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11 - %2 = load float* %arrayidx12, align 4, !tbaa !0 - %mul13 = fmul float %1, %2 - %add14 = fadd float %res.046, %mul13 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %k - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ] - ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0)) - %mul16 = mul i32 %call2, %n - %add17 = add i32 %mul16, %call1 - %idxprom18 = zext i32 %add17 to i64 - %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18 - store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0 - ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0)) - ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0)) - ret %rtype undef -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - ret %rtype undef -} - -; Function Attrs: noinline nounwind uwtable -;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 { -;entry: -; %cmp18 = icmp eq i32 %m, 0 -; %cmp215 = icmp eq i32 %n, 0 -; %or.cond = or i1 %cmp18, %cmp215 -; br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us -; -;for.inc4.us: ; preds = %for.body3.us -; %0 = extractvalue %rtype %call.us, 0 -; %1 = extractvalue %rtype %call.us, 1 -; %inc5.us = add i32 %i.019.us, 1 -; %exitcond24 = icmp eq i32 %inc5.us, %m -; br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us -; -;for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us -; %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ] -; %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef) -; %inc.us = add i32 %j.016.us, 1 -; %exitcond = icmp eq i32 %inc.us, %n -; br i1 %exitcond, label %for.inc4.us, label %for.body3.us -; -;for.body3.lr.ph.us: ; preds = %entry, %for.inc4.us -; %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ] -; br label %for.body3.us -; -;for.end6: ; preds = %for.inc4.us, %entry -; %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ] -; %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ] -; %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0 -; %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1 -; ret %rtype %.fca.1.insert -;} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { -entry: - tail call void @srand(i32 2006) #5 - %call = tail call noalias i8* @malloc(i64 4194304) #5 - %0 = bitcast i8* %call to float* - %call7 = tail call noalias i8* @malloc(i64 4194304) #5 - br label %for.body.i - -for.body.i: ; preds = %for.body.i, %entry - %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ] - %call.i = tail call i32 @rand() #5 - %conv.i = sitofp i32 %call.i to float - %div.i = fmul float %conv.i, 0x3E00000000000000 - %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i - store float %div.i, float* %arrayidx.i, align 4, !tbaa !0 - %indvars.iv.next.i = add i64 %indvars.iv.i, 1 - %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32 - %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576 - br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i - -for.body.i40.preheader: ; preds = %for.body.i - %1 = bitcast i8* %call7 to float* - br label %for.body.i40 - -for.body.i40: ; preds = %for.body.i40.preheader, %for.body.i40 - %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ] - %call.i33 = tail call i32 @rand() #5 - %conv.i34 = sitofp i32 %call.i33 to float - %div.i35 = fmul float %conv.i34, 0x3E00000000000000 - %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32 - store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0 - %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1048576 - br i1 %exitcond, label %randomInit.exit41, label %for.body.i40 - -randomInit.exit41: ; preds = %for.body.i40 - %call12 = tail call noalias i8* @malloc(i64 4194304) #5 - %2 = bitcast i8* %call12 to float* - - ; ---------------------------------- Adding VISC Launch Call -------------------------------- - ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024) - ; Setting up launch input args - call void @llvm.visc.init() - %in.addr = alloca %struct.arg - - ; Store arguments - %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - - store float* %0, float** %in.addr.h_A - store i64 4194304, i64* %in.addr.bytes_A - store float* %1, float** %in.addr.h_B - store i64 4194304, i64* %in.addr.bytes_B - store float* %2, float** %in.addr.h_C - store i64 4194304, i64* %in.addr.bytes_C - store i32 1024, i32* %in.addr.WA - store i32 1024, i32* %in.addr.WB - store i32 1024, i32* %in.addr.HA - - ; Change type to i8* and VISC Launch call - %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) - - ; Wait for result - call void @llvm.visc.wait(i8* %graphID) - - ; Get the result - %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - %out = load %rtype* %out.addr - call void @llvm.visc.cleanup() - ; -------------------------------- Completed VISC Launch Call -------------------------------- - - %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2) - %tobool = icmp eq i32 %call14, 0 - br i1 %tobool, label %if.else, label %if.then - -if.then: ; preds = %randomInit.exit41 - %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0)) - br label %if.end - -if.else: ; preds = %randomInit.exit41 - %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0)) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0)) - tail call void @free(i8* %call) #5 - tail call void @free(i8* %call7) #5 - tail call void @free(i8* %call12) #5 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @srand(i32) #1 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -declare float @fabsf(float) - -; Function Attrs: nounwind -declare i32 @puts(i8* nocapture) #5 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #5 = { nounwind } -attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!0 = metadata !{metadata !"float", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_2_level.ll b/hpvm/test/MatrixMultiplication/visc_gemm_2_level.ll deleted file mode 100644 index ed3e3bf0985c24ac5785137be91e67ac298093b4..0000000000000000000000000000000000000000 --- a/hpvm/test/MatrixMultiplication/visc_gemm_2_level.ll +++ /dev/null @@ -1,458 +0,0 @@ -; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s -; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc -; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s -; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll -; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin -; RUN: %t.bin -; ModuleID = 'gemm_opencl.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1 -@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1 -@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1 -@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 -@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 -@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1 -@str = private unnamed_addr constant [17 x i8] c"Entered function\00" -@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00" -@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00" -@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00" -@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00" -@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00" -@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00" - -; Function Attrs: nounwind uwtable -define void @randomInit(float* nocapture %data, i32 %size) #0 { -entry: - %cmp3 = icmp sgt i32 %size, 0 - br i1 %cmp3, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - %call = tail call i32 @rand() #5 - %conv = sitofp i32 %call to float - %div = fmul float %conv, 0x3E00000000000000 - %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv - store float %div, float* %arrayidx, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %size - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; Function Attrs: nounwind -declare i32 @rand() #1 - -; Function Attrs: nounwind readnone uwtable -define i32 @isEqual(float %a, float %b) #2 { -entry: - %sub = fsub float %a, %b - %fabsf = tail call float @fabsf(float %sub) #6 - %0 = fpext float %fabsf to double - %cmp = fcmp olt double %0, 1.000000e-03 - %conv1 = zext i1 %cmp to i32 - ret i32 %conv1 -} - -; Function Attrs: noinline nounwind uwtable -define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 { -entry: - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry, %for.inc50 - %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ] - %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ] - %0 = shl nsw i64 %indvars.iv92, 10 - br label %for.body7 - -for.cond4: ; preds = %for.end - %inc48 = add nsw i32 %j.079, 1 - %1 = trunc i64 %indvars.iv.next89 to i32 - %cmp5 = icmp slt i32 %1, 1024 - br i1 %cmp5, label %for.body7, label %for.inc50 - -for.body7: ; preds = %for.cond4.preheader, %for.cond4 - %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ] - %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ] - %2 = add nsw i64 %indvars.iv88, %0 - br label %for.body12 - -for.body12: ; preds = %for.body12, %for.body7 - %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ] - %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ] - %4 = add nsw i64 %indvars.iv, %0 - %arrayidx16 = getelementptr inbounds float* %A, i64 %4 - %5 = load float* %arrayidx16, align 4, !tbaa !0 - %6 = shl i64 %indvars.iv, 10 - %7 = add nsw i64 %6, %indvars.iv88 - %arrayidx20 = getelementptr inbounds float* %B, i64 %7 - %8 = load float* %arrayidx20, align 4, !tbaa !0 - %mul21 = fmul float %5, %8 - %add26 = fadd float %3, %mul21 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1024 - br i1 %exitcond, label %for.end, label %for.body12 - -for.end: ; preds = %for.body12 - %arrayidx34 = getelementptr inbounds float* %C, i64 %2 - %9 = load float* %arrayidx34, align 4, !tbaa !0 - %sub.i = fsub float %add26, %9 - %fabsf.i = tail call float @fabsf(float %sub.i) #6 - %10 = fpext float %fabsf.i to double - %cmp.i = fcmp olt double %10, 1.000000e-03 - %indvars.iv.next89 = add i64 %indvars.iv88, 1 - br i1 %cmp.i, label %for.cond4, label %if.then - -if.then: ; preds = %for.end - %conv40 = fpext float %9 to double - %conv45 = fpext float %add26 to double - %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5 - br label %return - -for.inc50: ; preds = %for.cond4 - %indvars.iv.next93 = add i64 %indvars.iv92, 1 - %inc51 = add nsw i32 %i.081, 1 - %11 = trunc i64 %indvars.iv.next93 to i32 - %cmp = icmp slt i32 %11, 1024 - br i1 %cmp, label %for.cond4.preheader, label %return - -return: ; preds = %for.inc50, %if.then - %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] - ret i32 %retval.0 -} - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind -declare i32 @printf(i8* nocapture, ...) #1 - -; --------------- VISC Intrinsics --------------- -; Return Type of VISC Compute Matrix Mul -%rtype = type {} -%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }> - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) - -; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) -; ----------------- VISC intrinsics end ------------------ - -; Function Attrs: nounwind uwtable -define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { -entry: - ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0)) - - ; ------------------------- VISC changes ------------------ - ; Replace get_global_id calls with calls to getNode followed by getNumNodeInstances.x - ; Replaced statement -- - ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5 - ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5 - %this_node = call i8* @llvm.visc.getNode() - %Lx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) - %Ly = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) - %LLimitx = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) - %LLimity = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node) - - %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node) - %Gx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node) - %Gy = call i32 @llvm.visc.getNodeInstanceID.y(i8* %parent_node) - - %tmpx = mul i32 %Gx, %LLimitx - %tmpy = mul i32 %Gy, %LLimity - - %call1 = add i32 %tmpx, %Lx - %call2 = add i32 %tmpy, %Ly - - ;%printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call1) #5 - ;%printcall2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call2) #5 - - ; ---------------------- VISC changes End ------------------ - - ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5 - %cmp44 = icmp eq i32 %k, 0 - br i1 %cmp44, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %mul = mul i32 %call2, %k - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ] - %0 = trunc i64 %indvars.iv to i32 - %add = add i32 %0, %mul - %mul4 = mul i32 %0, %n - %add5 = add i32 %mul4, %call1 - ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5 - %idxprom = zext i32 %add to i64 - %arrayidx = getelementptr inbounds float* %A, i64 %idxprom - %1 = load float* %arrayidx, align 4, !tbaa !0 - %idxprom11 = zext i32 %add5 to i64 - %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11 - %2 = load float* %arrayidx12, align 4, !tbaa !0 - %mul13 = fmul float %1, %2 - %add14 = fadd float %res.046, %mul13 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %k - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ] - ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0)) - %mul16 = mul i32 %call2, %n - %add17 = add i32 %mul16, %call1 - %idxprom18 = zext i32 %add17 to i64 - %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18 - store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0 - ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0)) - ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0)) - ret %rtype undef -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulInternal(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 16, i32 16) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - ret %rtype undef -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulInternal to i8*),i32 64, i32 64) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - ret %rtype undef -} - -; Function Attrs: noinline nounwind uwtable -;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 { -;entry: -; %cmp18 = icmp eq i32 %m, 0 -; %cmp215 = icmp eq i32 %n, 0 -; %or.cond = or i1 %cmp18, %cmp215 -; br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us -; -;for.inc4.us: ; preds = %for.body3.us -; %0 = extractvalue %rtype %call.us, 0 -; %1 = extractvalue %rtype %call.us, 1 -; %inc5.us = add i32 %i.019.us, 1 -; %exitcond24 = icmp eq i32 %inc5.us, %m -; br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us -; -;for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us -; %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ] -; %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef) -; %inc.us = add i32 %j.016.us, 1 -; %exitcond = icmp eq i32 %inc.us, %n -; br i1 %exitcond, label %for.inc4.us, label %for.body3.us -; -;for.body3.lr.ph.us: ; preds = %entry, %for.inc4.us -; %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ] -; br label %for.body3.us -; -;for.end6: ; preds = %for.inc4.us, %entry -; %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ] -; %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ] -; %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0 -; %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1 -; ret %rtype %.fca.1.insert -;} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { -entry: - tail call void @srand(i32 2006) #5 - %call = tail call noalias i8* @malloc(i64 4194304) #5 - %0 = bitcast i8* %call to float* - %call7 = tail call noalias i8* @malloc(i64 4194304) #5 - br label %for.body.i - -for.body.i: ; preds = %for.body.i, %entry - %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ] - %call.i = tail call i32 @rand() #5 - %conv.i = sitofp i32 %call.i to float - %div.i = fmul float %conv.i, 0x3E00000000000000 - %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i - store float %div.i, float* %arrayidx.i, align 4, !tbaa !0 - %indvars.iv.next.i = add i64 %indvars.iv.i, 1 - %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32 - %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576 - br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i - -for.body.i40.preheader: ; preds = %for.body.i - %1 = bitcast i8* %call7 to float* - br label %for.body.i40 - -for.body.i40: ; preds = %for.body.i40.preheader, %for.body.i40 - %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ] - %call.i33 = tail call i32 @rand() #5 - %conv.i34 = sitofp i32 %call.i33 to float - %div.i35 = fmul float %conv.i34, 0x3E00000000000000 - %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32 - store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0 - %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1048576 - br i1 %exitcond, label %randomInit.exit41, label %for.body.i40 - -randomInit.exit41: ; preds = %for.body.i40 - %call12 = tail call noalias i8* @malloc(i64 4194304) #5 - %2 = bitcast i8* %call12 to float* - - ; ---------------------------------- Adding VISC Launch Call -------------------------------- - ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024) - ; Setting up launch input args - call void @llvm.visc.init() - %in.addr = alloca %struct.arg - - ; Store arguments - %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - - store float* %0, float** %in.addr.h_A - store i64 4194304, i64* %in.addr.bytes_A - store float* %1, float** %in.addr.h_B - store i64 4194304, i64* %in.addr.bytes_B - store float* %2, float** %in.addr.h_C - store i64 4194304, i64* %in.addr.bytes_C - store i32 1024, i32* %in.addr.WA - store i32 1024, i32* %in.addr.WB - store i32 1024, i32* %in.addr.HA - - ; Change type to i8* and VISC Launch call - %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) - - ; Wait for result - call void @llvm.visc.wait(i8* %graphID) - - ; Get the result - %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - %out = load %rtype* %out.addr - call void @llvm.visc.cleanup() - ; -------------------------------- Completed VISC Launch Call -------------------------------- - - %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2) - %tobool = icmp eq i32 %call14, 0 - br i1 %tobool, label %if.else, label %if.then - -if.then: ; preds = %randomInit.exit41 - %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0)) - br label %if.end - -if.else: ; preds = %randomInit.exit41 - %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0)) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0)) - tail call void @free(i8* %call) #5 - tail call void @free(i8* %call7) #5 - tail call void @free(i8* %call12) #5 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @srand(i32) #1 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -declare float @fabsf(float) - -; Function Attrs: nounwind -declare i32 @puts(i8* nocapture) #5 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #5 = { nounwind } -attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!0 = metadata !{metadata !"float", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_host.ll b/hpvm/test/MatrixMultiplication/visc_gemm_2_level_host.ll deleted file mode 100644 index fc3db521db174e58626a5c4daf109061530bb250..0000000000000000000000000000000000000000 --- a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_host.ll +++ /dev/null @@ -1,456 +0,0 @@ -; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S %s -; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll -; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin -; RUN: %t.bin -; ModuleID = 'gemm_opencl.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1 -@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1 -@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1 -@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 -@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 -@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1 -@str = private unnamed_addr constant [17 x i8] c"Entered function\00" -@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00" -@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00" -@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00" -@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00" -@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00" -@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00" - -; Function Attrs: nounwind uwtable -define void @randomInit(float* nocapture %data, i32 %size) #0 { -entry: - %cmp3 = icmp sgt i32 %size, 0 - br i1 %cmp3, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - %call = tail call i32 @rand() #5 - %conv = sitofp i32 %call to float - %div = fmul float %conv, 0x3E00000000000000 - %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv - store float %div, float* %arrayidx, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %size - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; Function Attrs: nounwind -declare i32 @rand() #1 - -; Function Attrs: nounwind readnone uwtable -define i32 @isEqual(float %a, float %b) #2 { -entry: - %sub = fsub float %a, %b - %fabsf = tail call float @fabsf(float %sub) #6 - %0 = fpext float %fabsf to double - %cmp = fcmp olt double %0, 1.000000e-03 - %conv1 = zext i1 %cmp to i32 - ret i32 %conv1 -} - -; Function Attrs: noinline nounwind uwtable -define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 { -entry: - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry, %for.inc50 - %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ] - %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ] - %0 = shl nsw i64 %indvars.iv92, 10 - br label %for.body7 - -for.cond4: ; preds = %for.end - %inc48 = add nsw i32 %j.079, 1 - %1 = trunc i64 %indvars.iv.next89 to i32 - %cmp5 = icmp slt i32 %1, 1024 - br i1 %cmp5, label %for.body7, label %for.inc50 - -for.body7: ; preds = %for.cond4.preheader, %for.cond4 - %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ] - %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ] - %2 = add nsw i64 %indvars.iv88, %0 - br label %for.body12 - -for.body12: ; preds = %for.body12, %for.body7 - %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ] - %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ] - %4 = add nsw i64 %indvars.iv, %0 - %arrayidx16 = getelementptr inbounds float* %A, i64 %4 - %5 = load float* %arrayidx16, align 4, !tbaa !0 - %6 = shl i64 %indvars.iv, 10 - %7 = add nsw i64 %6, %indvars.iv88 - %arrayidx20 = getelementptr inbounds float* %B, i64 %7 - %8 = load float* %arrayidx20, align 4, !tbaa !0 - %mul21 = fmul float %5, %8 - %add26 = fadd float %3, %mul21 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1024 - br i1 %exitcond, label %for.end, label %for.body12 - -for.end: ; preds = %for.body12 - %arrayidx34 = getelementptr inbounds float* %C, i64 %2 - %9 = load float* %arrayidx34, align 4, !tbaa !0 - %sub.i = fsub float %add26, %9 - %fabsf.i = tail call float @fabsf(float %sub.i) #6 - %10 = fpext float %fabsf.i to double - %cmp.i = fcmp olt double %10, 1.000000e-03 - %indvars.iv.next89 = add i64 %indvars.iv88, 1 - br i1 %cmp.i, label %for.cond4, label %if.then - -if.then: ; preds = %for.end - %conv40 = fpext float %9 to double - %conv45 = fpext float %add26 to double - %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5 - br label %return - -for.inc50: ; preds = %for.cond4 - %indvars.iv.next93 = add i64 %indvars.iv92, 1 - %inc51 = add nsw i32 %i.081, 1 - %11 = trunc i64 %indvars.iv.next93 to i32 - %cmp = icmp slt i32 %11, 1024 - br i1 %cmp, label %for.cond4.preheader, label %return - -return: ; preds = %for.inc50, %if.then - %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] - ret i32 %retval.0 -} - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind -declare i32 @printf(i8* nocapture, ...) #1 - -; --------------- VISC Intrinsics --------------- -; Return Type of VISC Compute Matrix Mul -%rtype = type {} -%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }> - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) - -; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) -; ----------------- VISC intrinsics end ------------------ - -; Function Attrs: nounwind uwtable -define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { -entry: - ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0)) - - ; ------------------------- VISC changes ------------------ - ; Replace get_global_id calls with calls to getNode followed by getNumNodeInstances.x - ; Replaced statement -- - ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5 - ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5 - %this_node = call i8* @llvm.visc.getNode() - %Lx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) - %Ly = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) - %LLimitx = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) - %LLimity = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node) - - %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node) - %Gx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node) - %Gy = call i32 @llvm.visc.getNodeInstanceID.y(i8* %parent_node) - - %tmpx = mul i32 %Gx, %LLimitx - %tmpy = mul i32 %Gy, %LLimity - - %call1 = add i32 %tmpx, %Lx - %call2 = add i32 %tmpy, %Ly - - ;%printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call1) #5 - ;%printcall2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call2) #5 - - ; ---------------------- VISC changes End ------------------ - - ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5 - %cmp44 = icmp eq i32 %k, 0 - br i1 %cmp44, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %mul = mul i32 %call2, %k - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ] - %0 = trunc i64 %indvars.iv to i32 - %add = add i32 %0, %mul - %mul4 = mul i32 %0, %n - %add5 = add i32 %mul4, %call1 - ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5 - %idxprom = zext i32 %add to i64 - %arrayidx = getelementptr inbounds float* %A, i64 %idxprom - %1 = load float* %arrayidx, align 4, !tbaa !0 - %idxprom11 = zext i32 %add5 to i64 - %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11 - %2 = load float* %arrayidx12, align 4, !tbaa !0 - %mul13 = fmul float %1, %2 - %add14 = fadd float %res.046, %mul13 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %k - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ] - ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0)) - %mul16 = mul i32 %call2, %n - %add17 = add i32 %mul16, %call1 - %idxprom18 = zext i32 %add17 to i64 - %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18 - store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0 - ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0)) - ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0)) - ret %rtype undef -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulInternal(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 16, i32 16) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - ret %rtype undef -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulInternal to i8*),i32 64, i32 64) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - ret %rtype undef -} - -; Function Attrs: noinline nounwind uwtable -;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 { -;entry: -; %cmp18 = icmp eq i32 %m, 0 -; %cmp215 = icmp eq i32 %n, 0 -; %or.cond = or i1 %cmp18, %cmp215 -; br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us -; -;for.inc4.us: ; preds = %for.body3.us -; %0 = extractvalue %rtype %call.us, 0 -; %1 = extractvalue %rtype %call.us, 1 -; %inc5.us = add i32 %i.019.us, 1 -; %exitcond24 = icmp eq i32 %inc5.us, %m -; br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us -; -;for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us -; %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ] -; %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef) -; %inc.us = add i32 %j.016.us, 1 -; %exitcond = icmp eq i32 %inc.us, %n -; br i1 %exitcond, label %for.inc4.us, label %for.body3.us -; -;for.body3.lr.ph.us: ; preds = %entry, %for.inc4.us -; %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ] -; br label %for.body3.us -; -;for.end6: ; preds = %for.inc4.us, %entry -; %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ] -; %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ] -; %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0 -; %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1 -; ret %rtype %.fca.1.insert -;} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { -entry: - tail call void @srand(i32 2006) #5 - %call = tail call noalias i8* @malloc(i64 4194304) #5 - %0 = bitcast i8* %call to float* - %call7 = tail call noalias i8* @malloc(i64 4194304) #5 - br label %for.body.i - -for.body.i: ; preds = %for.body.i, %entry - %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ] - %call.i = tail call i32 @rand() #5 - %conv.i = sitofp i32 %call.i to float - %div.i = fmul float %conv.i, 0x3E00000000000000 - %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i - store float %div.i, float* %arrayidx.i, align 4, !tbaa !0 - %indvars.iv.next.i = add i64 %indvars.iv.i, 1 - %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32 - %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576 - br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i - -for.body.i40.preheader: ; preds = %for.body.i - %1 = bitcast i8* %call7 to float* - br label %for.body.i40 - -for.body.i40: ; preds = %for.body.i40.preheader, %for.body.i40 - %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ] - %call.i33 = tail call i32 @rand() #5 - %conv.i34 = sitofp i32 %call.i33 to float - %div.i35 = fmul float %conv.i34, 0x3E00000000000000 - %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32 - store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0 - %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1048576 - br i1 %exitcond, label %randomInit.exit41, label %for.body.i40 - -randomInit.exit41: ; preds = %for.body.i40 - %call12 = tail call noalias i8* @malloc(i64 4194304) #5 - %2 = bitcast i8* %call12 to float* - - ; ---------------------------------- Adding VISC Launch Call -------------------------------- - ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024) - ; Setting up launch input args - call void @llvm.visc.init() - %in.addr = alloca %struct.arg - - ; Store arguments - %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - - store float* %0, float** %in.addr.h_A - store i64 4194304, i64* %in.addr.bytes_A - store float* %1, float** %in.addr.h_B - store i64 4194304, i64* %in.addr.bytes_B - store float* %2, float** %in.addr.h_C - store i64 4194304, i64* %in.addr.bytes_C - store i32 1024, i32* %in.addr.WA - store i32 1024, i32* %in.addr.WB - store i32 1024, i32* %in.addr.HA - - ; Change type to i8* and VISC Launch call - %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) - - ; Wait for result - call void @llvm.visc.wait(i8* %graphID) - - ; Get the result - %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - %out = load %rtype* %out.addr - call void @llvm.visc.cleanup() - ; -------------------------------- Completed VISC Launch Call -------------------------------- - - %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2) - %tobool = icmp eq i32 %call14, 0 - br i1 %tobool, label %if.else, label %if.then - -if.then: ; preds = %randomInit.exit41 - %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0)) - br label %if.end - -if.else: ; preds = %randomInit.exit41 - %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0)) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0)) - tail call void @free(i8* %call) #5 - tail call void @free(i8* %call7) #5 - tail call void @free(i8* %call12) #5 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @srand(i32) #1 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -declare float @fabsf(float) - -; Function Attrs: nounwind -declare i32 @puts(i8* nocapture) #5 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #5 = { nounwind } -attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!0 = metadata !{metadata !"float", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_outedge.ll b/hpvm/test/MatrixMultiplication/visc_gemm_2_level_outedge.ll deleted file mode 100644 index b51727e2a674de58cb83cbba2baf024b7aaebfab..0000000000000000000000000000000000000000 --- a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_outedge.ll +++ /dev/null @@ -1,472 +0,0 @@ -; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s -; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc -; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s -; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll -; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin -; RUN: %t.bin -; ModuleID = 'gemm_opencl.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1 -@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1 -@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1 -@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 -@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 -@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1 -@str = private unnamed_addr constant [17 x i8] c"Entered function\00" -@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00" -@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00" -@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00" -@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00" -@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00" -@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00" - -; Function Attrs: nounwind uwtable -define void @randomInit(float* nocapture %data, i32 %size) #0 { -entry: - %cmp3 = icmp sgt i32 %size, 0 - br i1 %cmp3, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - %call = tail call i32 @rand() #5 - %conv = sitofp i32 %call to float - %div = fmul float %conv, 0x3E00000000000000 - %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv - store float %div, float* %arrayidx, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %size - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; Function Attrs: nounwind -declare i32 @rand() #1 - -; Function Attrs: nounwind readnone uwtable -define i32 @isEqual(float %a, float %b) #2 { -entry: - %sub = fsub float %a, %b - %fabsf = tail call float @fabsf(float %sub) #6 - %0 = fpext float %fabsf to double - %cmp = fcmp olt double %0, 1.000000e-03 - %conv1 = zext i1 %cmp to i32 - ret i32 %conv1 -} - -; Function Attrs: noinline nounwind uwtable -define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 { -entry: - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry, %for.inc50 - %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ] - %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ] - %0 = shl nsw i64 %indvars.iv92, 10 - br label %for.body7 - -for.cond4: ; preds = %for.end - %inc48 = add nsw i32 %j.079, 1 - %1 = trunc i64 %indvars.iv.next89 to i32 - %cmp5 = icmp slt i32 %1, 1024 - br i1 %cmp5, label %for.body7, label %for.inc50 - -for.body7: ; preds = %for.cond4.preheader, %for.cond4 - %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ] - %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ] - %2 = add nsw i64 %indvars.iv88, %0 - br label %for.body12 - -for.body12: ; preds = %for.body12, %for.body7 - %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ] - %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ] - %4 = add nsw i64 %indvars.iv, %0 - %arrayidx16 = getelementptr inbounds float* %A, i64 %4 - %5 = load float* %arrayidx16, align 4, !tbaa !0 - %6 = shl i64 %indvars.iv, 10 - %7 = add nsw i64 %6, %indvars.iv88 - %arrayidx20 = getelementptr inbounds float* %B, i64 %7 - %8 = load float* %arrayidx20, align 4, !tbaa !0 - %mul21 = fmul float %5, %8 - %add26 = fadd float %3, %mul21 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1024 - br i1 %exitcond, label %for.end, label %for.body12 - -for.end: ; preds = %for.body12 - %arrayidx34 = getelementptr inbounds float* %C, i64 %2 - %9 = load float* %arrayidx34, align 4, !tbaa !0 - %sub.i = fsub float %add26, %9 - %fabsf.i = tail call float @fabsf(float %sub.i) #6 - %10 = fpext float %fabsf.i to double - %cmp.i = fcmp olt double %10, 1.000000e-03 - %indvars.iv.next89 = add i64 %indvars.iv88, 1 - br i1 %cmp.i, label %for.cond4, label %if.then - -if.then: ; preds = %for.end - %conv40 = fpext float %9 to double - %conv45 = fpext float %add26 to double - %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5 - br label %return - -for.inc50: ; preds = %for.cond4 - %indvars.iv.next93 = add i64 %indvars.iv92, 1 - %inc51 = add nsw i32 %i.081, 1 - %11 = trunc i64 %indvars.iv.next93 to i32 - %cmp = icmp slt i32 %11, 1024 - br i1 %cmp, label %for.cond4.preheader, label %return - -return: ; preds = %for.inc50, %if.then - %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] - ret i32 %retval.0 -} - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind -declare i32 @printf(i8* nocapture, ...) #1 - -; --------------- VISC Intrinsics --------------- -; Return Type of VISC Compute Matrix Mul -%rtype = type <{ i32, i32 }> - -%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }> - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) - -; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) -; ----------------- VISC intrinsics end ------------------ - -; Function Attrs: nounwind uwtable -define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { -entry: - ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0)) - - ; ------------------------- VISC changes ------------------ - ; Replace get_global_id calls with calls to getNode followed by getNumNodeInstances.x - ; Replaced statement -- - ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5 - ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5 - %this_node = call i8* @llvm.visc.getNode() - %Lx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) - %Ly = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) - %LLimitx = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) - %LLimity = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node) - - %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node) - %Gx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node) - %Gy = call i32 @llvm.visc.getNodeInstanceID.y(i8* %parent_node) - - %tmpx = mul i32 %Gx, %LLimitx - %tmpy = mul i32 %Gy, %LLimity - - %call1 = add i32 %tmpx, %Lx - %call2 = add i32 %tmpy, %Ly - - ;%printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call1) #5 - ;%printcall2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call2) #5 - - ; ---------------------- VISC changes End ------------------ - - ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5 - %cmp44 = icmp eq i32 %k, 0 - br i1 %cmp44, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %mul = mul i32 %call2, %k - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ] - %0 = trunc i64 %indvars.iv to i32 - %add = add i32 %0, %mul - %mul4 = mul i32 %0, %n - %add5 = add i32 %mul4, %call1 - ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5 - %idxprom = zext i32 %add to i64 - %arrayidx = getelementptr inbounds float* %A, i64 %idxprom - %1 = load float* %arrayidx, align 4, !tbaa !0 - %idxprom11 = zext i32 %add5 to i64 - %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11 - %2 = load float* %arrayidx12, align 4, !tbaa !0 - %mul13 = fmul float %1, %2 - %add14 = fadd float %res.046, %mul13 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %k - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ] - ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0)) - %mul16 = mul i32 %call2, %n - %add17 = add i32 %mul16, %call1 - %idxprom18 = zext i32 %add17 to i64 - %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18 - store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0 - ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0)) - ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0)) - - ; Generating bogus output to test correct output mapping - %tmp1 = insertvalue %rtype undef, i32 0, 0 - %tmp2 = insertvalue %rtype %tmp1, i32 1, 1 - ret %rtype %tmp2 -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulInternal(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 16, i32 16) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 1); 1 - call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 0); 0 - ret %rtype undef -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulInternal to i8*),i32 64, i32 64) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); 0 - call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1); 1 - ret %rtype undef -} - -; Function Attrs: noinline nounwind uwtable -;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 { -;entry: -; %cmp18 = icmp eq i32 %m, 0 -; %cmp215 = icmp eq i32 %n, 0 -; %or.cond = or i1 %cmp18, %cmp215 -; br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us -; -;for.inc4.us: ; preds = %for.body3.us -; %0 = extractvalue %rtype %call.us, 0 -; %1 = extractvalue %rtype %call.us, 1 -; %inc5.us = add i32 %i.019.us, 1 -; %exitcond24 = icmp eq i32 %inc5.us, %m -; br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us -; -;for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us -; %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ] -; %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef) -; %inc.us = add i32 %j.016.us, 1 -; %exitcond = icmp eq i32 %inc.us, %n -; br i1 %exitcond, label %for.inc4.us, label %for.body3.us -; -;for.body3.lr.ph.us: ; preds = %entry, %for.inc4.us -; %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ] -; br label %for.body3.us -; -;for.end6: ; preds = %for.inc4.us, %entry -; %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ] -; %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ] -; %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0 -; %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1 -; ret %rtype %.fca.1.insert -;} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { -entry: - tail call void @srand(i32 2006) #5 - %call = tail call noalias i8* @malloc(i64 4194304) #5 - %0 = bitcast i8* %call to float* - %call7 = tail call noalias i8* @malloc(i64 4194304) #5 - br label %for.body.i - -for.body.i: ; preds = %for.body.i, %entry - %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ] - %call.i = tail call i32 @rand() #5 - %conv.i = sitofp i32 %call.i to float - %div.i = fmul float %conv.i, 0x3E00000000000000 - %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i - store float %div.i, float* %arrayidx.i, align 4, !tbaa !0 - %indvars.iv.next.i = add i64 %indvars.iv.i, 1 - %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32 - %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576 - br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i - -for.body.i40.preheader: ; preds = %for.body.i - %1 = bitcast i8* %call7 to float* - br label %for.body.i40 - -for.body.i40: ; preds = %for.body.i40.preheader, %for.body.i40 - %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ] - %call.i33 = tail call i32 @rand() #5 - %conv.i34 = sitofp i32 %call.i33 to float - %div.i35 = fmul float %conv.i34, 0x3E00000000000000 - %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32 - store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0 - %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1048576 - br i1 %exitcond, label %randomInit.exit41, label %for.body.i40 - -randomInit.exit41: ; preds = %for.body.i40 - %call12 = tail call noalias i8* @malloc(i64 4194304) #5 - %2 = bitcast i8* %call12 to float* - - ; ---------------------------------- Adding VISC Launch Call -------------------------------- - ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024) - ; Setting up launch input args - call void @llvm.visc.init() - %in.addr = alloca %struct.arg - - ; Store arguments - %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - - store float* %0, float** %in.addr.h_A - store i64 4194304, i64* %in.addr.bytes_A - store float* %1, float** %in.addr.h_B - store i64 4194304, i64* %in.addr.bytes_B - store float* %2, float** %in.addr.h_C - store i64 4194304, i64* %in.addr.bytes_C - store i32 1024, i32* %in.addr.WA - store i32 1024, i32* %in.addr.WB - store i32 1024, i32* %in.addr.HA - - ; Change type to i8* and VISC Launch call - %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) - - ; Wait for result - call void @llvm.visc.wait(i8* %graphID) - - ; Get the result - %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - %out = load %rtype* %out.addr - %output_0 = extractvalue %rtype %out, 0 - %output_1 = extractvalue %rtype %out, 1 - - %printcall0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %output_0) #5 - %printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %output_1) #5 - call void @llvm.visc.cleanup() - ; -------------------------------- Completed VISC Launch Call -------------------------------- - - %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2) - %tobool = icmp eq i32 %call14, 0 - br i1 %tobool, label %if.else, label %if.then - -if.then: ; preds = %randomInit.exit41 - %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0)) - br label %if.end - -if.else: ; preds = %randomInit.exit41 - %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0)) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0)) - tail call void @free(i8* %call) #5 - tail call void @free(i8* %call7) #5 - tail call void @free(i8* %call12) #5 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @srand(i32) #1 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -declare float @fabsf(float) - -; Function Attrs: nounwind -declare i32 @puts(i8* nocapture) #5 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #5 = { nounwind } -attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!0 = metadata !{metadata !"float", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_param.ll b/hpvm/test/MatrixMultiplication/visc_gemm_2_level_param.ll deleted file mode 100644 index 30ccd8cc4c33e9287a0673c9ce537e6ccfb24b21..0000000000000000000000000000000000000000 --- a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_param.ll +++ /dev/null @@ -1,463 +0,0 @@ -; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s -; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc -; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s -; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll -; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin -; RUN: %t.bin -; ModuleID = 'gemm_opencl.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1 -@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1 -@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1 -@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 -@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 -@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1 -@str = private unnamed_addr constant [17 x i8] c"Entered function\00" -@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00" -@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00" -@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00" -@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00" -@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00" -@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00" - -; Function Attrs: nounwind uwtable -define void @randomInit(float* nocapture %data, i32 %size) #0 { -entry: - %cmp3 = icmp sgt i32 %size, 0 - br i1 %cmp3, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - %call = tail call i32 @rand() #5 - %conv = sitofp i32 %call to float - %div = fmul float %conv, 0x3E00000000000000 - %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv - store float %div, float* %arrayidx, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %size - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; Function Attrs: nounwind -declare i32 @rand() #1 - -; Function Attrs: nounwind readnone uwtable -define i32 @isEqual(float %a, float %b) #2 { -entry: - %sub = fsub float %a, %b - %fabsf = tail call float @fabsf(float %sub) #6 - %0 = fpext float %fabsf to double - %cmp = fcmp olt double %0, 1.000000e-03 - %conv1 = zext i1 %cmp to i32 - ret i32 %conv1 -} - -; Function Attrs: noinline nounwind uwtable -define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 { -entry: - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry, %for.inc50 - %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ] - %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ] - %0 = shl nsw i64 %indvars.iv92, 10 - br label %for.body7 - -for.cond4: ; preds = %for.end - %inc48 = add nsw i32 %j.079, 1 - %1 = trunc i64 %indvars.iv.next89 to i32 - %cmp5 = icmp slt i32 %1, 1024 - br i1 %cmp5, label %for.body7, label %for.inc50 - -for.body7: ; preds = %for.cond4.preheader, %for.cond4 - %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ] - %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ] - %2 = add nsw i64 %indvars.iv88, %0 - br label %for.body12 - -for.body12: ; preds = %for.body12, %for.body7 - %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ] - %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ] - %4 = add nsw i64 %indvars.iv, %0 - %arrayidx16 = getelementptr inbounds float* %A, i64 %4 - %5 = load float* %arrayidx16, align 4, !tbaa !0 - %6 = shl i64 %indvars.iv, 10 - %7 = add nsw i64 %6, %indvars.iv88 - %arrayidx20 = getelementptr inbounds float* %B, i64 %7 - %8 = load float* %arrayidx20, align 4, !tbaa !0 - %mul21 = fmul float %5, %8 - %add26 = fadd float %3, %mul21 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1024 - br i1 %exitcond, label %for.end, label %for.body12 - -for.end: ; preds = %for.body12 - %arrayidx34 = getelementptr inbounds float* %C, i64 %2 - %9 = load float* %arrayidx34, align 4, !tbaa !0 - %sub.i = fsub float %add26, %9 - %fabsf.i = tail call float @fabsf(float %sub.i) #6 - %10 = fpext float %fabsf.i to double - %cmp.i = fcmp olt double %10, 1.000000e-03 - %indvars.iv.next89 = add i64 %indvars.iv88, 1 - br i1 %cmp.i, label %for.cond4, label %if.then - -if.then: ; preds = %for.end - %conv40 = fpext float %9 to double - %conv45 = fpext float %add26 to double - %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5 - br label %return - -for.inc50: ; preds = %for.cond4 - %indvars.iv.next93 = add i64 %indvars.iv92, 1 - %inc51 = add nsw i32 %i.081, 1 - %11 = trunc i64 %indvars.iv.next93 to i32 - %cmp = icmp slt i32 %11, 1024 - br i1 %cmp, label %for.cond4.preheader, label %return - -return: ; preds = %for.inc50, %if.then - %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] - ret i32 %retval.0 -} - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind -declare i32 @printf(i8* nocapture, ...) #1 - -; --------------- VISC Intrinsics --------------- -; Return Type of VISC Compute Matrix Mul -%rtype = type {} -%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, i32, i32, %rtype }> - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) - -; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) -; ----------------- VISC intrinsics end ------------------ - -; Function Attrs: nounwind uwtable -define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { -entry: - ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0)) - - ; ------------------------- VISC changes ------------------ - ; Replace get_global_id calls with calls to getNode followed by getNumNodeInstances.x - ; Replaced statement -- - ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5 - ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5 - %this_node = call i8* @llvm.visc.getNode() - %Lx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) - %Ly = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) - %LLimitx = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) - %LLimity = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node) - - %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node) - %Gx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node) - %Gy = call i32 @llvm.visc.getNodeInstanceID.y(i8* %parent_node) - - %tmpx = mul i32 %Gx, %LLimitx - %tmpy = mul i32 %Gy, %LLimity - - %call1 = add i32 %tmpx, %Lx - %call2 = add i32 %tmpy, %Ly - - ;%printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call1) #5 - ;%printcall2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call2) #5 - - ; ---------------------- VISC changes End ------------------ - - ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5 - %cmp44 = icmp eq i32 %k, 0 - br i1 %cmp44, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %mul = mul i32 %call2, %k - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ] - %0 = trunc i64 %indvars.iv to i32 - %add = add i32 %0, %mul - %mul4 = mul i32 %0, %n - %add5 = add i32 %mul4, %call1 - ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5 - %idxprom = zext i32 %add to i64 - %arrayidx = getelementptr inbounds float* %A, i64 %idxprom - %1 = load float* %arrayidx, align 4, !tbaa !0 - %idxprom11 = zext i32 %add5 to i64 - %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11 - %2 = load float* %arrayidx12, align 4, !tbaa !0 - %mul13 = fmul float %1, %2 - %add14 = fadd float %res.046, %mul13 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %k - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ] - ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0)) - %mul16 = mul i32 %call2, %n - %add17 = add i32 %mul16, %call1 - %idxprom18 = zext i32 %add17 to i64 - %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18 - store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0 - ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0)) - ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0)) - ret %rtype undef -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulInternal(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA, i32 %blocksize) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 %blocksize, i32 %blocksize) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - ret %rtype undef -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA, i32 %gridsize, i32 %blocksize) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @MatrixMulInternal to i8*),i32 %gridsize, i32 %gridsize) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - call void @llvm.visc.bind.input(i8* %kernel, i32 10, i32 9); blocksize - ; Bind Outputs - ret %rtype undef -} - -; Function Attrs: noinline nounwind uwtable -;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 { -;entry: -; %cmp18 = icmp eq i32 %m, 0 -; %cmp215 = icmp eq i32 %n, 0 -; %or.cond = or i1 %cmp18, %cmp215 -; br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us -; -;for.inc4.us: ; preds = %for.body3.us -; %0 = extractvalue %rtype %call.us, 0 -; %1 = extractvalue %rtype %call.us, 1 -; %inc5.us = add i32 %i.019.us, 1 -; %exitcond24 = icmp eq i32 %inc5.us, %m -; br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us -; -;for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us -; %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ] -; %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef) -; %inc.us = add i32 %j.016.us, 1 -; %exitcond = icmp eq i32 %inc.us, %n -; br i1 %exitcond, label %for.inc4.us, label %for.body3.us -; -;for.body3.lr.ph.us: ; preds = %entry, %for.inc4.us -; %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ] -; br label %for.body3.us -; -;for.end6: ; preds = %for.inc4.us, %entry -; %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ] -; %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ] -; %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0 -; %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1 -; ret %rtype %.fca.1.insert -;} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { -entry: - tail call void @srand(i32 2006) #5 - %call = tail call noalias i8* @malloc(i64 4194304) #5 - %0 = bitcast i8* %call to float* - %call7 = tail call noalias i8* @malloc(i64 4194304) #5 - br label %for.body.i - -for.body.i: ; preds = %for.body.i, %entry - %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ] - %call.i = tail call i32 @rand() #5 - %conv.i = sitofp i32 %call.i to float - %div.i = fmul float %conv.i, 0x3E00000000000000 - %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i - store float %div.i, float* %arrayidx.i, align 4, !tbaa !0 - %indvars.iv.next.i = add i64 %indvars.iv.i, 1 - %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32 - %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576 - br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i - -for.body.i40.preheader: ; preds = %for.body.i - %1 = bitcast i8* %call7 to float* - br label %for.body.i40 - -for.body.i40: ; preds = %for.body.i40.preheader, %for.body.i40 - %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ] - %call.i33 = tail call i32 @rand() #5 - %conv.i34 = sitofp i32 %call.i33 to float - %div.i35 = fmul float %conv.i34, 0x3E00000000000000 - %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32 - store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0 - %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1048576 - br i1 %exitcond, label %randomInit.exit41, label %for.body.i40 - -randomInit.exit41: ; preds = %for.body.i40 - %call12 = tail call noalias i8* @malloc(i64 4194304) #5 - %2 = bitcast i8* %call12 to float* - - ; ---------------------------------- Adding VISC Launch Call -------------------------------- - ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024) - ; Setting up launch input args - call void @llvm.visc.init() - %in.addr = alloca %struct.arg - - ; Store arguments - %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - %in.addr.gridsize = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - %in.addr.blocksize = getelementptr %struct.arg* %in.addr, i32 0, i32 10 - - store float* %0, float** %in.addr.h_A - store i64 4194304, i64* %in.addr.bytes_A - store float* %1, float** %in.addr.h_B - store i64 4194304, i64* %in.addr.bytes_B - store float* %2, float** %in.addr.h_C - store i64 4194304, i64* %in.addr.bytes_C - store i32 1024, i32* %in.addr.WA - store i32 1024, i32* %in.addr.WB - store i32 1024, i32* %in.addr.HA - store i32 64, i32* %in.addr.gridsize - store i32 16, i32* %in.addr.blocksize - - ; Change type to i8* and VISC Launch call - %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) - - ; Wait for result - call void @llvm.visc.wait(i8* %graphID) - - ; Get the result - %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 11 - %out = load %rtype* %out.addr - call void @llvm.visc.cleanup() - ; -------------------------------- Completed VISC Launch Call -------------------------------- - - %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2) - %tobool = icmp eq i32 %call14, 0 - br i1 %tobool, label %if.else, label %if.then - -if.then: ; preds = %randomInit.exit41 - %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0)) - br label %if.end - -if.else: ; preds = %randomInit.exit41 - %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0)) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0)) - tail call void @free(i8* %call) #5 - tail call void @free(i8* %call7) #5 - tail call void @free(i8* %call12) #5 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @srand(i32) #1 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -declare float @fabsf(float) - -; Function Attrs: nounwind -declare i32 @puts(i8* nocapture) #5 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #5 = { nounwind } -attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!0 = metadata !{metadata !"float", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_ptx.ll b/hpvm/test/MatrixMultiplication/visc_gemm_ptx.ll deleted file mode 100644 index 0c3bc24f9dc5575783e3002115cc8976dfb3325a..0000000000000000000000000000000000000000 --- a/hpvm/test/MatrixMultiplication/visc_gemm_ptx.ll +++ /dev/null @@ -1,419 +0,0 @@ -; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s -; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc -; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s -; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll -; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin -; RUN: %t.bin -; ModuleID = 'gemm_opencl.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1 -@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1 -@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1 -@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 -@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 -@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1 -@str = private unnamed_addr constant [17 x i8] c"Entered function\00" -@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00" -@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00" -@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00" -@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00" -@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00" -@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00" - -; Function Attrs: nounwind uwtable -define void @randomInit(float* nocapture %data, i32 %size) #0 { -entry: - %cmp3 = icmp sgt i32 %size, 0 - br i1 %cmp3, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - %call = tail call i32 @rand() #5 - %conv = sitofp i32 %call to float - %div = fmul float %conv, 0x3E00000000000000 - %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv - store float %div, float* %arrayidx, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %size - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; Function Attrs: nounwind -declare i32 @rand() #1 - -; Function Attrs: nounwind readnone uwtable -define i32 @isEqual(float %a, float %b) #2 { -entry: - %sub = fsub float %a, %b - %fabsf = tail call float @fabsf(float %sub) #6 - %0 = fpext float %fabsf to double - %cmp = fcmp olt double %0, 1.000000e-03 - %conv1 = zext i1 %cmp to i32 - ret i32 %conv1 -} - -; Function Attrs: noinline nounwind uwtable -define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 { -entry: - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry, %for.inc50 - %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ] - %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ] - %0 = shl nsw i64 %indvars.iv92, 10 - br label %for.body7 - -for.cond4: ; preds = %for.end - %inc48 = add nsw i32 %j.079, 1 - %1 = trunc i64 %indvars.iv.next89 to i32 - %cmp5 = icmp slt i32 %1, 1024 - br i1 %cmp5, label %for.body7, label %for.inc50 - -for.body7: ; preds = %for.cond4.preheader, %for.cond4 - %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ] - %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ] - %2 = add nsw i64 %indvars.iv88, %0 - br label %for.body12 - -for.body12: ; preds = %for.body12, %for.body7 - %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ] - %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ] - %4 = add nsw i64 %indvars.iv, %0 - %arrayidx16 = getelementptr inbounds float* %A, i64 %4 - %5 = load float* %arrayidx16, align 4, !tbaa !0 - %6 = shl i64 %indvars.iv, 10 - %7 = add nsw i64 %6, %indvars.iv88 - %arrayidx20 = getelementptr inbounds float* %B, i64 %7 - %8 = load float* %arrayidx20, align 4, !tbaa !0 - %mul21 = fmul float %5, %8 - %add26 = fadd float %3, %mul21 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1024 - br i1 %exitcond, label %for.end, label %for.body12 - -for.end: ; preds = %for.body12 - %arrayidx34 = getelementptr inbounds float* %C, i64 %2 - %9 = load float* %arrayidx34, align 4, !tbaa !0 - %sub.i = fsub float %add26, %9 - %fabsf.i = tail call float @fabsf(float %sub.i) #6 - %10 = fpext float %fabsf.i to double - %cmp.i = fcmp olt double %10, 1.000000e-03 - %indvars.iv.next89 = add i64 %indvars.iv88, 1 - br i1 %cmp.i, label %for.cond4, label %if.then - -if.then: ; preds = %for.end - %conv40 = fpext float %9 to double - %conv45 = fpext float %add26 to double - %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5 - br label %return - -for.inc50: ; preds = %for.cond4 - %indvars.iv.next93 = add i64 %indvars.iv92, 1 - %inc51 = add nsw i32 %i.081, 1 - %11 = trunc i64 %indvars.iv.next93 to i32 - %cmp = icmp slt i32 %11, 1024 - br i1 %cmp, label %for.cond4.preheader, label %return - -return: ; preds = %for.inc50, %if.then - %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] - ret i32 %retval.0 -} - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind -declare i32 @printf(i8* nocapture, ...) #1 - -; --------------- VISC Intrinsics --------------- -; Return Type of VISC Compute Matrix Mul -%rtype = type {} -%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }> - -; Function Attrs: nounwind -declare void @llvm.visc.init() #1 - -; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #1 - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) - -; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) -; ----------------- VISC intrinsics end ------------------ - -; Function Attrs: nounwind uwtable -define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { -entry: - ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0)) - - ; ------------------------- VISC changes ------------------ - ; Replace get_global_id calls with calls to getNode followed by getNumNodeInstances.x - ; Replaced statement -- - ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5 - ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5 - %this_node = call i8* @llvm.visc.getNode() - %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) - %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) - ; ---------------------- VISC changes End ------------------ - - ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5 - %cmp44 = icmp eq i32 %k, 0 - br i1 %cmp44, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %mul = mul i32 %call2, %k - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ] - %0 = trunc i64 %indvars.iv to i32 - %add = add i32 %0, %mul - %mul4 = mul i32 %0, %n - %add5 = add i32 %mul4, %call1 - ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5 - %idxprom = zext i32 %add to i64 - %arrayidx = getelementptr inbounds float* %A, i64 %idxprom - %1 = load float* %arrayidx, align 4, !tbaa !0 - %idxprom11 = zext i32 %add5 to i64 - %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11 - %2 = load float* %arrayidx12, align 4, !tbaa !0 - %mul13 = fmul float %1, %2 - %add14 = fadd float %res.046, %mul13 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %k - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ] - ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0)) - %mul16 = mul i32 %call2, %n - %add17 = add i32 %mul16, %call1 - %idxprom18 = zext i32 %add17 to i64 - %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18 - store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0 - ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0)) - ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0)) - ret %rtype undef -} - -; ----------------- VISC SGEMM root node ---------------- -define %rtype @MatrixMulRoot(float* %h_A, i64 %bytes_A, float* %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - ret %rtype undef -} - -; Function Attrs: noinline nounwind uwtable -;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 { -;entry: -; %cmp18 = icmp eq i32 %m, 0 -; %cmp215 = icmp eq i32 %n, 0 -; %or.cond = or i1 %cmp18, %cmp215 -; br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us -; -;for.inc4.us: ; preds = %for.body3.us -; %0 = extractvalue %rtype %call.us, 0 -; %1 = extractvalue %rtype %call.us, 1 -; %inc5.us = add i32 %i.019.us, 1 -; %exitcond24 = icmp eq i32 %inc5.us, %m -; br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us -; -;for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us -; %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ] -; %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef) -; %inc.us = add i32 %j.016.us, 1 -; %exitcond = icmp eq i32 %inc.us, %n -; br i1 %exitcond, label %for.inc4.us, label %for.body3.us -; -;for.body3.lr.ph.us: ; preds = %entry, %for.inc4.us -; %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ] -; br label %for.body3.us -; -;for.end6: ; preds = %for.inc4.us, %entry -; %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ] -; %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ] -; %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0 -; %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1 -; ret %rtype %.fca.1.insert -;} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { -entry: - tail call void @srand(i32 2006) #5 - %call = tail call noalias i8* @malloc(i64 4194304) #5 - %0 = bitcast i8* %call to float* - %call7 = tail call noalias i8* @malloc(i64 4194304) #5 - br label %for.body.i - -for.body.i: ; preds = %for.body.i, %entry - %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ] - %call.i = tail call i32 @rand() #5 - %conv.i = sitofp i32 %call.i to float - %div.i = fmul float %conv.i, 0x3E00000000000000 - %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i - store float %div.i, float* %arrayidx.i, align 4, !tbaa !0 - %indvars.iv.next.i = add i64 %indvars.iv.i, 1 - %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32 - %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576 - br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i - -for.body.i40.preheader: ; preds = %for.body.i - %1 = bitcast i8* %call7 to float* - br label %for.body.i40 - -for.body.i40: ; preds = %for.body.i40.preheader, %for.body.i40 - %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ] - %call.i33 = tail call i32 @rand() #5 - %conv.i34 = sitofp i32 %call.i33 to float - %div.i35 = fmul float %conv.i34, 0x3E00000000000000 - %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32 - store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0 - %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1048576 - br i1 %exitcond, label %randomInit.exit41, label %for.body.i40 - -randomInit.exit41: ; preds = %for.body.i40 - %call12 = tail call noalias i8* @malloc(i64 4194304) #5 - %2 = bitcast i8* %call12 to float* - - ; ---------------------------------- Adding VISC Launch Call -------------------------------- - ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024) - ; Setting up launch input args - call void @llvm.visc.init() - %in.addr = alloca %struct.arg - - ; Store arguments - %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - - store float* %0, float** %in.addr.h_A - store i64 4194304, i64* %in.addr.bytes_A - store float* %1, float** %in.addr.h_B - store i64 4194304, i64* %in.addr.bytes_B - store float* %2, float** %in.addr.h_C - store i64 4194304, i64* %in.addr.bytes_C - store i32 1024, i32* %in.addr.WA - store i32 1024, i32* %in.addr.WB - store i32 1024, i32* %in.addr.HA - - ; Change type to i8* and VISC Launch call - %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) - - ; Wait for result - call void @llvm.visc.wait(i8* %graphID) - - ; Get the result - %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - %out = load %rtype* %out.addr - call void @llvm.visc.cleanup() - ; -------------------------------- Completed VISC Launch Call -------------------------------- - - %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2) - %tobool = icmp eq i32 %call14, 0 - br i1 %tobool, label %if.else, label %if.then - -if.then: ; preds = %randomInit.exit41 - %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0)) - br label %if.end - -if.else: ; preds = %randomInit.exit41 - %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0)) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0)) - tail call void @free(i8* %call) #5 - tail call void @free(i8* %call7) #5 - tail call void @free(i8* %call12) #5 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @srand(i32) #1 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -declare float @fabsf(float) - -; Function Attrs: nounwind -declare i32 @puts(i8* nocapture) #5 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #5 = { nounwind } -attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!0 = metadata !{metadata !"float", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/hpvm/test/edge/CMakeLists.txt b/hpvm/test/edge/CMakeLists.txt deleted file mode 100644 index 24dda49022d15ee1c1b3551046780aefe25595e8..0000000000000000000000000000000000000000 --- a/hpvm/test/edge/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -cmake_minimum_required(VERSION 2.8) -project( edgeDetection ) -find_package( OpenCV REQUIRED ) -find_package( Threads REQUIRED ) -add_executable( PipelineEdgeDetect PipelineEdgeDetect.cpp ) -add_executable( EdgeDetect EdgeDetect.cpp ) -target_link_libraries( PipelineEdgeDetect ${OpenCV_LIBS} ${CMAKE_THREAD_LIBS_INIT} ) -target_link_libraries( EdgeDetect ${OpenCV_LIBS} ) -SET( CMAKE_CXX_FLAGS "-std=c++0x" ) diff --git a/hpvm/test/edge/ESO_Very_Large_Telescope.jpg b/hpvm/test/edge/ESO_Very_Large_Telescope.jpg deleted file mode 100644 index 3aed6db383661c42ac69ebfea66b556c66ec934e..0000000000000000000000000000000000000000 Binary files a/hpvm/test/edge/ESO_Very_Large_Telescope.jpg and /dev/null differ diff --git a/hpvm/test/edge/EdgeDetect.cpp b/hpvm/test/edge/EdgeDetect.cpp deleted file mode 100644 index 62cca0cdd0be007087a2e90749289d2c3321354f..0000000000000000000000000000000000000000 --- a/hpvm/test/edge/EdgeDetect.cpp +++ /dev/null @@ -1,175 +0,0 @@ -#include "opencv2/opencv.hpp" -#include "opencv2/core/ocl.hpp" -#include <iostream> -#include <stdlib.h> -#include <stdio.h> -#include <math.h> -#include "time.h" - -#define NUM_FRAMES 200 - -using namespace cv; -using namespace std; - - -/// Global variables -string window_name = "Edge Map"; - - -UMat* GaussianSmoothening(UMat* I, float Sn) { - // Gaussian Smoothening - UMat* IBlur = new UMat(I->rows, I->cols, I->type()); - int kernelSize = 2*ceil(3*Sn)+1; - GaussianBlur(*I, *IBlur, Size(kernelSize, kernelSize), Sn); - return IBlur; -} - -UMat* NonLinearLaplacian(UMat* IBlur, Mat B) { - UMat IErode, IDilate; - UMat* L = new UMat(IBlur->rows, IBlur->cols, IBlur->type()); - erode(*IBlur, IErode, B); - dilate(*IBlur, IDilate, B); - //*L = IErode.getMat(ACCESS_READ) + IDilate.getMat(ACCESS_READ) - (2*IBlur->getMat(ACCESS_READ)); - add(IErode, IDilate, IErode); - add(*IBlur, *IBlur, *IBlur); - subtract(IErode, *IBlur, *L); - IErode.release(); - IDilate.release(); - //UMat* L = new UMat(IBlur->rows, IBlur->cols, IBlur->type()); - //Laplacian(*IBlur, *L, -1); - return L; -} - -UMat* ZeroCrossings(UMat* I, Mat SE) { - UMat PErode, PDilate; - //Mat P = (*I >= 0); - UMat P; - compare(*I, 0, P, CMP_GE); - UMat* Z = new UMat(I->rows, I->cols, I->type()); - - erode(P, PErode, SE); - dilate(P, PDilate, SE); - - - //*Z = PDilate.getMat(ACCESS_READ)-PErode.getMat(ACCESS_READ); - subtract(PDilate, PErode, *Z); - P.release(); - PDilate.release(); - PErode.release(); - return Z; -} - -UMat* Gradient(UMat* I) { - - UMat dIx, dIy; - UMat* dI = new UMat(I->rows, I->cols, I->type()); - Sobel(*I, dIx, -1, 1, 0, 1); - Sobel(*I, dIy, -1, 0, 1, 1); - magnitude(dIx, dIy, *dI); - dIx.release(); - dIy.release(); - return dI; -} - -UMat* RejectZeroCrossings(UMat* dI, UMat* Z, float Threshold) { - double dI_max; - UMat* E = new UMat(Z->rows, Z->cols, Z->type()); - minMaxLoc(*dI, NULL, &dI_max); - //minMaxLoc(dI->getMat(ACCESS_READ), NULL, &dI_max); - UMat temp; - compare(*dI, Threshold*dI_max, temp, CMP_GT); - bitwise_and(*Z, temp, *E); - temp.release(); - //*E = Z->getMat(ACCESS_READ) & temp.getMat(ACCESS_READ); - return E; -} - -/* Edge Detect - * Returns edges of image I in binary matrix E - * Sn is the standard deviation of Gaussians needed for the filters - * theta is the threshold used to determine whether there is an edge - */ -UMat* EdgeDetect(UMat* I, float Sn, float Threshold, Mat B) { - - // Gaussian Smoothening - UMat* IBlur = GaussianSmoothening(I, Sn); - //cout << "Show GS\n"; - //imshow(window_name, *IBlur); - //waitKey(0); - - // Gradient Computation - UMat* dI = Gradient(I); - //cout << "Show G\n"; - //imshow(window_name, *dI); - //waitKey(0); - - I->release(); - // Non-Linear Laplacian Estimate - UMat* L = NonLinearLaplacian(IBlur, B); - //Mat* L = new Mat(IBlur->rows, IBlur->cols, IBlur->type()); - //Laplacian(*IBlur, *L, -1); - //cout << "Show L\n"; - //imshow(window_name, *L); - //waitKey(0); - IBlur->release(); - - // Find zero-crossings - UMat* Z = ZeroCrossings(L, B); - //cout << "Show ZC\n"; - //imshow(window_name, *Z); - //waitKey(0); - L->release(); - - // Reject Zero Crossings - UMat* E = RejectZeroCrossings(dI, Z, Threshold); - //cout << "Show RZC\n"; - //imshow(window_name, *E); - //waitKey(0); - Z->release(); - dI->release(); - return E; - //imshow(window_name, E); - //waitKey(0); -} - - -/** @function main */ -int main( int argc, char** argv ) -{ - cout << "Accelerating Using OpenCV" << CV_VERSION << "\n"; - ocl::setUseOpenCL(false); - cout << "OpenCL: " << ocl::useOpenCL() << "\n"; - /// Load an image - Mat src = imread( argv[1] ); - - if( !src.data ) - { - return -1; - } - - /// Convert the image to grayscale - cvtColor( src, src, CV_BGR2GRAY ); - - /// Create a window - namedWindow( window_name, CV_WINDOW_AUTOSIZE ); - - src.convertTo(src, CV_32F, 1.0/255.0); - Mat B = getStructuringElement(MORPH_CROSS, Size(3,3)); - //src.create(10000, 12000, CV_32FC1); - Timestamp start = get_time(TIMER_MT); - for(int i=0; i<NUM_FRAMES; i++) { - UMat srcUMat = src.getUMat(ACCESS_READ); - UMat* E = EdgeDetect(&srcUMat, 1.0, 0.1, B); - //imshow(window_name, *E); - //waitKey(0); - E->getMat(ACCESS_RW); - E->release(); - } - - Timestamp end = get_time(TIMER_MT); - cout << "Running time = " << (end-start * 1.0)/BILLION << " seconds\n"; - /// Wait until user exit program by pressing a key - //waitKey(0); - - return 0; -} diff --git a/hpvm/test/edge/PipelineEdgeDetect.cpp b/hpvm/test/edge/PipelineEdgeDetect.cpp deleted file mode 100644 index 309902196731c270f5cb02ae9c7fc1c91580735f..0000000000000000000000000000000000000000 --- a/hpvm/test/edge/PipelineEdgeDetect.cpp +++ /dev/null @@ -1,481 +0,0 @@ -#include <opencv2/opencv.hpp> -#include <opencv2/core/ocl.hpp> -#include <cstdlib> -#include <cstdio> -#include <math.h> -#include <iostream> -#include <condition_variable> -#include <mutex> -#include <thread> -#include <vector> -#include "time.h" - -#define NUM_FRAMES 200 -#ifdef DEBUG -#define DEBUG(X) X -#else -#define DEBUG(X) -#endif - -#define OPENCL false - -using namespace std; -using namespace cv; - -string window_name = "Edge Map"; - -// Circular Buffer class -template <class ElementType> -class CircularBuffer { -private: - int numElements; - int bufferSize; - int Head; - int Tail; - mutex mtx; - condition_variable cv; - vector<ElementType*> buffer; - string name; - -public: - CircularBuffer(int maxElements, string _name = "ANON_BUFFER") { - Head = 0; - Tail = 0; - numElements = 0; - bufferSize = maxElements+1; - name = _name; - buffer.reserve(bufferSize); - } - - bool push(ElementType* E, string caller); - ElementType* pop(string caller); - -}; - -template <class ElementType> -bool CircularBuffer<ElementType>::push(ElementType* E, string caller) { - unique_lock<mutex> lk(mtx); - if((Head +1) % bufferSize == Tail) { - cv.wait(lk); - } - buffer[Head] = E; - Head = (Head+1) % bufferSize; - numElements++; - DEBUG(cout << "[" << caller << "]: " << name << " Total Elements = " << numElements << "\n"); - lk.unlock(); - cv.notify_one(); - return true; -} - -template <class ElementType> -ElementType* CircularBuffer<ElementType>::pop(string caller) { - unique_lock<mutex> lk(mtx); - if(Tail == Head) { - //DEBUG(cout << "[C] Going to sleep ...\n"); - cv.wait(lk); - //DEBUG(cout << "[C] Waking up\n"); - } - ElementType* E = buffer[Tail]; - Tail = (Tail + 1) % bufferSize; - numElements--; - DEBUG(cout << "[" << caller << "]: " << name << " Total Elements = " << numElements << "\n"); - lk.unlock(); - cv.notify_one(); - return E; -} -// -------------------------------------------------------------------------- - - -// Read image from a file and convert it into gray sclae -Mat readImage(char* file) { - Mat src = imread( file ); - - if( !src.data ) - { - DEBUG(cout << "Error: Canot read input image " << file << "\n"); - exit(-1); - } - - /// Convert the image to grayscale - Mat src_gray; - cvtColor( src, src, CV_BGR2GRAY ); - src.convertTo(src, CV_32F, 1.0/255.0); - return src; -} - -// Gaussian Smoothening Node function -void GaussianSmoothening(float Sn, CircularBuffer<Mat>* in_I, CircularBuffer<Mat>* out_IBlur, float* time) { - Timestamp start, end; - start = get_time(TIMER_THREAD); - ocl::setUseOpenCL(OPENCL); - DEBUG(cout << "Gaussian Smoothening Starts\n"); - string id = "GS"; - int i = 0; - while(i < NUM_FRAMES) { - //waitKey(0); - Mat* I = in_I->pop(id); - DEBUG(cout << "[" << id << "]: Iteration " << i << "\n"); - int kernelSize = 2*ceil(3*Sn) + 1; - Mat* IBlur = new Mat(I->rows, I->cols, I->type()); - GaussianBlur(*I, *IBlur, Size(kernelSize, kernelSize), Sn); - out_IBlur->push(IBlur, id); - I->release(); - i++; - } - end = get_time(TIMER_THREAD); - *time = (end-start)*1.0/BILLION; - DEBUG(cout << "Gaussian Smoothening Done\n"); -} - -// Laplacian Node -void NonLinearLaplacian(Mat B, CircularBuffer<Mat>* in_IBlur, CircularBuffer<Mat>* out_L, float* time, float* ctime) { - Timestamp start, end, t1, t2, get=0, copy =0, ed=0, allocation=0, calc=0, copyB=0, sub=0, push=0, release=0; - - start = get_time(TIMER_THREAD); - ocl::setUseOpenCL(true); - DEBUG(cout << "Non-linear Laplacian Starts\n"); - string id = "L"; - int i = 0; - while(i < NUM_FRAMES) { - //waitKey(0); - - // Waiting for data - //t1 = get_time(TIMER_MT); - Mat* IBlur = in_IBlur->pop(id); - //t2 = get_time(TIMER_MT); - //get += t2-t1; - DEBUG(cout << "[" << id << "]: Iteration " << i << "\n"); - - // Copy to GPU - UMat IErode, IDilate; - //t1 = get_time(TIMER_MT); - UMat IBlurU = IBlur->getUMat(ACCESS_READ); - //t2 = get_time(TIMER_MT); - //copy += t2-t1; - - // Erode dilate calculation - // Non-Linear Laplacian Estimate - //t1 = get_time(TIMER_MT); - erode(IBlurU, IErode, B); - dilate(IBlurU, IDilate, B); - //t2 = get_time(TIMER_MT); - //ed += t2 -t1; - - // Allocation of new L matrix - //UMat* LU = new UMat(IBlur->rows, IBlur->cols, IBlur->type()); - //t1 = get_time(TIMER_MT); - Mat* L = new Mat(IBlur->rows, IBlur->cols, IBlur->type()); - //t2 = get_time(TIMER_MT); - //allocation += t2-t1; - //*L = IErode.getMat(ACCESS_READ) + IDilate.getMat(ACCESS_READ) - (2*(IBlur->getMat(ACCESS_READ))); - - // Add calculation on GPU - //t1 = get_time(TIMER_MT); - add(IErode, IDilate, IErode); - add(IBlurU, IBlurU, IBlurU); - //t2 = get_time(TIMER_MT); - //calc += t2-t1; - //subtract(IErode, IBlurU, *LU); - - // Copy back may be?? - //t1 = get_time(TIMER_MT); - UMat LU = L->getUMat(ACCESS_WRITE); - //t2 = get_time(TIMER_MT); - //copyB += t2 - t1; - - // Sub on GPU - //t1 = get_time(TIMER_MT); - subtract(IErode, IBlurU, LU); - //t2 = get_time(TIMER_MT); - //sub += t2 - t1; - - //Mat* L = new Mat(LU->getMat(ACCESS_READ)); - //UMat* L = new UMat(IBlur->rows, IBlur->cols, IBlur->type()); - //Laplacian(*IBlur, *L, -1); - // Pushing on buffer - //t1 = get_time(TIMER_MT); - out_L->push(L, id); - //t2 = get_time(TIMER_MT); - //push += t2 - t1; - - // Freeing memory - //t1 = get_time(TIMER_MT); - IErode.release(); - IDilate.release(); - IBlur->release(); - IBlurU.release(); - //t2 = get_time(TIMER_MT); - release += t2 - t1; - //LU->release(); - i++; - } - end = get_time(TIMER_THREAD); - *time = (end-start)*1.0/BILLION; - //*ctime = copy *1.0/BILLION; - //cout << "\t\t\tGET\t"<< get *1.0/BILLION << "\n"; - //cout << "\t\t\tCOPY\t"<< copy *1.0/BILLION << "\n"; - //cout << "\t\t\tED\t"<< ed *1.0/BILLION << "\n"; - //cout << "\t\t\tALLOC\t"<< allocation *1.0/BILLION << "\n"; - //cout << "\t\t\tCALC\t"<< calc *1.0/BILLION << "\n"; - //cout << "\t\t\tCOPY-B\t"<< copyB *1.0/BILLION << "\n"; - //cout << "\t\t\tSUB\t"<< sub *1.0/BILLION << "\n"; - //cout << "\t\t\tPUSH\t"<< push *1.0/BILLION << "\n"; - //cout << "\t\t\tFREE\t"<< release *1.0/BILLION << "\n"; - DEBUG(cout << "Non-linear Laplacian Done\n"); -} - -// Gradient Node -void Gradient(CircularBuffer<Mat>* in_I, CircularBuffer<Mat>* out_dI, float* time) { - Timestamp start, end; - start = get_time(TIMER_THREAD); - ocl::setUseOpenCL(OPENCL); - DEBUG(cout << "Gradient Starts\n"); - string id = "G"; - int i = 0; - while(i < NUM_FRAMES) { - //waitKey(0); - Mat* I = in_I->pop(id); - DEBUG(cout << "[" << id << "]: Iteration " << i << "\n"); - // Gradient Computation - //Mat dIx, dIy; - UMat dIx, dIy; - UMat IU = I->getUMat(ACCESS_READ); - Sobel(IU, dIx, -1, 1, 0, 1); - Sobel(IU, dIy, -1, 0, 1, 1); - Mat* dI = new Mat(I->rows, I->cols, I->type()); - magnitude(dIx, dIy, dI->getUMat(ACCESS_WRITE)); - out_dI->push(dI, id); - dIx.release(); - dIy.release(); - I->release(); - IU.release(); - i++; - } - end = get_time(TIMER_THREAD); - *time = (end-start)*1.0/BILLION; - DEBUG(cout << "Gradient Done\n"); -} - -// Find Zero Crossings Node -void ZeroCrossings(Mat SE, CircularBuffer<Mat>* in_L, CircularBuffer<Mat>* out_Z, float* time) { - Timestamp start, end; - start = get_time(TIMER_THREAD); - ocl::setUseOpenCL(OPENCL); - DEBUG(cout << "ZeroCrossings Start\n"); - string id = "ZC"; - int i = 0; - while(i < NUM_FRAMES) { - //waitKey(0); - Mat* L = in_L->pop(id); - DEBUG(cout << "[" << id << "]: Iteration " << i << "\n"); - - Mat PErode, PDilate; - Mat P; - compare(*L, 0, P, CMP_GE); - //Mat P = (*L >= 0); - erode(P, PErode, SE); - dilate(P, PDilate, SE); - //Mat* Z = new Mat(L->rows, L->cols, L->type()); - //*Z = PDilate.getMat(ACCESS_READ)-PErode.getMat(ACCESS_READ); - Mat* Z = new Mat(L->rows, L->cols, L->type()); - subtract(PDilate, PErode, *Z); - //*Z = PDilate.getMat(ACCESS_READ)-PErode.getMat(ACCESS_READ); - out_Z->push(Z, id); - L->release(); - P.release(); - PDilate.release(); - PErode.release(); - i++; - } - end = get_time(TIMER_THREAD); - *time = (end-start)*1.0/BILLION; - DEBUG(cout << "ZeroCrossings Done\n"); - -} - -// Reject Zero crossings Node -void RejectZeroCrossings(float Threshold, CircularBuffer<Mat>* in_dI, - CircularBuffer<Mat>* in_Z, CircularBuffer<Mat>* out_E, float* time) { - Timestamp start, end; - start = get_time(TIMER_THREAD); - ocl::setUseOpenCL(OPENCL); - // Reject Zero Crossings - DEBUG(cout << "Reject Zero Crossings Starts\n"); - string id = "RZC"; - int i = 0; - while(i < NUM_FRAMES) { - //waitKey(0); - Mat* dI = in_dI->pop(id); - Mat* Z = in_Z->pop(id); - DEBUG(cout << "[" << id << "]: Iteration " << i << "\n"); - double dI_max; - Mat* E = new Mat(Z->rows, Z->cols, Z->type()); - minMaxLoc(*dI, NULL, &dI_max); - - //*E = *Z & (dI->getMat(ACCESS_READ)> Threshold*dI_max); - Mat temp; - compare(*dI, Threshold*dI_max, temp, CMP_GT); - bitwise_and(*Z, temp, *E); - out_E->push(E, id); - dI->release(); - Z->release(); - temp.release(); - i++; - } - end = get_time(TIMER_THREAD); - *time = (end-start)*1.0/BILLION; - DEBUG(cout << "Reject Zero Crossings Done\n"); -} - -// Producer thread, feeds in the same image NUM_FRAMES times to Gaussian and -// Gradient node -void producer_thread(Mat* I, CircularBuffer<Mat>* out_I_Gaussian, CircularBuffer<Mat>* out_I_Gradient, float* time) { - Timestamp start, end; - start = get_time(TIMER_THREAD); - ocl::setUseOpenCL(OPENCL); - DEBUG(cout << "Producer Start\n"); - string id = "P"; - int i = 0; - //Mat UI = I->getMat(ACCESS_READ); - while(i < NUM_FRAMES) { - //waitKey(0); - DEBUG(cout << "[" << id << "]: Iteration " << i << "\n"); - Mat* I1 = new Mat(I->rows, I->cols, I->type()); - Mat* I2 = new Mat(I->rows, I->cols, I->type()); - I->copyTo(*I1); - I->copyTo(*I2); - out_I_Gaussian->push(I1, id); - out_I_Gradient->push(I2, id); - i++; - } - - //UI.release(); - I->release(); - end = get_time(TIMER_THREAD); - *time = (end-start)*1.0/BILLION; - DEBUG(cout << "Producer Done!\n"); -} - -// Consumer Node for displaying the image -void consumer_thread(CircularBuffer<Mat>* in_E, float* time) { - Timestamp start, end; - start = get_time(TIMER_THREAD); - ocl::setUseOpenCL(OPENCL); - DEBUG(cout << "Consumer Start\n"); - string id = "C"; - int i = 0; - while(i < NUM_FRAMES) { - //waitKey(0); - Mat* E = in_E->pop(id); - //E->getMat(ACCESS_RW); - DEBUG(cout << "[" << id << "]: Iteration " << i << "\n"); - //imshow(window_name, *E); - //waitKey(0); - E->release(); - i++; - } - end = get_time(TIMER_THREAD); - *time = (end-start)*1.0/BILLION; - DEBUG(cout << "Consumer Done!\n"); -} - - -int main(int argc, char* argv[]) { - int bufferSize = stoi(argv[1]); - int iterations = 1; - if(argc == 4) - iterations = stoi(argv[3]); - float Threshold = 0.1; - float Sn = 1.0; - - DEBUG(cout << "Accelerating Using OpenCV" << CV_VERSION << "\n"); - ocl::setUseOpenCL(OPENCL); - cout << "Use OpenCL: " << ocl::useOpenCL() << "\n"; - - /// Load an image - Timestamp start, end; - //start = get_time(TIMER_MT); - //end = get_time(TIMER_MT); - //cout << "Running time = " << (end-start * 1.0)/BILLION << " seconds\n"; - Mat B = getStructuringElement(MORPH_CROSS, Size(3,3)); - - namedWindow( window_name, CV_WINDOW_AUTOSIZE ); - DEBUG(cout << "Main Starts\n"); - - float tGS, tG, tL, tRZC, tZC, tP, tC, tMain, tCopy; - float aggtGS=0, aggtG=0, aggtL=0, aggtRZC=0, aggtZC=0, aggtP=0, aggtC=0, aggtMain=0; - for(int i = 0; i<iterations+1; i++) { - Mat src1 = readImage(argv[2]); - //cout << "Image Size = " << src1.rows << ", " << src1.cols << "\n"; - start = get_time(TIMER_MT); - // Producer - CircularBuffer<Mat> Producer_Gaussian_I(bufferSize, "Producer-->Gaussian-I"); - CircularBuffer<Mat> Producer_Gradient_I(bufferSize, "Producer-->Gradient-I"); - thread producer(producer_thread, &src1, &Producer_Gaussian_I, &Producer_Gradient_I, &tP); - - // Gaussian Smoothening Node - CircularBuffer<Mat> Gaussian_Laplacian_IBlur(bufferSize, "Gaussian-->Laplacian-IBlur"); - thread GaussianNode(GaussianSmoothening, Sn, &Producer_Gaussian_I, &Gaussian_Laplacian_IBlur, &tGS); - - // Gradient Node - CircularBuffer<Mat> Gradient_Reject_dI(bufferSize, "Gradient-->Reject-dI"); - thread GradientNode(Gradient, &Producer_Gradient_I, &Gradient_Reject_dI, &tG); - - // Laplacian Node - CircularBuffer<Mat> Laplacian_Zero_L(bufferSize, "Laplacian-->Zero-L"); - thread LaplacianNode(NonLinearLaplacian, B, &Gaussian_Laplacian_IBlur, &Laplacian_Zero_L, &tL, &tCopy); - - // Zero Crossings Node - CircularBuffer<Mat> Zero_Reject_Z(bufferSize, "Zero-->Reject-Z"); - thread ZeroCrossingsNode(ZeroCrossings, B, &Laplacian_Zero_L, &Zero_Reject_Z, &tZC); - - // Reject Zero Crossings Node - CircularBuffer<Mat> Reject_Consumer_E(bufferSize, "Reject-->Consumer-E"); - thread RejectZeroNode(RejectZeroCrossings, Threshold, &Gradient_Reject_dI, &Zero_Reject_Z, &Reject_Consumer_E, &tRZC); - - // Consumer - thread consumer(consumer_thread, &Reject_Consumer_E, &tC); - - // Wait for threads to finish - producer.join(); - GaussianNode.join(); - GradientNode.join(); - LaplacianNode.join(); - ZeroCrossingsNode.join(); - RejectZeroNode.join(); - consumer.join(); - - end = get_time(TIMER_MT); - tMain = (end-start*1.0)/BILLION; - // Skip first iteration numbers to avoid first iteration which sometimes - // get scheduled entirely on CPU - if(i > 0) { - aggtMain += tMain; - aggtGS += tGS; - aggtG += tG; - aggtL += tL; - aggtZC += tZC; - aggtRZC += tRZC; - aggtP += tP; - aggtC += tC; - } - //cout << i << ":\t(Main)\t" << tMain << "\n"; - //cout << i << ":\t\t(L)\t" << tL << "\n"; - //cout << i << ":\t\t\t(Copy)\t"<< tCopy << "\n"; - //cout << i << ":\t\t(G)\t" << tG << "\n"; - //cout << i << ":\t\t(GS)\t" << tGS << "\n"; - //cout << i << ":\t\t(ZC)\t" << tZC << "\n"; - } - cout << "Main Running time = " << aggtMain/iterations << " seconds\n"; - cout << "GS Running time = " << aggtGS/iterations << " seconds\n"; - cout << "G Running time = " << aggtG/iterations << " seconds\n"; - cout << "L Running time = " << aggtL/iterations << " seconds\n"; - cout << "ZC Running time = " << aggtZC/iterations << " seconds\n"; - cout << "RZC Running time = " << aggtRZC/iterations << " seconds\n"; - cout << "P Running time = " << aggtP/iterations << " seconds\n"; - cout << "C Running time = " << aggtC/iterations << " seconds\n"; - - cout << "Total Running time = " << (aggtGS+aggtG+aggtL+aggtZC+aggtRZC+aggtP+aggtC)/iterations << "\n"; - /// Create a window - return 0; -} diff --git a/hpvm/test/edge/edgetest_10.png b/hpvm/test/edge/edgetest_10.png deleted file mode 100644 index 2632a72305ed0bb69995ff1af2bd2fe1c2bdbc46..0000000000000000000000000000000000000000 Binary files a/hpvm/test/edge/edgetest_10.png and /dev/null differ diff --git a/hpvm/test/edge/house.png b/hpvm/test/edge/house.png deleted file mode 100644 index 6e38af9b6c77a21e23967b24444ad9262a3e6047..0000000000000000000000000000000000000000 Binary files a/hpvm/test/edge/house.png and /dev/null differ diff --git a/hpvm/test/edge/time.h b/hpvm/test/edge/time.h deleted file mode 100644 index 53b547e8d8c120e67dc8c30e1094e40d7a40a722..0000000000000000000000000000000000000000 --- a/hpvm/test/edge/time.h +++ /dev/null @@ -1,17 +0,0 @@ -#include <ctime> - -typedef unsigned long long Timestamp; /* time in microseconds */ -#define BILLION 1000000000LL - -#define TIMER_RT CLOCK_REALTIME -#define TIMER_MT CLOCK_MONOTONIC -#define TIMER_PROCESS CLOCK_PROCESS_CPUTIME_ID -#define TIMER_THREAD CLOCK_THREAD_CPUTIME_ID - -static Timestamp get_time(clockid_t timer) -{ - struct timespec tv; - clock_gettime(timer, &tv); - return (Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec); -} - diff --git a/hpvm/test/gemm_opencl/matrixMul/Makefile b/hpvm/test/gemm_opencl/matrixMul/Makefile deleted file mode 100644 index eb97b153334f5886a08780ef7b9a8ebbc8d05e7c..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -PASSES := - -.PHONY: clean - -LLVM_INSTALL:=/home/psrivas2/visc/llvm-install -LIBCLC:=/home/psrivas2/visc/libclc -HOST:=gemm_opencl -KERNELS:=kernel -LLVM_CC:=$(LLVM_INSTALL)/bin/clang -LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link - -all: $(KERNELS:%=%.nvptx.s) $(HOST:%=%.ll) $(HOST:%=%.bin) - -$(KERNELS:%=%.ll):%.ll:%.cl - $(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ - -$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll - $(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ - -$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc - $(LLVM_CC) -O3 -target nvptx $< -S -o $@ - -$(HOST:%=%.ll):%.ll:%.c - $(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@ - -$(HOST:%=%.bin):%.bin:%.c - $(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@ - -clean : - rm -f $(HOST).ll $(KERNELS).ll *.bc *.s *.bin diff --git a/hpvm/test/gemm_opencl/matrixMul/gemm_opencl.c b/hpvm/test/gemm_opencl/matrixMul/gemm_opencl.c deleted file mode 100644 index 92ee91ed7d718aa677cd598fc37586c037cd253c..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul/gemm_opencl.c +++ /dev/null @@ -1,323 +0,0 @@ -#include <stdlib.h> -#include <stdio.h> -#include <math.h> -#include <string.h> -#include <CL/cl.h> - -#define WA 1024 -#define HA 1024 -#define WB 1024 -#define HB WA -#define WC WB -#define HC HA - - - -// Thread block size -#define BLOCK_SIZE 16 - -static inline void checkErr(cl_int err, cl_int success, const char * name) { - if (err != success) { - fprintf(stderr, "ERROR: %s\nErrorcode: %d\n", name, err); - exit(EXIT_FAILURE); - } -} - -// Allocates a matrix with random float entries. -void randomInit(float* data, int size) { - for (int i = 0; i < size; ++i) - data[i] = rand() / (float)RAND_MAX; -} - -////////////////////////////////////////////////////////////////////////////// -//! Loads a Program file. -//! -//! @return the source string if succeeded, 0 otherwise -//! @param cFilename program filename -//! @param szFinalLength returned length of the code string -////////////////////////////////////////////////////////////////////////////// -char* LoadProgSource(const char* cFilename, size_t* szFinalLength) -{ - // locals - FILE* pFileStream = NULL; - size_t szSourceLength; - - // open the OpenCL source code file - #ifdef _WIN32 // Windows version - if(fopen_s(&pFileStream, cFilename, "rb") != 0) - { - return NULL; - } - #else // Linux version - pFileStream = fopen(cFilename, "rb"); - if(pFileStream == 0) - { - return NULL; - } - #endif - - // get the length of the source code - fseek(pFileStream, 0, SEEK_END); - szSourceLength = ftell(pFileStream); - fseek(pFileStream, 0, SEEK_SET); - - // allocate a buffer for the source code string and read it in - char* cSourceString = (char *)malloc(szSourceLength + 1); - if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) - { - fclose(pFileStream); - free(cSourceString); - return 0; - } - - // close the file and return the total length of the combined (preamble + source) string - fclose(pFileStream); - if(szFinalLength != 0) - { - *szFinalLength = szSourceLength; - } - cSourceString[szSourceLength] = '\0'; - - return cSourceString; -} - -// Check bool -int isEqual(float a, float b) { - return (fabs(a-b) < 0.001); -} - -// Check Results - -int checkResults(float* A, float* B, float* C) { - unsigned int size_A = WA * HA; - unsigned int size_B = WB * HB; - unsigned int size_C = WC * HC; - unsigned int bytesC = sizeof(float) * size_C; - float* goldC = (float*) malloc(bytesC); - for (int i=0; i < HC; i++) { - for (int j=0; j < WC; j++) { - goldC[i*WC + j] = 0; - for (int k=0; k < HB; k++) { - goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j]; - } - if(!isEqual(goldC[i*WC + j], C[i*WC + j])) { - printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]); - return 0; - } - } - } - return 1; // Success -} - -// GPU Computation of MatrixMul -void computeMatrixMul(float* h_A, unsigned bytes_A, float* h_B, unsigned bytes_B, float* h_C, unsigned bytes_C) { - // OpenCL specific variables - cl_context clGPUContext; - cl_command_queue clCommandQue; - cl_program clProgram; - cl_kernel clKernel; - - size_t dataBytes; - size_t kernelLength; - cl_int errcode; - - // OpenCL device memory for matrices - cl_mem d_A; - cl_mem d_B; - cl_mem d_C; - - /*****************************************/ - /* Initialize OpenCL */ - /*****************************************/ - // query the number of platforms - cl_uint numPlatforms; - errcode = clGetPlatformIDs(0, NULL, &numPlatforms); - checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms"); - - // now get all the platform IDs - cl_platform_id platforms[numPlatforms]; - errcode = clGetPlatformIDs(numPlatforms, platforms, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs"); - - for(unsigned i=0; i < numPlatforms; i++) { - char buffer[10240]; - //printf(" -- %d --\n", i); - clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL); - //printf(" PROFILE = %s\n", buffer); - clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL); - //printf(" VERSION = %s\n", buffer); - clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL); - //printf(" NAME = %s\n", buffer); - clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL); - //printf(" VENDOR = %s\n", buffer); - clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); - //printf(" EXTENSIONS = %s\n", buffer); - } - // set platform property - just pick the first one - cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, - (int) platforms[0], - 0}; - clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, - NULL, NULL, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create GPU context"); - - // get the list of GPU devices associated with context - errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, - NULL, &dataBytes); - cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); - errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, - clDevices, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to get context info"); - - //Create a command-queue - clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create command queue"); - - // Setup device memory - d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL, - &errcode); - d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_A, h_A, &errcode); - d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_B, h_B, &errcode); - - - // Load and build OpenCL kernel - /*char *clMatrixMul = LoadProgSource("matrixMul.cl", - "// My comment\n", - &kernelLength);*/ - //checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program"); - - /*clProgram = clCreateProgramWithSource(clGPUContext, 1, - (const char **)&clMatrixMul, - &kernelLength, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create program from source"); -*/ - size_t binaryLength; - char *clMatrixMul = LoadProgSource("matrixMul.nvptx.s", &binaryLength); - checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program Binary"); - - cl_int binaryStatus; - clProgram = clCreateProgramWithBinary(clGPUContext, 1, &clDevices[0], - &binaryLength, - (const unsigned char **)&clMatrixMul, - &binaryStatus, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); - - errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to build program"); - - clKernel = clCreateKernel(clProgram, "matrixMul", &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); - - - // Launch OpenCL kernel - size_t localWorkSize[2], globalWorkSize[2]; - - int wA = WA; - int wC = WC; - errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); - errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); - errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); - errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); - errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); - checkErr(errcode, CL_SUCCESS, "Failure to set kernel arguments"); - - localWorkSize[0] = BLOCK_SIZE; - localWorkSize[1] = BLOCK_SIZE; - globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - - errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, - globalWorkSize, localWorkSize, - 0, NULL, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); - - // Retrieve result from device - errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, - h_C, 0, NULL, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to read buffer"); - - // Print out the result -/* - printf("\n\nMatrix C (Result)\n"); - for(int i = 0; i < size_C; i++) { - printf("%f ", h_C[i]); - if(((i + 1) % WC) == 0) - printf("\n"); - } - printf("\n"); - */ - // Deallocate memory - - clReleaseMemObject(d_A); - clReleaseMemObject(d_C); - clReleaseMemObject(d_B); - - free(clDevices); - free(clMatrixMul); - clReleaseContext(clGPUContext); - clReleaseKernel(clKernel); - clReleaseProgram(clProgram); - clReleaseCommandQueue(clCommandQue); - -} - -// Main -int main(int argc, char** argv) { - - // seed for rand() - srand(2006); - - // Allocate host memory for matrices A and B - unsigned int size_A = WA * HA; - unsigned int bytes_A = sizeof(float) * size_A; - float* h_A = (float*) malloc(bytes_A); - - unsigned int size_B = WB * HB; - unsigned int bytes_B = sizeof(float) * size_B; - float* h_B = (float*) malloc(bytes_B); - - // Initialize host memory - randomInit(h_A, size_A); - randomInit(h_B, size_B); - -/* - // Print A and B - printf("\n\nMatrix A\n"); - for(int i = 0; i < size_A; i++) - { - printf("%f ", h_A[i]); - if(((i + 1) % WA) == 0) - printf("\n"); - } - - printf("\n\nMatrix B\n"); - for(int i = 0; i < size_B; i++) - { - printf("%f ", h_B[i]); - if(((i + 1) % WB) == 0) - printf("\n"); - } -*/ - - // Allocate host memory for the result matrix C - unsigned int size_C = WC * HC; - unsigned int bytes_C = sizeof(float) * size_C; - float* h_C = (float*) malloc(bytes_C); - - // Compute using OpenCL - computeMatrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C); - - if(checkResults(h_A, h_B, h_C)) - printf("\nPass!\n"); - else - printf("\nFailed!\n"); - printf("\nDone!\n"); - - // Deallocate memory - free(h_A); - free(h_B); - free(h_C); -} - diff --git a/hpvm/test/gemm_opencl/matrixMul/matrixMul.cl b/hpvm/test/gemm_opencl/matrixMul/matrixMul.cl deleted file mode 100644 index 7ca1d3e347e30b236bcd935cc06f5cfead1b68f0..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul/matrixMul.cl +++ /dev/null @@ -1,17 +0,0 @@ -// OpenCL Kernel for matrix multiply, C = A * B -__kernel void _Z9mysgemmNTPfS_(__global float* A, - size_t bytesA, - __global float* B, - size_t bytesB - ) { - - int tx = get_global_id(0); //2D Global Thread ID x - // Initialize accumulator - - // Perform dot-product of row-column - // Write in device memory - B[tx] = (A[tx] <= B[tx])? B[tx]:A[tx]+B[tx]; - /*B[tx] = A[tx];*/ - -} - diff --git a/hpvm/test/gemm_opencl/matrixMul/sgemm.c b/hpvm/test/gemm_opencl/matrixMul/sgemm.c deleted file mode 100644 index c1c3a300668b94f904393074bb92874715ac5e25..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul/sgemm.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Main entry of vector addition kernel - */ - -#include <stdio.h> -#include <stdlib.h> - -void matrixMultiply(float *C, float *A, float *B, int k, int n); - -/* -// Host matrix multiply -void matrixMulHost (int m, k, n, int *A, int *B, int *C) { - - for (int i = 0; i < m; i++) - for (int j = 0; j < n; j++) - for (int t = 0; t < k; t++) - C[i*n + j] = A[i*k + t] + B[t*n + j]; - - return; -} -*/ - -// Allocates a matrix with random float entries. -void randomInit(float* data, int size) { - for (int i = 0; i < size; ++i) - data[i] = rand() / (float)RAND_MAX; -} - -int main (int argc, char *argv[]) { - int m = atoi(argv[1]); - int k = atoi(argv[2]); - int n = atoi(argv[3]); - - float *A, *B, *C; - - /******************************************************************** - Allocate memory and initialize the input/output vectors - ********************************************************************/ - - A = (float *) malloc(m*k*sizeof(float)); - B = (float *) malloc(k*n*sizeof(float)); - C = (float *) malloc(m*n*sizeof(float)); - - randomInit(A, m*k); - randomInit(B, k*n); - - matrixMultiply(C, A, B, k, n); - - /******************************************************************** - Free memory allocations - ********************************************************************/ - - free(A); free(B); free(C); - - return 0; -} - diff --git a/hpvm/test/gemm_opencl/matrixMul/visc_gemm_opencl.ll b/hpvm/test/gemm_opencl/matrixMul/visc_gemm_opencl.ll deleted file mode 100644 index fe287e55ac68f3677e6a2eb528657f2b4e791672..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul/visc_gemm_opencl.ll +++ /dev/null @@ -1,448 +0,0 @@ -; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s -; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll -; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin -; RUN: %t.bin -; ModuleID = 'gemm_opencl.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@.str1 = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 -@str = private unnamed_addr constant [9 x i8] c"\0AFailed!\00" -@str26 = private unnamed_addr constant [7 x i8] c"\0ADone!\00" -@str27 = private unnamed_addr constant [7 x i8] c"\0APass!\00" - -; Function Attrs: nounwind uwtable -define void @randomInit(float* nocapture %data, i32 %size) #0 { -entry: - %cmp3 = icmp sgt i32 %size, 0 - br i1 %cmp3, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - %call = tail call i32 @rand() #4 - %conv = sitofp i32 %call to float - %div = fmul float %conv, 0x3E00000000000000 - %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv - store float %div, float* %arrayidx, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %size - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; Function Attrs: nounwind -declare i32 @rand() #1 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -; Function Attrs: nounwind readnone uwtable -define i32 @isEqual(float %a, float %b) #2 { -entry: - %sub = fsub float %a, %b - %fabsf = tail call float @fabsf(float %sub) #6 - %0 = fpext float %fabsf to double - %cmp = fcmp olt double %0, 1.000000e-03 - %conv1 = zext i1 %cmp to i32 - ret i32 %conv1 -} - -; Function Attrs: nounwind uwtable -define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #0 { -entry: - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry, %for.inc50 - %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ] - %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ] - %0 = shl nsw i64 %indvars.iv92, 10 - br label %for.body7 - -for.cond4: ; preds = %for.end - %inc48 = add nsw i32 %j.079, 1 - %1 = trunc i64 %indvars.iv.next89 to i32 - %cmp5 = icmp slt i32 %1, 1024 - br i1 %cmp5, label %for.body7, label %for.inc50 - -for.body7: ; preds = %for.cond4.preheader, %for.cond4 - %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ] - %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ] - %2 = add nsw i64 %indvars.iv88, %0 - br label %for.body12 - -for.body12: ; preds = %for.body12, %for.body7 - %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ] - %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ] - %4 = add nsw i64 %indvars.iv, %0 - %arrayidx16 = getelementptr inbounds float* %A, i64 %4 - %5 = load float* %arrayidx16, align 4, !tbaa !0 - %6 = shl i64 %indvars.iv, 10 - %7 = add nsw i64 %6, %indvars.iv88 - %arrayidx20 = getelementptr inbounds float* %B, i64 %7 - %8 = load float* %arrayidx20, align 4, !tbaa !0 - %mul21 = fmul float %5, %8 - %add26 = fadd float %3, %mul21 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1024 - br i1 %exitcond, label %for.end, label %for.body12 - -for.end: ; preds = %for.body12 - %arrayidx34 = getelementptr inbounds float* %C, i64 %2 - %9 = load float* %arrayidx34, align 4, !tbaa !0 - %sub.i = fsub float %add26, %9 - %fabsf.i = tail call float @fabsf(float %sub.i) #6 - %10 = fpext float %fabsf.i to double - %cmp.i = fcmp olt double %10, 1.000000e-03 - %indvars.iv.next89 = add i64 %indvars.iv88, 1 - br i1 %cmp.i, label %for.cond4, label %if.then - -if.then: ; preds = %for.end - %conv40 = fpext float %9 to double - %conv45 = fpext float %add26 to double - %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str1, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #4 - br label %return - -for.inc50: ; preds = %for.cond4 - %indvars.iv.next93 = add i64 %indvars.iv92, 1 - %inc51 = add nsw i32 %i.081, 1 - %11 = trunc i64 %indvars.iv.next93 to i32 - %cmp = icmp slt i32 %11, 1024 - br i1 %cmp, label %for.cond4.preheader, label %return - -return: ; preds = %for.inc50, %if.then - %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] - ret i32 %retval.0 -} - -; Function Attrs: nounwind -declare i32 @printf(i8* nocapture, ...) #1 - -; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #4 - -; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #4 - -; --------------- VISC Intrinsics --------------- -; Return Type of VISC Compute Matrix Mul -%rtype = type {float*, i32} -%struct.arg = type { float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype } - -; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode1D(i8*, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getNode() #0 - -; Function Attrs: nounwind -declare i8* @llvm.visc.getParentNode(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNumDims(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0 - -; Function Attrs: nounwind -declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0 - -; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32) - -; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32) -; ----------------- VISC intrinsics end ------------------ - -@.strce = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 -@stref = private unnamed_addr constant [17 x i8] c"Entered function\00" -@strrc = private unnamed_addr constant [16 x i8] c"Result computed\00" -@strrw = private unnamed_addr constant [20 x i8] c"Result written to C\00" -@stroa = private unnamed_addr constant [17 x i8] c"Output allocated\00" - -; Function Attrs: nounwind uwtable -define %rtype @matrixMul(float* nocapture %A, i32 %bytes_A, float* nocapture %B, i32 %bytes_B, float* %C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { -entry: - %puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @stref, i64 0, i64 0)) - - ; ------------------------- VISC changes ------------------ - ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x - ; Replaced statement -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #3 - %this_node = call i8* @llvm.visc.getNode() - %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) - - ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x - ; Replaced statement -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #3 - %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) - ; ---------------------- VISC changes End ------------------ - %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i32 %call1, i32 %call2) #3 - %cmp32 = icmp sgt i32 %k, 0 - br i1 %cmp32, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry - %mul = mul nsw i32 %call2, %k - %0 = sext i32 %mul to i64 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %res.034 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add9, %for.body ] - ;%calln = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i64 %indvars.iv, i32 %call2) #6 - %1 = add nsw i64 %indvars.iv, %0 - %arrayidx = getelementptr inbounds float* %A, i64 %1 - %2 = load float* %arrayidx, align 4, !tbaa !0 - %3 = trunc i64 %indvars.iv to i32 - %mul4 = mul nsw i32 %3, %n - %add5 = add nsw i32 %mul4, %call1 - %idxprom6 = sext i32 %add5 to i64 - %arrayidx7 = getelementptr inbounds float* %B, i64 %idxprom6 - %4 = load float* %arrayidx7, align 4, !tbaa !0 - %mul8 = fmul float %2, %4 - %add9 = fadd float %res.034, %mul8 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %k - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add9, %for.body ] - %puts29 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @strrc, i64 0, i64 0)) - %mul11 = mul nsw i32 %call2, %n - %add12 = add nsw i32 %mul11, %call1 - %idxprom13 = sext i32 %add12 to i64 - %arrayidx14 = getelementptr inbounds float* %C, i64 %idxprom13 - store float %res.0.lcssa, float* %arrayidx14, align 4, !tbaa !0 - %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @strrw, i64 0, i64 0)) - %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @stroa, i64 0, i64 0)) - %.fca.0.insert = insertvalue %rtype undef, float* %C, 0 - %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %bytes_C, 1 - ret %rtype %.fca.1.insert -} - -define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %WA, i32 %WB, i32 %HA) { - %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA) - ; Bind Inputs - call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A - call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A - call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B - call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B - call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C - call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C - call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k - call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n - call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m - ; Bind Outputs - call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); d_C - call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1); bytes_C - ret %rtype zeroinitializer -} - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { -entry: - tail call void @srand(i32 2006) #4 - %call = tail call noalias i8* @malloc(i64 4194304) #4 - %0 = bitcast i8* %call to float* - %call7 = tail call noalias i8* @malloc(i64 4194304) #4 - br label %for.body.i - -for.body.i: ; preds = %for.body.i, %entry - %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ] - %call.i = tail call i32 @rand() #4 - %conv.i = sitofp i32 %call.i to float - %div.i = fmul float %conv.i, 0x3E00000000000000 - %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i - store float %div.i, float* %arrayidx.i, align 4, !tbaa !0 - %indvars.iv.next.i = add i64 %indvars.iv.i, 1 - %lftr.wideiv59 = trunc i64 %indvars.iv.next.i to i32 - %exitcond60 = icmp eq i32 %lftr.wideiv59, 1048576 - br i1 %exitcond60, label %for.body.i40.preheader, label %for.body.i - -for.body.i40.preheader: ; preds = %for.body.i - %1 = bitcast i8* %call7 to float* - br label %for.body.i40 - -for.body.i40: ; preds = %for.body.i40.preheader, %for.body.i40 - %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ] - %call.i33 = tail call i32 @rand() #4 - %conv.i34 = sitofp i32 %call.i33 to float - %div.i35 = fmul float %conv.i34, 0x3E00000000000000 - %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32 - store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0 - %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1 - %lftr.wideiv57 = trunc i64 %indvars.iv.next.i37 to i32 - %exitcond58 = icmp eq i32 %lftr.wideiv57, 1048576 - br i1 %exitcond58, label %randomInit.exit41, label %for.body.i40 - -randomInit.exit41: ; preds = %for.body.i40 - %call12 = tail call noalias i8* @malloc(i64 4194304) #4 - %2 = bitcast i8* %call12 to float* - - ; ---------------------------------- Adding VISC Launch Call -------------------------------- - ; Setting up launch input args - %in.addr = alloca %struct.arg - - ; Store arguments - %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0 - %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1 - %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2 - %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3 - %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4 - %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5 - %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6 - %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7 - %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8 - - store float* %0, float** %in.addr.h_A - store i32 4194304, i32* %in.addr.bytes_A - store float* %1, float** %in.addr.h_B - store i32 4194304, i32* %in.addr.bytes_B - store float* %2, float** %in.addr.h_C - store i32 4194304, i32* %in.addr.bytes_C - store i32 1024, i32* %in.addr.WA - store i32 1024, i32* %in.addr.WB - store i32 1024, i32* %in.addr.HA - - ; Change type to i8* and VISC Launch call - %args = bitcast %struct.arg* %in.addr to i8* - %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args) - ;tail call void @computeMatrixMul(float* %0, i32 4194304, float* %1, i32 4194304, float* %2, i32 4194304) - - ; Wait for result - call void @llvm.visc.wait(i8* %graphID) - - ; Get the result - %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9 - %out = load %rtype* %out.addr - %out.h_C = extractvalue %rtype %out, 0 - ;%2 = extractvalue %rtype %out, 0 - %out.bytes_C = extractvalue %rtype %out, 1 - - ; -------------------------------- Completed VISC Launch Call -------------------------------- - - br label %for.cond4.preheader.i - -for.cond4.preheader.i: ; preds = %for.inc50.i, %randomInit.exit41 - %indvars.iv92.i = phi i64 [ 0, %randomInit.exit41 ], [ %indvars.iv.next93.i, %for.inc50.i ] - %i.081.i = phi i32 [ 0, %randomInit.exit41 ], [ %inc51.i, %for.inc50.i ] - %3 = shl nsw i64 %indvars.iv92.i, 10 - br label %for.body7.i - -for.cond4.i: ; preds = %for.end.i - %inc48.i = add nsw i32 %j.079.i, 1 - %4 = trunc i64 %indvars.iv.next89.i to i32 - %cmp5.i = icmp slt i32 %4, 1024 - br i1 %cmp5.i, label %for.body7.i, label %for.inc50.i - -for.body7.i: ; preds = %for.cond4.i, %for.cond4.preheader.i - %indvars.iv88.i = phi i64 [ 0, %for.cond4.preheader.i ], [ %indvars.iv.next89.i, %for.cond4.i ] - %j.079.i = phi i32 [ 0, %for.cond4.preheader.i ], [ %inc48.i, %for.cond4.i ] - br label %for.body12.i - -for.body12.i: ; preds = %for.body12.i, %for.body7.i - %indvars.iv.i42 = phi i64 [ 0, %for.body7.i ], [ %indvars.iv.next.i43, %for.body12.i ] - %5 = phi float [ 0.000000e+00, %for.body7.i ], [ %add26.i, %for.body12.i ] - %6 = add nsw i64 %indvars.iv.i42, %3 - %arrayidx16.i = getelementptr inbounds float* %0, i64 %6 - %7 = load float* %arrayidx16.i, align 4, !tbaa !0 - %8 = shl i64 %indvars.iv.i42, 10 - %9 = add nsw i64 %8, %indvars.iv88.i - %arrayidx20.i = getelementptr inbounds float* %1, i64 %9 - %10 = load float* %arrayidx20.i, align 4, !tbaa !0 - %mul21.i = fmul float %7, %10 - %add26.i = fadd float %5, %mul21.i - %indvars.iv.next.i43 = add i64 %indvars.iv.i42, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next.i43 to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1024 - br i1 %exitcond, label %for.end.i, label %for.body12.i - -for.end.i: ; preds = %for.body12.i - %11 = add nsw i64 %indvars.iv88.i, %3 - ; Replace use of %2 with %out.h_C - ;%arrayidx34.i = getelementptr inbounds float* %2, i64 %11 - %arrayidx34.i = getelementptr inbounds float* %out.h_C, i64 %11 - %12 = load float* %arrayidx34.i, align 4, !tbaa !0 - %sub.i.i = fsub float %add26.i, %12 - %fabsf.i.i = tail call float @fabsf(float %sub.i.i) #6 - %13 = fpext float %fabsf.i.i to double - %cmp.i.i = fcmp olt double %13, 1.000000e-03 - %indvars.iv.next89.i = add i64 %indvars.iv88.i, 1 - br i1 %cmp.i.i, label %for.cond4.i, label %if.else - -for.inc50.i: ; preds = %for.cond4.i - %indvars.iv.next93.i = add i64 %indvars.iv92.i, 1 - %inc51.i = add nsw i32 %i.081.i, 1 - %14 = trunc i64 %indvars.iv.next93.i to i32 - %cmp.i = icmp slt i32 %14, 1024 - br i1 %cmp.i, label %for.cond4.preheader.i, label %if.then - -if.then: ; preds = %for.inc50.i - %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str27, i64 0, i64 0)) - br label %if.end - -if.else: ; preds = %for.end.i - %conv40.i = fpext float %12 to double - %conv45.i = fpext float %add26.i to double - %call46.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str1, i64 0, i64 0), i32 %i.081.i, i32 %j.079.i, double %conv40.i, double %conv45.i) #4 - %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str, i64 0, i64 0)) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str26, i64 0, i64 0)) - tail call void @free(i8* %call) #4 - tail call void @free(i8* %call7) #4 - tail call void @free(i8* %call12) #4 - ret i32 0 -} - - -; Function Attrs: nounwind -declare void @srand(i32) #1 - -; Function Attrs: noreturn nounwind -declare void @exit(i32) #5 - -declare float @fabsf(float) - -; Function Attrs: nounwind -declare i32 @puts(i8* nocapture) #4 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } -attributes #5 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #7 = { noreturn nounwind } - -!0 = metadata !{metadata !"float", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} -!3 = metadata !{metadata !"long", metadata !1} -!4 = metadata !{metadata !"int", metadata !1} -!5 = metadata !{metadata !"any pointer", metadata !1} diff --git a/hpvm/test/gemm_opencl/matrixMul_bc/Makefile b/hpvm/test/gemm_opencl/matrixMul_bc/Makefile deleted file mode 100644 index 1984e14c78f3fabd3f2b92a98b81919dbaf2b979..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_bc/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -PASSES := - -.PHONY: clean - -LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install -LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc -HOST:=sgemm -KERNELS:=matrixMul_bc -LLVM_CC:=$(LLVM_INSTALL)/bin/clang -LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link - -all: $(KERNELS:%=%.ll) $(HOST:%=%.ll) - -$(KERNELS:%=%.ll):%.ll:%.cl - $(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ - -$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll - $(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ - -$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc - $(LLVM_CC) -target nvptx $< -S -o $@ - -$(HOST:%=%.ll):%.ll:%.c - $(LLVM_CC) -O3 -S -emit-llvm $< -o $@ - -clean : - rm -f *.ll *.bc *.s diff --git a/hpvm/test/gemm_opencl/matrixMul_bc/gemm_opencl.c b/hpvm/test/gemm_opencl/matrixMul_bc/gemm_opencl.c deleted file mode 100644 index 31cd7502ea360592ea845c9705de2568f35d6de9..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_bc/gemm_opencl.c +++ /dev/null @@ -1,192 +0,0 @@ -#include <stdlib.h> -#include <stdio.h> -#include <math.h> -//#include <oclUtils.h> -#include <CL/cl.h> - -#define WA 1024 -#define HA 1024 -#define WB 1024 -#define HB WA -#define WC WB -#define HC HA - -// Thread block size -#define BLOCK_SIZE 16 - -inline void checkErr(cl_int err, cl_int success, const char * name) { - if (err != success) { - fprintf(stderr, "ERROR: %s\n", name); - exit(EXIT_FAILURE); - } -} - -// Allocates a matrix with random float entries. -void randomInit(float* data, int size) { - for (int i = 0; i < size; ++i) - data[i] = rand() / (float)RAND_MAX; -} - -// Main -int main(int argc, char** argv) { - - // seed for rand() - srand(2006); - - // Allocate host memory for matrices A and B - unsigned int size_A = WA * HA; - unsigned int bytes_A = sizeof(float) * size_A; - float* h_A = (float*) malloc(bytes_A); - - unsigned int size_B = WB * HB; - unsigned int bytes_B = sizeof(float) * size_B; - float* h_B = (float*) malloc(bytes_B); - - // Initialize host memory - randomInit(h_A, size_A); - randomInit(h_B, size_B); - -/* - // Print A and B - printf("\n\nMatrix A\n"); - for(int i = 0; i < size_A; i++) - { - printf("%f ", h_A[i]); - if(((i + 1) % WA) == 0) - printf("\n"); - } - - printf("\n\nMatrix B\n"); - for(int i = 0; i < size_B; i++) - { - printf("%f ", h_B[i]); - if(((i + 1) % WB) == 0) - printf("\n"); - } -*/ - - // Allocate host memory for the result matrix C - unsigned int size_C = WC * HC; - unsigned int bytes_C = sizeof(float) * size_C; - float* h_C = (float*) malloc(bytes_C); - - // Initialize OpenCL - - // OpenCL specific variables - cl_context clGPUContext; - cl_command_queue clCommandQue; - cl_program clProgram; - cl_kernel clKernel; - - size_t dataBytes; - size_t kernelLength; - cl_int errcode; - - // OpenCL device memory for matrices - cl_mem d_A; - cl_mem d_B; - cl_mem d_C; - - /*****************************************/ - /* Initialize OpenCL */ - /*****************************************/ - clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, - NULL, NULL, &errcode); - shrCheckError(errcode, CL_SUCCESS); - - // get the list of GPU devices associated with context - errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, - NULL, &dataBytes); - cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); - errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, - clDevices, NULL); - shrCheckError(errcode, CL_SUCCESS); - - //Create a command-queue - clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode); - shrCheckError(errcode, CL_SUCCESS); - - // Setup device memory - d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL, - &errcode); - d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_A, h_A, &errcode); - d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_B, h_B, &errcode); - - - // Load and build OpenCL kernel - char *clMatrixMul = oclLoadProgSource("kernel.cl", - "// My comment\n", - &kernelLength); - shrCheckError(clMatrixMul != NULL, CL_SUCCESS); - - clProgram = clCreateProgramWithSource(clGPUContext, 1, - (const char **)&clMatrixMul, - &kernelLength, &errcode); - shrCheckError(errcode, CL_SUCCESS); - - errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - - clKernel = clCreateKernel(clProgram, "matrixMul", &errcode); - shrCheckError(errcode, CL_SUCCESS); - - - // Launch OpenCL kernel - size_t localWorkSize[2], globalWorkSize[2]; - - int wA = WA; - int wC = WC; - errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); - errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); - errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); - errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); - errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); - shrCheckError(errcode, CL_SUCCESS); - - localWorkSize[0] = BLOCK_SIZE; - localWorkSize[1] = BLOCK_SIZE; - globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - - errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, - globalWorkSize, localWorkSize, - 0, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - - // Retrieve result from device - errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, - h_C, 0, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - - // Print out the result -/* - printf("\n\nMatrix C (Result)\n"); - for(int i = 0; i < size_C; i++) { - printf("%f ", h_C[i]); - if(((i + 1) % WC) == 0) - printf("\n"); - } - printf("\n"); - */ - printf("\nDone!\n"); - - // Deallocate memory - free(h_A); - free(h_B); - free(h_C); - - clReleaseMemObject(d_A); - clReleaseMemObject(d_C); - clReleaseMemObject(d_B); - - free(clDevices); - free(clMatrixMul); - clReleaseContext(clGPUContext); - clReleaseKernel(clKernel); - clReleaseProgram(clProgram); - clReleaseCommandQueue(clCommandQue); - -} - diff --git a/hpvm/test/gemm_opencl/matrixMul_bc/matrixMul_bc.cl b/hpvm/test/gemm_opencl/matrixMul_bc/matrixMul_bc.cl deleted file mode 100644 index 64c52ff5b8d5afd7c35a73cfca3a6587565810cd..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_bc/matrixMul_bc.cl +++ /dev/null @@ -1,30 +0,0 @@ -// Thread block size -#define TILE_SIZE 16 - -// OpenCL Kernel for matrix multiply, C = A * B -// with boundary conditions -__kernel void matrixMul(__global float* C, - __global float* A, - __global float* B, - int m, - int k, - int n) { - - int tx = get_global_id(0); //2D Global Thread ID x - int ty = get_global_id(1); //2D Global Thread ID y - - if ((tx < n) && (ty < m)) { - // Initialize accumulator - float res = 0.0f; - - // Perform dot-product of row-column - for (int i = 0; i < k; i++) { - res += A[ty*k+i] * B[i*n+tx]; - } - - // Write in device memory - C[ty*n+tx] = res; - } - -} - diff --git a/hpvm/test/gemm_opencl/matrixMul_bc/sgemm.c b/hpvm/test/gemm_opencl/matrixMul_bc/sgemm.c deleted file mode 100644 index 1cf29a212c7eddab4f0e7c82da832f1a3f589e6a..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_bc/sgemm.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Main entry of vector addition kernel - */ - -#include <stdio.h> -#include <stdlib.h> - -void matrixMultiply(float *C, float *A, float *B, int m, int k, int n); - -/* -// Host matrix multiply -void matrixMulHost (int m, k, n, int *A, int *B, int *C) { - - for (int i = 0; i < m; i++) - for (int j = 0; j < n; j++) - for (int t = 0; t < k; t++) - C[i*n + j] = A[i*k + t] + B[t*n + j]; - - return; -} -*/ - -// Allocates a matrix with random float entries. -void randomInit(float* data, int size) { - for (int i = 0; i < size; ++i) - data[i] = rand() / (float)RAND_MAX; -} - -int main (int argc, char *argv[]) { - int m = atoi(argv[1]); - int k = atoi(argv[2]); - int n = atoi(argv[3]); - - float *A, *B, *C; - - /******************************************************************** - Allocate memory and initialize the input/output vectors - ********************************************************************/ - - A = (float *) malloc(m*k*sizeof(float)); - B = (float *) malloc(k*n*sizeof(float)); - C = (float *) malloc(m*n*sizeof(float)); - - randomInit(A, m*k); - randomInit(B, k*n); - - matrixMultiply(C, A, B, m, k, n); - - /******************************************************************** - Free memory allocations - ********************************************************************/ - - free(A); free(B); free(C); - - return 0; -} - diff --git a/hpvm/test/gemm_opencl/matrixMul_sh/Makefile b/hpvm/test/gemm_opencl/matrixMul_sh/Makefile deleted file mode 100644 index 109208976556eaa52a56eb733cee3cfd13e245eb..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_sh/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -PASSES := - -.PHONY: clean - -LIBCLC:=/home/kotsifa2/llvm/libclc -HOST:=sgemm -KERNELS:=matrixMul_sh - -all: $(KERNELS:%=%.ll) $(HOST:%=%.ll) - -$(KERNELS:%=%.ll):%.ll:%.cl - clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ - -$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll - llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ - -$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc - clang -target nvptx $< -S -o $@ - -$(HOST:%=%.ll):%.ll:%.c - clang -O3 -S -emit-llvm $< -o $@ - -clean : - rm -f *.ll *.bc *.s diff --git a/hpvm/test/gemm_opencl/matrixMul_sh/gemm_opencl.c b/hpvm/test/gemm_opencl/matrixMul_sh/gemm_opencl.c deleted file mode 100644 index 2c41a20814f82473802e1efc54e82dbad59ee02d..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_sh/gemm_opencl.c +++ /dev/null @@ -1,193 +0,0 @@ -#include <stdlib.h> -#include <stdio.h> -#include <math.h> -#include <oclUtils.h> - -#define WA 1024 -#define HA 1024 -#define WB 1024 -#define HB WA -#define WC WB -#define HC HA - -// Thread block size -#define BLOCK_SIZE 16 - -inline void checkErr(cl_int err, cl_int success, const char * name) { - if (err != success) { - fprintf(stderr, "ERROR: %s\n", name); - exit(EXIT_FAILURE); - } -} - -// Allocates a matrix with random float entries. -void randomInit(float* data, int size) { - for (int i = 0; i < size; ++i) - data[i] = rand() / (float)RAND_MAX; -} - -// Main -int main(int argc, char** argv) { - - // seed for rand() - srand(2006); - - // Allocate host memory for matrices A and B - unsigned int size_A = WA * HA; - unsigned int bytes_A = sizeof(float) * size_A; - float* h_A = (float*) malloc(bytes_A); - - unsigned int size_B = WB * HB; - unsigned int bytes_B = sizeof(float) * size_B; - float* h_B = (float*) malloc(bytes_B); - - // Initialize host memory - randomInit(h_A, size_A); - randomInit(h_B, size_B); - -/* - // Print A and B - printf("\n\nMatrix A\n"); - for(int i = 0; i < size_A; i++) - { - printf("%f ", h_A[i]); - if(((i + 1) % WA) == 0) - printf("\n"); - } - - printf("\n\nMatrix B\n"); - for(int i = 0; i < size_B; i++) - { - printf("%f ", h_B[i]); - if(((i + 1) % WB) == 0) - printf("\n"); - } -*/ - - // Allocate host memory for the result matrix C - unsigned int size_C = WC * HC; - unsigned int bytes_C = sizeof(float) * size_C; - float* h_C = (float*) malloc(bytes_C); - - // Initialize OpenCL - - // OpenCL specific variables - cl_context clGPUContext; - cl_command_queue clCommandQue; - cl_program clProgram; - cl_kernel clKernel; - - size_t dataBytes; - size_t kernelLength; - cl_int errcode; - - // OpenCL device memory for matrices - cl_mem d_A; - cl_mem d_B; - cl_mem d_C; - - /*****************************************/ - /* Initialize OpenCL */ - /*****************************************/ - clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, - NULL, NULL, &errcode); - shrCheckError(errcode, CL_SUCCESS); - - // get the list of GPU devices associated with context - errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, - NULL, &dataBytes); - cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); - errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, - clDevices, NULL); - shrCheckError(errcode, CL_SUCCESS); - - //Create a command-queue - clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode); - shrCheckError(errcode, CL_SUCCESS); - - // Setup device memory - d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL, - &errcode); - d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_A, h_A, &errcode); - d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_B, h_B, &errcode); - - - // Load and build OpenCL kernel - char *clMatrixMul = oclLoadProgSource("kernel.cl", - "// My comment\n", - &kernelLength); - shrCheckError(clMatrixMul != NULL, shrTRUE); - - clProgram = clCreateProgramWithSource(clGPUContext, 1, - (const char **)&clMatrixMul, - &kernelLength, &errcode); - shrCheckError(errcode, CL_SUCCESS); - - errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - - clKernel = clCreateKernel(clProgram, "matrixMul", &errcode); - shrCheckError(errcode, CL_SUCCESS); - - - // Launch OpenCL kernel - size_t localWorkSize[2], globalWorkSize[2]; - - int hA = HA; - int wA = WA; - int wC = WC; - errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); - errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); - errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); - errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&hA); - errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wA); - errcode |= clSetKernelArg(clKernel, 5, sizeof(int), (void *)&wC); - shrCheckError(errcode, CL_SUCCESS); - - localWorkSize[0] = BLOCK_SIZE; - localWorkSize[1] = BLOCK_SIZE; - globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - - errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, - globalWorkSize, localWorkSize, - 0, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - - // Retrieve result from device - errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, - h_C, 0, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - - // Print out the result -/* - printf("\n\nMatrix C (Result)\n"); - for(int i = 0; i < size_C; i++) { - printf("%f ", h_C[i]); - if(((i + 1) % WC) == 0) - printf("\n"); - } - printf("\n"); - */ - printf("\nDone!\n"); - - // Deallocate memory - free(h_A); - free(h_B); - free(h_C); - - clReleaseMemObject(d_A); - clReleaseMemObject(d_C); - clReleaseMemObject(d_B); - - free(clDevices); - free(clMatrixMul); - clReleaseContext(clGPUContext); - clReleaseKernel(clKernel); - clReleaseProgram(clProgram); - clReleaseCommandQueue(clCommandQue); - -} - diff --git a/hpvm/test/gemm_opencl/matrixMul_sh/matrixMul_sh.cl b/hpvm/test/gemm_opencl/matrixMul_sh/matrixMul_sh.cl deleted file mode 100644 index db89aba5a37b87af73d33560a120b125f9f1a921..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_sh/matrixMul_sh.cl +++ /dev/null @@ -1,54 +0,0 @@ -// Tile size -#define TILE_SIZE 16 - -// OpenCL Kernel for matrix multiply, C = A * B -// using shared memory -__kernel void matrixMul_sh(__global float* C, - __global float* A, - __global float* B, - int k, - int n) { - - int tx = get_local_id(0); //2D Local Thread ID x - int ty = get_local_id(1); //2D Local Thread ID y - - int bx = get_group_id(0); //2D Block ID x - int by = get_group_id(1); //2D Block ID y - - int col = bx * TILE_SIZE + tx; - int row = by * TILE_SIZE + ty; - - // Static work-group (thread block) local (shared) memory allocations - __local float A_s[TILE_SIZE][TILE_SIZE]; - __local float B_s[TILE_SIZE][TILE_SIZE]; - - // Initialize accumulator - float res = 0.0f; - - int i,l; - - for (l = 0; l < k/TILE_SIZE; l++) { - // Transfer tiles of A and B to local (shared) memory - A_s[ty][tx] = A[row*k + TILE_SIZE*l + tx]; - B_s[ty][tx] = B[(TILE_SIZE*l + ty)*n + col]; - - // Barrier to synchronize all threads - barrier(CLK_LOCAL_MEM_FENCE); - // Now the local submatricies A_s and B_s are valid - - /* Multiply the two submatrices. Each thread computes one element of the * - * block submatrix */ - for (i = 0; i < TILE_SIZE; i++) - res += A_s[ty][i]*B_s[i][tx]; - - /* Barrier: calculations must be completed before next memory transfer * - * can start */ - barrier(CLK_LOCAL_MEM_FENCE); - - } - - // Write in device memory - C[row * n + col] = res; - -} - diff --git a/hpvm/test/gemm_opencl/matrixMul_sh/sgemm.c b/hpvm/test/gemm_opencl/matrixMul_sh/sgemm.c deleted file mode 100644 index c1c3a300668b94f904393074bb92874715ac5e25..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_sh/sgemm.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Main entry of vector addition kernel - */ - -#include <stdio.h> -#include <stdlib.h> - -void matrixMultiply(float *C, float *A, float *B, int k, int n); - -/* -// Host matrix multiply -void matrixMulHost (int m, k, n, int *A, int *B, int *C) { - - for (int i = 0; i < m; i++) - for (int j = 0; j < n; j++) - for (int t = 0; t < k; t++) - C[i*n + j] = A[i*k + t] + B[t*n + j]; - - return; -} -*/ - -// Allocates a matrix with random float entries. -void randomInit(float* data, int size) { - for (int i = 0; i < size; ++i) - data[i] = rand() / (float)RAND_MAX; -} - -int main (int argc, char *argv[]) { - int m = atoi(argv[1]); - int k = atoi(argv[2]); - int n = atoi(argv[3]); - - float *A, *B, *C; - - /******************************************************************** - Allocate memory and initialize the input/output vectors - ********************************************************************/ - - A = (float *) malloc(m*k*sizeof(float)); - B = (float *) malloc(k*n*sizeof(float)); - C = (float *) malloc(m*n*sizeof(float)); - - randomInit(A, m*k); - randomInit(B, k*n); - - matrixMultiply(C, A, B, k, n); - - /******************************************************************** - Free memory allocations - ********************************************************************/ - - free(A); free(B); free(C); - - return 0; -} - diff --git a/hpvm/test/gemm_opencl/matrixMul_sh_bc/Makefile b/hpvm/test/gemm_opencl/matrixMul_sh_bc/Makefile deleted file mode 100644 index 89793871e5966a0cd45ea25fe9a18d4941b8ef65..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_sh_bc/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -PASSES := - -.PHONY: clean - -LIBCLC:=/home/kotsifa2/llvm/libclc -HOST:=sgemm -KERNELS:=matrixMul_sh_bc - -all: $(KERNELS:%=%.ll) $(HOST:%=%.ll) - -$(KERNELS:%=%.ll):%.ll:%.cl - clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ - -$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll - llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ - -$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc - clang -target nvptx $< -S -o $@ - -$(HOST:%=%.ll):%.ll:%.c - clang -O3 -S -emit-llvm $< -o $@ - -clean : - rm -f *.ll *.bc *.s diff --git a/hpvm/test/gemm_opencl/matrixMul_sh_bc/gemm_opencl.c b/hpvm/test/gemm_opencl/matrixMul_sh_bc/gemm_opencl.c deleted file mode 100644 index 2c41a20814f82473802e1efc54e82dbad59ee02d..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_sh_bc/gemm_opencl.c +++ /dev/null @@ -1,193 +0,0 @@ -#include <stdlib.h> -#include <stdio.h> -#include <math.h> -#include <oclUtils.h> - -#define WA 1024 -#define HA 1024 -#define WB 1024 -#define HB WA -#define WC WB -#define HC HA - -// Thread block size -#define BLOCK_SIZE 16 - -inline void checkErr(cl_int err, cl_int success, const char * name) { - if (err != success) { - fprintf(stderr, "ERROR: %s\n", name); - exit(EXIT_FAILURE); - } -} - -// Allocates a matrix with random float entries. -void randomInit(float* data, int size) { - for (int i = 0; i < size; ++i) - data[i] = rand() / (float)RAND_MAX; -} - -// Main -int main(int argc, char** argv) { - - // seed for rand() - srand(2006); - - // Allocate host memory for matrices A and B - unsigned int size_A = WA * HA; - unsigned int bytes_A = sizeof(float) * size_A; - float* h_A = (float*) malloc(bytes_A); - - unsigned int size_B = WB * HB; - unsigned int bytes_B = sizeof(float) * size_B; - float* h_B = (float*) malloc(bytes_B); - - // Initialize host memory - randomInit(h_A, size_A); - randomInit(h_B, size_B); - -/* - // Print A and B - printf("\n\nMatrix A\n"); - for(int i = 0; i < size_A; i++) - { - printf("%f ", h_A[i]); - if(((i + 1) % WA) == 0) - printf("\n"); - } - - printf("\n\nMatrix B\n"); - for(int i = 0; i < size_B; i++) - { - printf("%f ", h_B[i]); - if(((i + 1) % WB) == 0) - printf("\n"); - } -*/ - - // Allocate host memory for the result matrix C - unsigned int size_C = WC * HC; - unsigned int bytes_C = sizeof(float) * size_C; - float* h_C = (float*) malloc(bytes_C); - - // Initialize OpenCL - - // OpenCL specific variables - cl_context clGPUContext; - cl_command_queue clCommandQue; - cl_program clProgram; - cl_kernel clKernel; - - size_t dataBytes; - size_t kernelLength; - cl_int errcode; - - // OpenCL device memory for matrices - cl_mem d_A; - cl_mem d_B; - cl_mem d_C; - - /*****************************************/ - /* Initialize OpenCL */ - /*****************************************/ - clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, - NULL, NULL, &errcode); - shrCheckError(errcode, CL_SUCCESS); - - // get the list of GPU devices associated with context - errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, - NULL, &dataBytes); - cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); - errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, - clDevices, NULL); - shrCheckError(errcode, CL_SUCCESS); - - //Create a command-queue - clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode); - shrCheckError(errcode, CL_SUCCESS); - - // Setup device memory - d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL, - &errcode); - d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_A, h_A, &errcode); - d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_B, h_B, &errcode); - - - // Load and build OpenCL kernel - char *clMatrixMul = oclLoadProgSource("kernel.cl", - "// My comment\n", - &kernelLength); - shrCheckError(clMatrixMul != NULL, shrTRUE); - - clProgram = clCreateProgramWithSource(clGPUContext, 1, - (const char **)&clMatrixMul, - &kernelLength, &errcode); - shrCheckError(errcode, CL_SUCCESS); - - errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - - clKernel = clCreateKernel(clProgram, "matrixMul", &errcode); - shrCheckError(errcode, CL_SUCCESS); - - - // Launch OpenCL kernel - size_t localWorkSize[2], globalWorkSize[2]; - - int hA = HA; - int wA = WA; - int wC = WC; - errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); - errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); - errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); - errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&hA); - errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wA); - errcode |= clSetKernelArg(clKernel, 5, sizeof(int), (void *)&wC); - shrCheckError(errcode, CL_SUCCESS); - - localWorkSize[0] = BLOCK_SIZE; - localWorkSize[1] = BLOCK_SIZE; - globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - - errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, - globalWorkSize, localWorkSize, - 0, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - - // Retrieve result from device - errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, - h_C, 0, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - - // Print out the result -/* - printf("\n\nMatrix C (Result)\n"); - for(int i = 0; i < size_C; i++) { - printf("%f ", h_C[i]); - if(((i + 1) % WC) == 0) - printf("\n"); - } - printf("\n"); - */ - printf("\nDone!\n"); - - // Deallocate memory - free(h_A); - free(h_B); - free(h_C); - - clReleaseMemObject(d_A); - clReleaseMemObject(d_C); - clReleaseMemObject(d_B); - - free(clDevices); - free(clMatrixMul); - clReleaseContext(clGPUContext); - clReleaseKernel(clKernel); - clReleaseProgram(clProgram); - clReleaseCommandQueue(clCommandQue); - -} - diff --git a/hpvm/test/gemm_opencl/matrixMul_sh_bc/matrixMul_sh_bc.cl b/hpvm/test/gemm_opencl/matrixMul_sh_bc/matrixMul_sh_bc.cl deleted file mode 100644 index ffa734eaa37eac8e85c4a884ce9d9f848b1ba970..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_sh_bc/matrixMul_sh_bc.cl +++ /dev/null @@ -1,92 +0,0 @@ -// Tile size -#define TILE_SIZE 16 - -// OpenCL Kernel for matrix multiply, C = A * B -// using shared memory and with boundary conditions -__kernel void matrixMul(__global float* C, - __global float* A, - __global float* B, - int m, - int k, - int n) { - - int tx = get_local_id(0); //2D Local Thread ID x - int ty = get_local_id(1); //2D Local Thread ID y - - int bx = get_group_id(0); //2D Block ID x - int by = get_group_id(1); //2D Block ID y - - int col = bx * TILE_SIZE + tx; - int row = by * TILE_SIZE + ty; - - // Static work-group (thread block) local (shared) memory allocations - __local float A_s[TILE_SIZE][TILE_SIZE]; - __local float B_s[TILE_SIZE][TILE_SIZE]; - - // Loop counters - int i, l; - - // Initialize accumulator - float res = 0.0f; - - /* In the tiled version of matrix multiplication for arbitrary sizes, * - * threads that are not matched with an element of the output matrix may * - * still participate in the memory transfer */ - - for (l = 0; l < k/TILE_SIZE; l++) { - /* Loop for calculating with interior tiles (as far as k is concerned) * - * Threads may still exceed the bottom limit of matrix A and the right * - * limit of matrix B */ - - /* check: thread does not exceed bottom limit of A */ - if (row < m) - A_s[ty][tx] = A[row*k + TILE_SIZE*l + tx]; - - /* check: thread does not exceed right limit of B */ - if (col < n) - B_s[ty][tx] = B[(TILE_SIZE*l + ty)*n + col]; - - /* Barrier: memory transfer must be completed before calculations start */ - barrier(CLK_LOCAL_MEM_FENCE); - - /* The threads that are within limits of output matrix accumulate in * - * their local result */ - if ((row < m) && (col < n)) - for (i = 0; i < TILE_SIZE; i++) - res += A_s[ty][i]*B_s[i][tx]; - - /* Barrier: calculations must be over before next memory transfer starts */ - barrier(CLK_LOCAL_MEM_FENCE); - } - - /* For this last iteration, which will occur if matrices' sizes are not * - * proportional to the tile size, we must take care not to exceed the right * - * limit of A and the bottom limit of B as well */ - - /* 1st check: thread does not exceed right limit of A * - * 2st check: thread does not exceed bottom limit of A */ - if ((TILE_SIZE*l + tx < k) && (row < m)) - A_s[ty][tx] = A[row*k + TILE_SIZE*l + tx]; - - /* 1st check: thread does not exceed bottom limit of B * - * 2st check: thread does not exceed right limit of B */ - if ((TILE_SIZE*l + ty < k) && (col < n)) - B_s[ty][tx] = B[(TILE_SIZE*l + ty)*n + col]; - - /* Barrier: memory transfer must be completed before calculations can start */ - barrier(CLK_LOCAL_MEM_FENCE); - - /* The threads that are within limits of output matrix accumulate in their * - * local result */ - if ((row < m) && (col < n)) { - for (i = 0; i < min(TILE_SIZE, k - TILE_SIZE*l); i++) - res += A_s[ty][i]*B_s[i][tx]; - /* Synchronization is not necessary, because each thread within limits * - * of output matrix copies back its own private result */ - C[row*n + col] = res; - } - - return; - -} - diff --git a/hpvm/test/gemm_opencl/matrixMul_sh_bc/sgemm.c b/hpvm/test/gemm_opencl/matrixMul_sh_bc/sgemm.c deleted file mode 100644 index 1cf29a212c7eddab4f0e7c82da832f1a3f589e6a..0000000000000000000000000000000000000000 --- a/hpvm/test/gemm_opencl/matrixMul_sh_bc/sgemm.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Main entry of vector addition kernel - */ - -#include <stdio.h> -#include <stdlib.h> - -void matrixMultiply(float *C, float *A, float *B, int m, int k, int n); - -/* -// Host matrix multiply -void matrixMulHost (int m, k, n, int *A, int *B, int *C) { - - for (int i = 0; i < m; i++) - for (int j = 0; j < n; j++) - for (int t = 0; t < k; t++) - C[i*n + j] = A[i*k + t] + B[t*n + j]; - - return; -} -*/ - -// Allocates a matrix with random float entries. -void randomInit(float* data, int size) { - for (int i = 0; i < size; ++i) - data[i] = rand() / (float)RAND_MAX; -} - -int main (int argc, char *argv[]) { - int m = atoi(argv[1]); - int k = atoi(argv[2]); - int n = atoi(argv[3]); - - float *A, *B, *C; - - /******************************************************************** - Allocate memory and initialize the input/output vectors - ********************************************************************/ - - A = (float *) malloc(m*k*sizeof(float)); - B = (float *) malloc(k*n*sizeof(float)); - C = (float *) malloc(m*n*sizeof(float)); - - randomInit(A, m*k); - randomInit(B, k*n); - - matrixMultiply(C, A, B, m, k, n); - - /******************************************************************** - Free memory allocations - ********************************************************************/ - - free(A); free(B); free(C); - - return 0; -} - diff --git a/hpvm/test/singleVecNode/Makefile b/hpvm/test/singleVecNode/Makefile deleted file mode 100644 index 5a13636aca764f63fbef04f57e73e7a41bfb394a..0000000000000000000000000000000000000000 --- a/hpvm/test/singleVecNode/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -TARGET := singleVecNode -PASSES := -OPTS := -mllvm -vectorize-slp-aggressive -SOURCEFILES := $(TARGET).ll $(TARGET).opt.ll - -.PHONY: clean - -all: $(SOURCEFILES) - -$(TARGET).opt.ll: $(TARGET).opt.bc - llvm-dis $< > $@ - -$(TARGET).opt.bc: $(TARGET).bc - opt $(PASSES) $< > $@ - -$(TARGET).bc: $(TARGET).ll - llvm-as $< > $@ - -$(TARGET).ll: $(TARGET).c - clang -O3 -emit-llvm -S $(OPTS) $< -o $@ - -clean : - rm -f $(SOURCEFILES) *.bc diff --git a/hpvm/test/singleVecNode/singleVecNode.c b/hpvm/test/singleVecNode/singleVecNode.c deleted file mode 100644 index 88b579b8caa1c929e553ea7c1c13391eb8c2f574..0000000000000000000000000000000000000000 --- a/hpvm/test/singleVecNode/singleVecNode.c +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Main entry of vector addition kernel - */ - -#include <stdio.h> -#include <stdlib.h> - -#define N 16 - -int main (int argc, char *argv[]) { - float n = atof(argv[1]); - - float a[N]; - int i; - for(i = 0; i<N; i++) { - a[i] = n; - } - for(i=0; i<N;i++) { - a[i] = a[i] + i; - } - for(i=0; i<N;i++) { - printf("a[%d] = %f\n", i, a[i]); - } - - return 0; -} - diff --git a/hpvm/test/singleVecNode/visc_singleVecNode.ll b/hpvm/test/singleVecNode/visc_singleVecNode.ll deleted file mode 100644 index 58d0a9a3c79a9ea5990ab2db2ebedab55a4bf929..0000000000000000000000000000000000000000 --- a/hpvm/test/singleVecNode/visc_singleVecNode.ll +++ /dev/null @@ -1,90 +0,0 @@ -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@.str = private unnamed_addr constant [12 x i8] c"a[%d] = %f\0A\00", align 1 - -declare i8* @llvm.visc.createNode(i8*) - -@llvm.visc.root = global i32 ()* @Root - -; Function for llvm.visc.root - -define i32 @Root() { - %init_node = call i8* @llvm.visc.createNode(i8* bitcast (i32 (i32, i8**)* @main to i8*)) - ret i32 0 -} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { -middle.block: - %a = alloca [16 x float], align 16 - %arrayidx = getelementptr inbounds i8** %argv, i64 1 - %0 = load i8** %arrayidx, align 8, !tbaa !0 - %call.i = call double @strtod(i8* nocapture %0, i8** null) #1 - %1 = bitcast [16 x float]* %a to i8* - call void @llvm.lifetime.start(i64 64, i8* %1) #1 - %conv = fptrunc double %call.i to float - %broadcast.splatinsert63 = insertelement <4 x float> undef, float %conv, i32 0 - %broadcast.splat64 = shufflevector <4 x float> %broadcast.splatinsert63, <4 x float> undef, <4 x i32> zeroinitializer - %2 = bitcast [16 x float]* %a to <4 x float>* - store <4 x float> %broadcast.splat64, <4 x float>* %2, align 16 - %3 = getelementptr inbounds [16 x float]* %a, i64 0, i64 4 - %4 = bitcast float* %3 to <4 x float>* - store <4 x float> %broadcast.splat64, <4 x float>* %4, align 16 - %5 = getelementptr inbounds [16 x float]* %a, i64 0, i64 8 - %6 = bitcast float* %5 to <4 x float>* - store <4 x float> %broadcast.splat64, <4 x float>* %6, align 16 - %7 = getelementptr inbounds [16 x float]* %a, i64 0, i64 12 - %8 = bitcast float* %7 to <4 x float>* - store <4 x float> %broadcast.splat64, <4 x float>* %8, align 16 - %wide.load = load <4 x float>* %2, align 16 - %9 = fadd <4 x float> %wide.load, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> - store <4 x float> %9, <4 x float>* %2, align 16 - %wide.load.1 = load <4 x float>* %4, align 16 - %10 = fadd <4 x float> %wide.load.1, <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00> - store <4 x float> %10, <4 x float>* %4, align 16 - %wide.load.2 = load <4 x float>* %6, align 16 - %11 = fadd <4 x float> %wide.load.2, <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01> - store <4 x float> %11, <4 x float>* %6, align 16 - %wide.load.3 = load <4 x float>* %8, align 16 - %12 = fadd <4 x float> %wide.load.3, <float 1.200000e+01, float 1.300000e+01, float 1.400000e+01, float 1.500000e+01> - store <4 x float> %12, <4 x float>* %8, align 16 - br label %for.body18 - -for.body18: ; preds = %for.body18, %middle.block - %indvars.iv = phi i64 [ 0, %middle.block ], [ %indvars.iv.next, %for.body18 ] - %arrayidx20 = getelementptr inbounds [16 x float]* %a, i64 0, i64 %indvars.iv - %13 = load float* %arrayidx20, align 4, !tbaa !3 - %conv21 = fpext float %13 to double - %14 = trunc i64 %indvars.iv to i32 - %call22 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str, i64 0, i64 0), i32 %14, double %conv21) #1 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 16 - br i1 %exitcond, label %for.end25, label %for.body18 - -for.end25: ; preds = %for.body18 - call void @llvm.lifetime.end(i64 64, i8* %1) #1 - ret i32 0 -} - -; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 - -; Function Attrs: nounwind -declare i32 @printf(i8* nocapture, ...) #2 - -; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 - -; Function Attrs: nounwind -declare double @strtod(i8*, i8** nocapture) #2 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!0 = metadata !{metadata !"any pointer", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} -!3 = metadata !{metadata !"float", metadata !1} diff --git a/hpvm/test/testKernel/kernel-spir32.ll b/hpvm/test/testKernel/kernel-spir32.ll deleted file mode 100644 index e5c403f87b49073edfe691515d47d42b7846933c..0000000000000000000000000000000000000000 --- a/hpvm/test/testKernel/kernel-spir32.ll +++ /dev/null @@ -1,38 +0,0 @@ -; ModuleID = '/tmp/qt_temp.w24812' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -target triple = "spir-unknown-unknown" - -define cc76 void @mysgemmNT(float addrspace(1)* nocapture %A, float addrspace(1)* nocapture %B, float addrspace(1)* nocapture %C) nounwind { - %1 = tail call cc75 i32 @_Z13get_global_idj(i32 0) nounwind readnone - %2 = getelementptr inbounds float addrspace(1)* %A, i32 %1 - %3 = load float addrspace(1)* %2, align 4, !tbaa !9 - %4 = getelementptr inbounds float addrspace(1)* %B, i32 %1 - %5 = load float addrspace(1)* %4, align 4, !tbaa !9 - %6 = fmul float %3, %5 - %7 = getelementptr inbounds float addrspace(1)* %C, i32 %1 - store float %6, float addrspace(1)* %7, align 4, !tbaa !9 - ret void -} - -declare cc75 i32 @_Z13get_global_idj(i32) nounwind readnone - -!opencl.kernels = !{!0} -!opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!7} -!opencl.ocl.version = !{!7} -!opencl.used.extensions = !{!8} -!opencl.used.optional.core.features = !{!8} -!opencl.compiler.options = !{!8} - -!0 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*)* @mysgemmNT, metadata !1, metadata !2, metadata !3, metadata !4, metadata !5, metadata !6} -!1 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 1, i32 1} -!2 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none"} -!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"float*", metadata !"float*"} -!4 = metadata !{metadata !"kernel_arg_type_qual", metadata !"const", metadata !"const", metadata !""} -!5 = metadata !{metadata !"kernel_arg_base_type", metadata !"float*", metadata !"float*", metadata !"float*"} -!6 = metadata !{metadata !"kernel_arg_name", metadata !"A", metadata !"B", metadata !"C"} -!7 = metadata !{i32 1, i32 2} -!8 = metadata !{} -!9 = metadata !{metadata !"float", metadata !10} -!10 = metadata !{metadata !"omnipotent char", metadata !11} -!11 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/hpvm/test/testKernel/kernel-spir64.ll b/hpvm/test/testKernel/kernel-spir64.ll deleted file mode 100644 index 9d97d957fa316110c5461cbd329e731491dc5c89..0000000000000000000000000000000000000000 --- a/hpvm/test/testKernel/kernel-spir64.ll +++ /dev/null @@ -1,40 +0,0 @@ -; ModuleID = '/tmp/qt_temp.w24812' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -target triple = "spir64-unknown-unknown" - -define cc76 void @mysgemmNT(float addrspace(1)* nocapture %A, float addrspace(1)* nocapture %B, float addrspace(1)* nocapture %C) nounwind { - %1 = tail call cc75 i64 @_Z13get_global_idj(i32 0) nounwind readnone - %sext = shl i64 %1, 32 - %2 = ashr exact i64 %sext, 32 - %3 = getelementptr inbounds float addrspace(1)* %A, i64 %2 - %4 = load float addrspace(1)* %3, align 4, !tbaa !9 - %5 = getelementptr inbounds float addrspace(1)* %B, i64 %2 - %6 = load float addrspace(1)* %5, align 4, !tbaa !9 - %7 = fmul float %4, %6 - %8 = getelementptr inbounds float addrspace(1)* %C, i64 %2 - store float %7, float addrspace(1)* %8, align 4, !tbaa !9 - ret void -} - -declare cc75 i64 @_Z13get_global_idj(i32) nounwind readnone - -!opencl.kernels = !{!0} -!opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!7} -!opencl.ocl.version = !{!7} -!opencl.used.extensions = !{!8} -!opencl.used.optional.core.features = !{!8} -!opencl.compiler.options = !{!8} - -!0 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*)* @mysgemmNT, metadata !1, metadata !2, metadata !3, metadata !4, metadata !5, metadata !6} -!1 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 1, i32 1} -!2 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none"} -!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"float*", metadata !"float*"} -!4 = metadata !{metadata !"kernel_arg_type_qual", metadata !"const", metadata !"const", metadata !""} -!5 = metadata !{metadata !"kernel_arg_base_type", metadata !"float*", metadata !"float*", metadata !"float*"} -!6 = metadata !{metadata !"kernel_arg_name", metadata !"A", metadata !"B", metadata !"C"} -!7 = metadata !{i32 1, i32 2} -!8 = metadata !{} -!9 = metadata !{metadata !"float", metadata !10} -!10 = metadata !{metadata !"omnipotent char", metadata !11} -!11 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/hpvm/test/testKernel/kernel.asm b/hpvm/test/testKernel/kernel.asm deleted file mode 100644 index fe6cadaf41b3683ae89710711ea763f76096edf3..0000000000000000000000000000000000000000 --- a/hpvm/test/testKernel/kernel.asm +++ /dev/null @@ -1,83 +0,0 @@ - .file "main" - .text - .globl mysgemmNT - .align 16, 0x90 - .type mysgemmNT,@function -mysgemmNT: - .cfi_startproc - pushq %rbp -.Ltmp3: - .cfi_def_cfa_offset 16 -.Ltmp4: - .cfi_offset %rbp, -16 - movq %rsp, %rbp -.Ltmp5: - .cfi_def_cfa_register %rbp - pushq %r14 - pushq %rbx - andq $-8, %rsp -.Ltmp6: - .cfi_offset %rbx, -32 -.Ltmp7: - .cfi_offset %r14, -24 - movq (%rsi), %r9 - movq 32(%rdi), %r8 - movq 16(%rdi), %r14 - movq (%rdi), %rcx - movq 8(%rdi), %rdx - movq 80(%rdi), %r10 - movq %r10, %rsi - sarq $2, %rsi - je .LBB0_3 - movl %r10d, %eax - imull %r9d, %eax - addl %r8d, %eax - shlq $32, %rax - movabsq $17179869184, %r11 - movq %rsi, %rbx - .align 16, 0x90 -.LBB0_2: - movq %rax, %rdi - sarq $32, %rdi - vmovups (%rdx,%rdi,4), %xmm0 - vmulps (%rcx,%rdi,4), %xmm0, %xmm0 - vmovups %xmm0, (%r14,%rdi,4) - addq %r11, %rax - decq %rbx - jne .LBB0_2 -.LBB0_3: - movq %r10, %rax - andq $-4, %rax - cmpq %rax, %r10 - je .LBB0_6 - shlq $2, %rsi - movq %r10, %rdi - subq %rsi, %rdi - negq %rdi - imull %r9d, %r10d - addl %r10d, %r8d - addl %r8d, %esi - shlq $32, %rsi - movabsq $4294967296, %r8 - .align 16, 0x90 -.LBB0_5: - movq %rsi, %rax - sarq $32, %rax - vmovss (%rcx,%rax,4), %xmm0 - vmulss (%rdx,%rax,4), %xmm0, %xmm0 - vmovss %xmm0, (%r14,%rax,4) - addq %r8, %rsi - incq %rdi - jne .LBB0_5 -.LBB0_6: - leaq -16(%rbp), %rsp - popq %rbx - popq %r14 - popq %rbp - ret -.Ltmp8: - .size mysgemmNT, .Ltmp8-mysgemmNT - .cfi_endproc - - - .section ".note.GNU-stack","",@progbits diff --git a/hpvm/test/testKernel/kernel.cl b/hpvm/test/testKernel/kernel.cl deleted file mode 100644 index 235a6498c4a01d43af7933f1bc2ef585a80fdedf..0000000000000000000000000000000000000000 --- a/hpvm/test/testKernel/kernel.cl +++ /dev/null @@ -1,18 +0,0 @@ -/*************************************************************************** - *cr - *cr (C) Copyright 2010 The Board of Trustees of the - *cr University of Illinois - *cr All Rights Reserved - *cr - ***************************************************************************/ - -/* - * Kernel of dense matrix-matrix multiplication kernel. - */ - -__kernel void mysgemmNT( __global const float *A, __global const float *B, __global float* C) -{ - int m = get_global_id(0); - - C[m] = A[m] * B[m]; -} diff --git a/hpvm/test/testKernel/kernel.ll b/hpvm/test/testKernel/kernel.ll deleted file mode 100644 index 03a29ff6979b9648bae22f3caf6722e102fd78dd..0000000000000000000000000000000000000000 --- a/hpvm/test/testKernel/kernel.ll +++ /dev/null @@ -1,154 +0,0 @@ -; ModuleID = 'main' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-pc-linux" - -; Function Attrs: nounwind -declare void @__mysgemmNT_before.AddImplicitArgs(float addrspace(1)* nocapture, float addrspace(1)* nocapture, float addrspace(1)* nocapture) #0 - -; Function Attrs: nounwind readnone -declare i64 @_Z13get_global_idj(i32) #1 - -declare [7 x i64] @__WG.boundaries.mysgemmNT_before.AddImplicitArgs(float addrspace(1)*, float addrspace(1)*, float addrspace(1)*) - -declare i64 @_Z14get_local_sizej(i32) - -declare i64 @get_base_global_id.(i32) - -declare i1 @__ocl_allOne(i1) - -declare i1 @__ocl_allZero(i1) - -; Function Attrs: alwaysinline nounwind -declare void @__mysgemmNT_separated_args(float addrspace(1)* nocapture, float addrspace(1)* nocapture, float addrspace(1)* nocapture, i8 addrspace(3)* noalias, { i64, [3 x i64], [3 x i64], [2 x [3 x i64]], [3 x i64], {}*, {}* }* noalias, i64* noalias, [4 x i64], i8* noalias, {}* noalias) #2 - -declare [7 x i64] @WG.boundaries.mysgemmNT(float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i8 addrspace(3)* noalias, { i64, [3 x i64], [3 x i64], [2 x [3 x i64]], [3 x i64], {}*, {}* }* noalias, i64* noalias, [4 x i64], i8* noalias, {}* noalias) - -define void @mysgemmNT(i8* noalias %pUniformArgs, i64* noalias %pWGId, {}* noalias %RuntimeHandle) { -wrapper_entry: - %0 = bitcast i8* %pUniformArgs to float addrspace(1)** - %explicit_0 = load float addrspace(1)** %0, align 8 - %1 = getelementptr i8* %pUniformArgs, i64 8 - %2 = bitcast i8* %1 to float addrspace(1)** - %explicit_1 = load float addrspace(1)** %2, align 8 - %3 = getelementptr i8* %pUniformArgs, i64 16 - %4 = bitcast i8* %3 to float addrspace(1)** - %explicit_2 = load float addrspace(1)** %4, align 8 - %5 = getelementptr i8* %pUniformArgs, i64 80 - %6 = bitcast i8* %5 to i64* - %LocalSize_0 = load i64* %6, align 8 - %7 = getelementptr i8* %pUniformArgs, i64 32 - %8 = bitcast i8* %7 to i64* - %GlobalOffset_0 = load i64* %8, align 8 - %GroupID_0 = load i64* %pWGId, align 8 - %vector.size.i = ashr i64 %LocalSize_0, 2 - %num.vector.wi.i = shl nsw i64 %vector.size.i, 2 - %9 = icmp eq i64 %vector.size.i, 0 - br i1 %9, label %scalarIf.i, label %dim_0_vector_pre_head.i.preheader - -dim_0_vector_pre_head.i.preheader: ; preds = %wrapper_entry - %10 = mul i64 %LocalSize_0, %GroupID_0 - %11 = add i64 %GlobalOffset_0, %10 - %12 = mul i64 %11, 4294967296 - br label %dim_0_vector_pre_head.i - -dim_0_vector_pre_head.i: ; preds = %dim_0_vector_pre_head.i.preheader, %dim_0_vector_pre_head.i - %lsr.iv5 = phi i64 [ %12, %dim_0_vector_pre_head.i.preheader ], [ %lsr.iv.next6, %dim_0_vector_pre_head.i ] - %lsr.iv3 = phi i64 [ %vector.size.i, %dim_0_vector_pre_head.i.preheader ], [ %lsr.iv.next4, %dim_0_vector_pre_head.i ] - %extractvector_func.i = ashr exact i64 %lsr.iv5, 32 - %13 = getelementptr inbounds float addrspace(1)* %explicit_0, i64 %extractvector_func.i - %ptrTypeCastvector_func.i = bitcast float addrspace(1)* %13 to <4 x float> addrspace(1)* - %14 = load <4 x float> addrspace(1)* %ptrTypeCastvector_func.i, align 1 - %15 = getelementptr inbounds float addrspace(1)* %explicit_1, i64 %extractvector_func.i - %ptrTypeCast5vector_func.i = bitcast float addrspace(1)* %15 to <4 x float> addrspace(1)* - %16 = load <4 x float> addrspace(1)* %ptrTypeCast5vector_func.i, align 1 - %17 = fmul <4 x float> %14, %16 - %18 = getelementptr inbounds float addrspace(1)* %explicit_2, i64 %extractvector_func.i - %ptrTypeCast6vector_func.i = bitcast float addrspace(1)* %18 to <4 x float> addrspace(1)* - store <4 x float> %17, <4 x float> addrspace(1)* %ptrTypeCast6vector_func.i, align 1 - %lsr.iv.next4 = add i64 %lsr.iv3, -1 - %lsr.iv.next6 = add i64 %lsr.iv5, 17179869184 - %dim_0_vector_cmp.to.max.i = icmp eq i64 %lsr.iv.next4, 0 - br i1 %dim_0_vector_cmp.to.max.i, label %scalarIf.i, label %dim_0_vector_pre_head.i - -scalarIf.i: ; preds = %dim_0_vector_pre_head.i, %wrapper_entry - %19 = icmp eq i64 %LocalSize_0, %num.vector.wi.i - br i1 %19, label %__mysgemmNT_separated_args.exit, label %scalar_kernel_entry.i.preheader - -scalar_kernel_entry.i.preheader: ; preds = %scalarIf.i - %20 = mul i64 %vector.size.i, 4 - %21 = sub i64 %LocalSize_0, %20 - %22 = mul i64 %LocalSize_0, %GroupID_0 - %23 = add i64 %GlobalOffset_0, %22 - %24 = add i64 %23, %20 - %25 = mul i64 %24, 4294967296 - %26 = sub i64 0, %21 - br label %scalar_kernel_entry.i - -scalar_kernel_entry.i: ; preds = %scalar_kernel_entry.i.preheader, %scalar_kernel_entry.i - %lsr.iv7 = phi i64 [ %26, %scalar_kernel_entry.i.preheader ], [ %lsr.iv.next8, %scalar_kernel_entry.i ] - %lsr.iv1 = phi i64 [ %25, %scalar_kernel_entry.i.preheader ], [ %lsr.iv.next2, %scalar_kernel_entry.i ] - %27 = ashr exact i64 %lsr.iv1, 32 - %28 = getelementptr inbounds float addrspace(1)* %explicit_0, i64 %27 - %29 = load float addrspace(1)* %28, align 1 - %30 = getelementptr inbounds float addrspace(1)* %explicit_1, i64 %27 - %31 = load float addrspace(1)* %30, align 1 - %32 = fmul float %29, %31 - %33 = getelementptr inbounds float addrspace(1)* %explicit_2, i64 %27 - store float %32, float addrspace(1)* %33, align 1 - %lsr.iv.next2 = add i64 %lsr.iv1, 4294967296 - %lsr.iv.next8 = add i64 %lsr.iv7, 1 - %dim_0_cmp.to.max.i = icmp eq i64 %lsr.iv.next8, 0 - br i1 %dim_0_cmp.to.max.i, label %__mysgemmNT_separated_args.exit, label %scalar_kernel_entry.i - -__mysgemmNT_separated_args.exit: ; preds = %scalar_kernel_entry.i, %scalarIf.i - ret void -} - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!opencl.kernels = !{!0} -!opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!7} -!opencl.ocl.version = !{!7} -!opencl.used.extensions = !{!8} -!opencl.used.optional.core.features = !{!8} -!opencl.compiler.options = !{!8} -!opencl.kernel_info = !{!9} -!opencl.module_info_list = !{!26} -!llvm.functions_info = !{} -!opencl.functions_stats = !{} -!opencl.stat_descriptions = !{} -!opencl.module_stat_info = !{} - -!0 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i8 addrspace(3)*, { i64, [3 x i64], [3 x i64], [2 x [3 x i64]], [3 x i64], {}*, {}* }*, i64*, [4 x i64], i8*, {}*)* @__mysgemmNT_separated_args, metadata !1, metadata !2, metadata !3, metadata !4, metadata !5, metadata !6} -!1 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 1, i32 1} -!2 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none"} -!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"float*", metadata !"float*"} -!4 = metadata !{metadata !"kernel_arg_type_qual", metadata !"const", metadata !"const", metadata !""} -!5 = metadata !{metadata !"kernel_arg_base_type", metadata !"float*", metadata !"float*", metadata !"float*"} -!6 = metadata !{metadata !"kernel_arg_name", metadata !"A", metadata !"B", metadata !"C"} -!7 = metadata !{i32 1, i32 2} -!8 = metadata !{} -!9 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i8 addrspace(3)*, { i64, [3 x i64], [3 x i64], [2 x [3 x i64]], [3 x i64], {}*, {}* }*, i64*, [4 x i64], i8*, {}*)* @__mysgemmNT_separated_args, metadata !10} -!10 = metadata !{metadata !11, metadata !12, metadata !13, metadata !14, metadata !15, metadata !16, metadata !17, metadata !18, metadata !19, metadata !20, metadata !21, metadata !22, metadata !23, metadata !24, metadata !25} -!11 = metadata !{metadata !"local_buffer_size", i32 0} -!12 = metadata !{metadata !"barrier_buffer_size", i32 0} -!13 = metadata !{metadata !"kernel_execution_length", i32 11} -!14 = metadata !{metadata !"max_wg_dimensions", i32 1} -!15 = metadata !{metadata !"kernel_has_barrier", i1 false} -!16 = metadata !{metadata !"kernel_has_global_sync", i1 false} -!17 = metadata !{metadata !"no_barrier_path", i1 true} -!18 = metadata !{metadata !"vectorized_kernel", null} -!19 = metadata !{metadata !"vectorized_width", i32 4} -!20 = metadata !{metadata !"kernel_wrapper", void (i8*, i64*, {}*)* @mysgemmNT} -!21 = metadata !{metadata !"scalarized_kernel", null} -!22 = metadata !{metadata !"block_literal_size", null} -!23 = metadata !{metadata !"private_memory_size", i32 0} -!24 = metadata !{metadata !"vectorization_dimension", i32 0} -!25 = metadata !{metadata !"can_unite_workgroups", i1 true} -!26 = metadata !{metadata !27, metadata !28, metadata !29} -!27 = metadata !{metadata !"global_variable_total_size", i64 0} -!28 = metadata !{metadata !"gen_addr_space_pointer_counter", null} -!29 = metadata !{metadata !"gen_addr_space_pointer_warnings"} diff --git a/hpvm/test/vectorAdd/Makefile b/hpvm/test/vectorAdd/Makefile deleted file mode 100644 index ecc4e035e87f67283563676aef7f516284055ef6..0000000000000000000000000000000000000000 --- a/hpvm/test/vectorAdd/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -PASSES := -SOURCEFILES := vecadd.ll vecadd.opt.ll - -.PHONY: clean - -all: $(SOURCEFILES) - -vecadd.opt.ll: vecadd.opt.bc - llvm-dis $< > $@ - -vecadd.opt.bc: vecadd.bc - opt $(PASSES) $< > $@ - -vecadd.bc: vecadd.ll - llvm-as $< > $@ - -vecadd.ll: vecadd.c - clang -O3 -emit-llvm -S $< -o $@ - -clean : - rm -f $(SOURCEFILES) *.bc diff --git a/hpvm/test/vectorAdd/old_visc_vecadd.ll b/hpvm/test/vectorAdd/old_visc_vecadd.ll deleted file mode 100644 index c88f20378b3708e0b13e7625c92ee8aa22b1997f..0000000000000000000000000000000000000000 --- a/hpvm/test/vectorAdd/old_visc_vecadd.ll +++ /dev/null @@ -1,250 +0,0 @@ -; ModuleID = 'vecadd.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -%allocateResult_ty = struct { i32* } -define void @allocateResult(i32 %size, %allocateResult_ty* %out) { - %sext = shl i64 %size, 32 - %0 = ashr exact i64 %sext, 30 - %1 = tail call noalias i8* @malloc(i64 %0) #3 - %2 = bitcast i8* %1 to i32* - - %ptr = getelementptr inbounds %allocateResult_ty* *%out, i64 0, i64 0 - store i32* %2, %ptr - ret void -} - -%vecadd_ty = struct { i32* } -define void @vecadd(i32* %adata, i32* %bdata, i32* %cdata, %vecadd_ty* %out) { -entry: - %node = call i8* @llvm.visc.getNode() - %idx = call i32 @llvm.VISC.getNodeInstanceID(%node) - - %idxprom = sext i32 %idx to i64 - %arrayidxa = getelementptr inbounds i32* %adata, i64 %idxprom - %a = load i32* %arrayidxa - %arrayidxb = getelementptr inbounds i32* %bdata, i64 %idxprom - %b = load i32* %arrayidxb - - %add = add nsw i32 %a, %b - %arrayidxc = getelementptr inbounds i32* %cdata, i64 %idxprom - store i32 %add, i32* %arrayidxc, align 4 - - %ptr = getelementptr inbounds %vecadd_ty* *%out, i64 0, i64 0 - store i32* %cdata, %ptr - ret void -} - -%wrapperKernelFunction_ty = struct { i32* } -define void @wrapperKernelFunction(i32 %n, i32* nocapture %A, i32* nocapture %B, %wrapperKernelFunction_ty* %out) { - - %node = call i8* @llvm.visc.getNode() - - %allocate_result_node = call i8* @llvm.visc.createNode(@allocateResult) - %vecadd_node = call i8* @llvm.visc.createNode1D(@vecadd, %n) - - %edge0 = call i8* @llvm.visc.createEdge(%node, %allocate_result_node, @fmap4, @argmap4) - %edge1 = call i8* @llvm.visc.createEdge(%node, %vecadd_node, @fmap5, @argmap5) - %edge2 = call i8* @llvm.visc.createEdge(%allocate_result_node, %vecadd_node, @fmap6, @argmap6) - %edge3 = call i8* @llvm.visc.createEdge(%vecadd_node, %node, @fmap7, @argmap7) - -} - -; This function does not return a result. Is type of %out void* ? -define i32 @cleanupFunction(i8* %A, i8* %B, i8* %C, void* %out) { - tail call void @free(i8* %A) #3 - tail call void @free(i8* %B) #3 - tail call void @free(i8* %C) #3 - ret i32 0 -} - -%initializeFunction_ty = struct { i32, i32*, i32* } -define i32 @initializeFunction(i8** %arg, %initializeFunction_ty* %out) { - %1 = getelementptr inbounds i8** %arg, i64 1 - %2 = load i8** %1, align 8, !tbaa !0 - %3 = tail call i64 @strtol(i8* nocapture %2, i8** null, i32 10) #3 - %4 = trunc i64 %3 to i32 - %sext = shl i64 %3, 32 - %5 = ashr exact i64 %sext, 30 - %6 = tail call noalias i8* @malloc(i64 %5) #3 - %7 = bitcast i8* %6 to i32* - %8 = tail call noalias i8* @malloc(i64 %5) #3 - %9 = bitcast i8* %8 to i32* - - %12 = icmp sgt i32 %4, 0 - br i1 %12, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %0, %.lr.ph - %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] - %13 = tail call i32 @rand() #3 - %14 = srem i32 %13, 1000 - %15 = getelementptr inbounds i32* %7, i64 %indvars.iv - store i32 %14, i32* %15, align 4, !tbaa !3 - %16 = tail call i32 @rand() #3 - %17 = srem i32 %16, 1000 - %18 = getelementptr inbounds i32* %9, i64 %indvars.iv - store i32 %17, i32* %18, align 4, !tbaa !3 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %4 - br i1 %exitcond, label %._crit_edge, label %.lr.ph - -._crit_edge: ; preds = %.lr.ph, %0 - %ptrN = getelementptr inbounds %initializeFunction_ty* *%out, i64 0, i64 0 - store i32* %4, %ptrN - %ptrA = getelementptr inbounds %initializeFunction_ty* *%out, i64 0, i64 1 - store i32* %7, %ptrA - %ptrB = getelementptr inbounds %initializeFunction_ty* *%out, i64 0, i64 2 - store i32* %9, %ptrB - ret i32 0 -} ; - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { - - %node = call i8* @llvm.visc.getNode() - - %init_node = call i8* @llvm.visc.createNode(@initializeFunction) - %kernel_node = call i8* @llvm.visc.createNode(@wrapperKernelFunction) - %cleanup_node = call i8* @llvm.visc.createNode(@cleanupFunction) - - %edge0 = call i8* @llvm.visc.createEdge(%node, %init_node, @fmap0, @argmap0) - %edge1 = call i8* @llvm.visc.createEdge(%init_node, %kernel_node, @fmap1, @argmap1) - %edge2 = call i8* @llvm.visc.createEdge(%init_node, %cleanup_node, @fmap2, @argmap2) - %edge3 = call i8* @llvm.visc.createEdge(%kernel_node, %cleanup_node, @fmap3, @argmap3) - - ret i32 0 -} - - -; FUNCTIONS FOR EDGES - -; There is only one instance of both nodes, so this function is not needed -define i1 @fmap0(i32 %i, i32 %j) { - ret true -} - -define i1 @argmap0(i32 %i, i32 %j) { - %0 = icmp eq i32 %i, 1 - %1 = icmp eq i32 %j, 0 - %res = and i1 %0, %1 - ret %res -} - -; There is only one instance of both nodes, so this function is not needed -define i1 @fmap1(i32 %i, i32 %j) { - ret true -} - -define i1 @argmap1(i32 %i, i32 %j) { - %res = icmp eq i32 %i, %j - ret %res -} - -; There is only one instance of both nodes, so this function is not needed -define i1 @fmap2(i32 %i, i32 %j) { - ret true -} - -define i1 @argmap2(i32 %i, i32 %j) { - %0 = icmp eq i32 %i, 1 - %1 = icmp eq i32 %i, 2 - %out_arg = and i1 %0, %1 - - %in_arg = add i32 %j, 1 - %eq = icmp eq i32 %i, %j - - %res = and i1 %out_arg, %eq - ret %res -} - -; There is only one instance of both nodes, so this function is not needed -define i1 @fmap3(i32 %i, i32 %j) { - ret true -} - -define i1 @argmap3(i32 %i, i32 %j) { - %0 = icmp eq i32 %i, 0 - %1 = icmp eq i32 %j, 2 - %res = and i1 %0, %1 - ret %res -} - -; There is only one instance of both nodes, so this function is not needed -define i1 @fmap4(i32 %i, i32 %j) { - ret true -} - -define i1 @argmap4(i32 %i, i32 %j) { - %res = icmp eq i32 %i, %0 - ret %res -} - -; There is an edge from the unique instance of the source node to all the -; instances of the destination node -define i1 @fmap5(i32 %i, i32 %j) { - ret true -} - -define i1 @argmap5(i32 %i, i32 %j) { - %0 = icmp eq i32 %i, 1 - %1 = icmp eq i32 %i, 2 - %out_arg = and i1 %0, %1 - - %in_arg = add i32 %j, 1 - %eq = icmp eq i32 %i, %out_arg - - %res = and i1 out_arg, %eq - ret %res -} - -; There is an edge from the unique instance of the source node to all the -; instances of the destination node -define i1 @fmap6(i32 %i, i32 %j) { - ret true -} - -define i1 @argmap6(i32 %i, i32 %j) { - %0 = icmp eq i32 %i, 0 - %1 = icmp eq i32 %j, 2 - %res = and i1 %0, %1 - ret %res -} - -; There are edges from all the instances of the source node to the unique -; instance of the destination node -define i1 @fmap7(i32 %i, i32 %j) { - ret true -} - -define i1 @argmap7(i32 %i, i32 %j) { - ret true -} - -declare i8* @llvm.visc.getNode() nounwind readnone - -declare i32 @llvm.VISC.getNodeInstanceID(i8*) nounwind readnone - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind -declare i32 @rand() #1 - -declare void @vectorAdd(i32, i32*, i32*, i32*) #2 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -; Function Attrs: nounwind -declare i64 @strtol(i8*, i8** nocapture, i32) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } - -!0 = metadata !{metadata !"any pointer", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} -!3 = metadata !{metadata !"int", metadata !1} diff --git a/hpvm/test/vectorAdd/vecadd.c b/hpvm/test/vectorAdd/vecadd.c deleted file mode 100644 index ad214a394ab676101d0c2384686f05dccca8a2b1..0000000000000000000000000000000000000000 --- a/hpvm/test/vectorAdd/vecadd.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Main entry of vector addition kernel - */ - -#include <stdio.h> -#include <stdlib.h> - -extern void vectorAdd(int n, int *A, int *B, int *C) ; -/* -{ - - for (int i = 0; i < n; i++) - C[i] = A[i] + B[i]; - - return; -} -*/ - -int main (int argc, char *argv[]) { - int n = atoi(argv[1]); - - int *A, *B, *C; - - /******************************************************************* - * Allocate memory * - *******************************************************************/ -initialize the input/output vectors - A = (int *) malloc(n*sizeof(int)); - B = (int *) malloc(n*sizeof(int)); - C = (int *) malloc(n*sizeof(int)); - - /******************************************************************* - * Initialize memory * - *******************************************************************/ - for (int i = 0; i < n; i++) { - A[i] = rand() % 1000; - B[i] = rand() % 1000; - } - - /******************************************************************* - * Kernel Call * - *******************************************************************/ - vectorAdd(n, A, B, C); - - /******************************************************************* - * Free allocated memory * - *******************************************************************/ - - free(A); free(B); free(C); - - return 0; -} - diff --git a/hpvm/test/vectorAdd/vecadd.ll b/hpvm/test/vectorAdd/vecadd.ll deleted file mode 100644 index 73524b6e5763f928bd273d64046a1b1c94a131a1..0000000000000000000000000000000000000000 --- a/hpvm/test/vectorAdd/vecadd.ll +++ /dev/null @@ -1,212 +0,0 @@ -; ModuleID = 'vecadd.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: nounwind uwtable -define void @vectorAdd(i32 %n, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C) #0 { - %1 = icmp sgt i32 %n, 0 - br i1 %1, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %0 - %cnt.cast = zext i32 %n to i64 - %n.vec = and i64 %cnt.cast, 4294967288 - %cmp.zero = icmp eq i64 %n.vec, 0 - %2 = add i32 %n, -1 - %3 = zext i32 %2 to i64 - %scevgep = getelementptr i32* %C, i64 %3 - br i1 %cmp.zero, label %middle.block, label %vector.memcheck - -vector.memcheck: ; preds = %.lr.ph - %scevgep8 = getelementptr i32* %B, i64 %3 - %scevgep5 = getelementptr i32* %A, i64 %3 - %bound111 = icmp uge i32* %scevgep, %B - %bound010 = icmp uge i32* %scevgep8, %C - %bound1 = icmp uge i32* %scevgep, %A - %bound0 = icmp uge i32* %scevgep5, %C - %found.conflict12 = and i1 %bound010, %bound111 - %found.conflict = and i1 %bound0, %bound1 - %conflict.rdx = or i1 %found.conflict, %found.conflict12 - br i1 %conflict.rdx, label %middle.block, label %vector.body - -vector.body: ; preds = %vector.memcheck, %vector.body - %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.memcheck ] - %4 = getelementptr inbounds i32* %A, i64 %index - %5 = bitcast i32* %4 to <4 x i32>* - %wide.load = load <4 x i32>* %5, align 4 - %.sum21 = or i64 %index, 4 - %6 = getelementptr i32* %A, i64 %.sum21 - %7 = bitcast i32* %6 to <4 x i32>* - %wide.load14 = load <4 x i32>* %7, align 4 - %8 = getelementptr inbounds i32* %B, i64 %index - %9 = bitcast i32* %8 to <4 x i32>* - %wide.load15 = load <4 x i32>* %9, align 4 - %.sum22 = or i64 %index, 4 - %10 = getelementptr i32* %B, i64 %.sum22 - %11 = bitcast i32* %10 to <4 x i32>* - %wide.load16 = load <4 x i32>* %11, align 4 - %12 = add nsw <4 x i32> %wide.load15, %wide.load - %13 = add nsw <4 x i32> %wide.load16, %wide.load14 - %14 = getelementptr inbounds i32* %C, i64 %index - %15 = bitcast i32* %14 to <4 x i32>* - store <4 x i32> %12, <4 x i32>* %15, align 4 - %.sum23 = or i64 %index, 4 - %16 = getelementptr i32* %C, i64 %.sum23 - %17 = bitcast i32* %16 to <4 x i32>* - store <4 x i32> %13, <4 x i32>* %17, align 4 - %index.next = add i64 %index, 8 - %18 = icmp eq i64 %index.next, %n.vec - br i1 %18, label %middle.block, label %vector.body - -middle.block: ; preds = %vector.body, %vector.memcheck, %.lr.ph - %resume.val = phi i64 [ 0, %.lr.ph ], [ 0, %vector.memcheck ], [ %n.vec, %vector.body ] - %cmp.n = icmp eq i64 %cnt.cast, %resume.val - br i1 %cmp.n, label %._crit_edge, label %scalar.ph - -scalar.ph: ; preds = %middle.block, %scalar.ph - %indvars.iv = phi i64 [ %indvars.iv.next, %scalar.ph ], [ %resume.val, %middle.block ] - %19 = getelementptr inbounds i32* %A, i64 %indvars.iv - %20 = load i32* %19, align 4, !tbaa !0 - %21 = getelementptr inbounds i32* %B, i64 %indvars.iv - %22 = load i32* %21, align 4, !tbaa !0 - %23 = add nsw i32 %22, %20 - %24 = getelementptr inbounds i32* %C, i64 %indvars.iv - store i32 %23, i32* %24, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %._crit_edge, label %scalar.ph, !llvm.vectorizer.already_vectorized !3 - -._crit_edge: ; preds = %middle.block, %scalar.ph, %0 - ret void -} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { - %1 = getelementptr inbounds i8** %argv, i64 1 - %2 = load i8** %1, align 8, !tbaa !4 - %3 = tail call i64 @strtol(i8* nocapture %2, i8** null, i32 10) #2 - %4 = trunc i64 %3 to i32 - %sext = shl i64 %3, 32 - %5 = ashr exact i64 %sext, 30 - %6 = tail call noalias i8* @malloc(i64 %5) #2 - %7 = bitcast i8* %6 to i32* - %8 = tail call noalias i8* @malloc(i64 %5) #2 - %9 = bitcast i8* %8 to i32* - %10 = tail call noalias i8* @malloc(i64 %5) #2 - %11 = bitcast i8* %10 to i32* - %12 = icmp sgt i32 %4, 0 - br i1 %12, label %.lr.ph, label %vectorAdd.exit - -.lr.ph: ; preds = %0, %.lr.ph - %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] - %13 = tail call i32 @rand() #2 - %14 = srem i32 %13, 1000 - %15 = getelementptr inbounds i32* %7, i64 %indvars.iv - store i32 %14, i32* %15, align 4, !tbaa !0 - %16 = tail call i32 @rand() #2 - %17 = srem i32 %16, 1000 - %18 = getelementptr inbounds i32* %9, i64 %indvars.iv - store i32 %17, i32* %18, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %4 - br i1 %exitcond, label %._crit_edge, label %.lr.ph - -._crit_edge: ; preds = %.lr.ph - br i1 %12, label %.lr.ph.i, label %vectorAdd.exit - -.lr.ph.i: ; preds = %._crit_edge - %cnt.cast.i = and i64 %3, 4294967295 - %n.vec.i = and i64 %3, 4294967288 - %cmp.zero.i = icmp eq i64 %n.vec.i, 0 - %19 = add i64 %3, 4294967295 - %20 = and i64 %19, 4294967295 - %scevgep.i = getelementptr i32* %11, i64 %20 - br i1 %cmp.zero.i, label %middle.block.i, label %vector.memcheck.i - -vector.memcheck.i: ; preds = %.lr.ph.i - %scevgep8.i = getelementptr i32* %9, i64 %20 - %scevgep5.i = getelementptr i32* %7, i64 %20 - %bound111.i = icmp uge i32* %scevgep.i, %9 - %bound010.i = icmp uge i32* %scevgep8.i, %11 - %bound1.i = icmp uge i32* %scevgep.i, %7 - %bound0.i = icmp uge i32* %scevgep5.i, %11 - %found.conflict12.i = and i1 %bound010.i, %bound111.i - %found.conflict.i = and i1 %bound0.i, %bound1.i - %conflict.rdx.i = or i1 %found.conflict.i, %found.conflict12.i - br i1 %conflict.rdx.i, label %middle.block.i, label %vector.body.i - -vector.body.i: ; preds = %vector.memcheck.i, %vector.body.i - %index.i = phi i64 [ %index.next.i, %vector.body.i ], [ 0, %vector.memcheck.i ] - %21 = getelementptr inbounds i32* %7, i64 %index.i - %22 = bitcast i32* %21 to <4 x i32>* - %wide.load.i = load <4 x i32>* %22, align 4 - %.sum21.i = or i64 %index.i, 4 - %23 = getelementptr i32* %7, i64 %.sum21.i - %24 = bitcast i32* %23 to <4 x i32>* - %wide.load14.i = load <4 x i32>* %24, align 4 - %25 = getelementptr inbounds i32* %9, i64 %index.i - %26 = bitcast i32* %25 to <4 x i32>* - %wide.load15.i = load <4 x i32>* %26, align 4 - %27 = getelementptr i32* %9, i64 %.sum21.i - %28 = bitcast i32* %27 to <4 x i32>* - %wide.load16.i = load <4 x i32>* %28, align 4 - %29 = add nsw <4 x i32> %wide.load15.i, %wide.load.i - %30 = add nsw <4 x i32> %wide.load16.i, %wide.load14.i - %31 = getelementptr inbounds i32* %11, i64 %index.i - %32 = bitcast i32* %31 to <4 x i32>* - store <4 x i32> %29, <4 x i32>* %32, align 4 - %33 = getelementptr i32* %11, i64 %.sum21.i - %34 = bitcast i32* %33 to <4 x i32>* - store <4 x i32> %30, <4 x i32>* %34, align 4 - %index.next.i = add i64 %index.i, 8 - %35 = icmp eq i64 %index.next.i, %n.vec.i - br i1 %35, label %middle.block.i, label %vector.body.i - -middle.block.i: ; preds = %vector.body.i, %vector.memcheck.i, %.lr.ph.i - %resume.val.i = phi i64 [ 0, %.lr.ph.i ], [ 0, %vector.memcheck.i ], [ %n.vec.i, %vector.body.i ] - %cmp.n.i = icmp eq i64 %cnt.cast.i, %resume.val.i - br i1 %cmp.n.i, label %vectorAdd.exit, label %scalar.ph.i - -scalar.ph.i: ; preds = %middle.block.i, %scalar.ph.i - %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %scalar.ph.i ], [ %resume.val.i, %middle.block.i ] - %36 = getelementptr inbounds i32* %7, i64 %indvars.iv.i - %37 = load i32* %36, align 4, !tbaa !0 - %38 = getelementptr inbounds i32* %9, i64 %indvars.iv.i - %39 = load i32* %38, align 4, !tbaa !0 - %40 = add nsw i32 %39, %37 - %41 = getelementptr inbounds i32* %11, i64 %indvars.iv.i - store i32 %40, i32* %41, align 4, !tbaa !0 - %indvars.iv.next.i = add i64 %indvars.iv.i, 1 - %lftr.wideiv.i = trunc i64 %indvars.iv.next.i to i32 - %exitcond.i = icmp eq i32 %lftr.wideiv.i, %4 - br i1 %exitcond.i, label %vectorAdd.exit, label %scalar.ph.i, !llvm.vectorizer.already_vectorized !3 - -vectorAdd.exit: ; preds = %0, %scalar.ph.i, %._crit_edge, %middle.block.i - tail call void @free(i8* %6) #2 - tail call void @free(i8* %8) #2 - tail call void @free(i8* %10) #2 - ret i32 0 -} - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind -declare i32 @rand() #1 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -; Function Attrs: nounwind -declare i64 @strtol(i8*, i8** nocapture, i32) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } - -!0 = metadata !{metadata !"int", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} -!3 = metadata !{} -!4 = metadata !{metadata !"any pointer", metadata !1} diff --git a/hpvm/test/vectorAdd/visc_vecadd.ll b/hpvm/test/vectorAdd/visc_vecadd.ll deleted file mode 100644 index 55a2c030b706d7ba8277bce9434acfdd8622dfa7..0000000000000000000000000000000000000000 --- a/hpvm/test/vectorAdd/visc_vecadd.ll +++ /dev/null @@ -1,129 +0,0 @@ -; ModuleID = 'vecadd.c' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: nounwind uwtable -%vecadd_ty = struct { i32* } -//TODO -define void @vecadd(i32* %adata, i32* %bdata, i32* %cdata, %vecadd_ty* %out) { -entry: - %node = call i8* @llvm.visc.getNode() - %idx = call i32 @llvm.VISC.getNodeInstanceID(%node) - - %idxprom = sext i32 %idx to i64 - %arrayidxa = getelementptr inbounds i32* %adata, i64 %idxprom - %a = load i32* %arrayidxa - %arrayidxb = getelementptr inbounds i32* %bdata, i64 %idxprom - %b = load i32* %arrayidxb - - %add = add nsw i32 %a, %b - %arrayidxc = getelementptr inbounds i32* %cdata, i64 %idxprom - store i32 %add, i32* %arrayidxc, align 4 - - %ptr = getelementptr inbounds %vecadd_ty* %out, i64 0, i64 0 - store i32* %cdata, %ptr - ret void -} - -define void @vecaddWrapper(i32 %n, i32* %adata, i32* %bdata, i32* %cdata) { - - %wrapper_node = call i8* @llvm.visc.getNode() - %kernel_node = call i8* @llvm.visc.createNode1D(@vecadd,%n) - - %edge0 = call i8* @llvm.visc.createEdge(%wrapper_node, %kernel_node, @edgemap0, @argmap0) - %edge1 = call i8* @llvm.visc.createEdge(%kernel_node, %wrapper_node, @edgemap1, @argmap1) - -} - -; Function Attrs: nounwind uwtable -define i32 @main(i32 %argc, i8** nocapture %argv) #0 { -; Read input size - %1 = getelementptr inbounds i8** %argv, i64 1 - %2 = load i8** %1, align 8, !tbaa !4 - %3 = tail call i64 @strtol(i8* nocapture %2, i8** null, i32 10) #2 - %4 = trunc i64 %3 to i32 - -; Allocate memory - %sext = shl i64 %3, 32 - %5 = ashr exact i64 %sext, 30 - %6 = tail call noalias i8* @malloc(i64 %5) #2 - %7 = bitcast i8* %6 to i32* - %8 = tail call noalias i8* @malloc(i64 %5) #2 - %9 = bitcast i8* %8 to i32* - %10 = tail call noalias i8* @malloc(i64 %5) #2 - %11 = bitcast i8* %10 to i32* - %12 = icmp sgt i32 %4, 0 - br i1 %12, label %.lr.ph, label %vectorAdd.exit - -; Initialize memory -.lr.ph: ; preds = %0, %.lr.ph - %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] - %13 = tail call i32 @rand() #2 - %14 = srem i32 %13, 1000 - %15 = getelementptr inbounds i32* %7, i64 %indvars.iv - store i32 %14, i32* %15, align 4, !tbaa !0 - %16 = tail call i32 @rand() #2 - %17 = srem i32 %16, 1000 - %18 = getelementptr inbounds i32* %9, i64 %indvars.iv - store i32 %17, i32* %18, align 4, !tbaa !0 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %4 - br i1 %exitcond, label %._crit_edge, label %.lr.ph - -._crit_edge: ; preds = %.lr.ph - br i1 %12, label %.lr.ph.i, label %vectorAdd.exit - -; Entrance point to the dataflow graph -.lr.ph.i: ; preds = %._crit_edge - %DFfuture = call i8* @llvm.visc.launch(@vecaddWrapper,%4,%7,%9,%11) -; Get the result from the DF future. - %19 = bitcast i8* %DFfuture to %vecadd_ty* - %20 = getelementptr %vecadd_ty* %19, i32 0, i32 0 - %21 = load i32** %20 - -; Free allocated memory -vectorAdd.exit: ; preds = %0, %scalar.ph.i, - ; %._crit_edge, - ; %middle.block.i - tail call void @free(i8* %6) #2 - tail call void @free(i8* %8) #2 - tail call void @free(i8* %10) #2 - ret i32 0 -} - -declare i8* @llvm.visc.launch(i8*, ...) - -declare i8* @llvm.visc.getNode() nounwind readnone -declare i32 @llvm.VISC.getNodeInstanceID(i8*) nounwind readnone - -declare i8* @llvm.visc.createNode(i8*) nounwind readnone -declare i8* @llvm.visc.createNode1D(i8*,i32) nounwind readnone - -declare i8* @llvm.visc.createEdge(i8*,i8*,i8*,i8*) nounwind readnone - -; Function Attrs: nounwind -declare noalias i8* @malloc(i64) #1 - -; Function Attrs: nounwind -declare i32 @rand() #1 - -; Function Attrs: nounwind -declare void @free(i8* nocapture) #1 - -; Function Attrs: nounwind -declare i64 @strtol(i8*, i8** nocapture, i32) #1 - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" - "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" - "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" - "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" - "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } - -!0 = metadata !{metadata !"int", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} -!3 = metadata !{} -!4 = metadata !{metadata !"any pointer", metadata !1}