diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
index d0f23ce14ea399adad3a91068980a226b1426a2c..36e772486444c3428b3cafb24b8fb0bd2b2ea4a8 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
+++ b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
@@ -19,7 +19,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; Return Type of VISC Compute Matrix Mul
 %rtype = type {float*, i32}
-%struct.arg = type { float*, i32, float*, i32, %rtype }
+%struct.arg = type { float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype }
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.launch(i8*, i8*) #0
@@ -542,18 +542,72 @@ declare i32 @clReleaseProgram(%struct._cl_program*) #3
 
 declare i32 @clReleaseCommandQueue(%struct._cl_command_queue*) #3
 
-define %rtype @MatrixMulRoot(float* h_A, i32 bytes_A, float* h_B, i32 bytes_B, i32 WA, i32 WB, i32 HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i32, float*, i32, i32, i32)* @matrixMul to i8*), i32 WB, i32 HA)
+; matrixMul: leaf kernel. Each invocation writes one element of C:
+;   C[id1 * n + id0] = sum over i in [0, k) of A[id1 * k + i] * B[i * n + id0]
+; where id0 = get_global_id(0) and id1 = get_global_id(1).
+; %bytes_A, %bytes_B and %m are accepted but never read by the body.
+; Returns an %rtype holding { %C, %bytes_C }.
+; Function Attrs: nounwind uwtable
+define %rtype @matrixMul(float* nocapture %A, i32 %bytes_A, float* nocapture %B, i32 %bytes_B, float* %C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
+entry:
+  %call = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #2 ; id0, used as the column index below
+  %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #2 ; id1, used as the row index below
+  %cmp22 = icmp sgt i32 %k, 0 ; skip the loop entirely when k <= 0 (result stays 0.0)
+  br i1 %cmp22, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %mul = mul nsw i32 %call1, %k ; offset of row id1 in A
+  %0 = sext i32 %mul to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %res.024 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add7, %for.body ]
+  %1 = add nsw i64 %indvars.iv, %0
+  %arrayidx = getelementptr inbounds float* %A, i64 %1
+  %2 = load float* %arrayidx, align 4, !tbaa !0 ; A[id1 * k + i]
+  %3 = trunc i64 %indvars.iv to i32
+  %mul2 = mul nsw i32 %3, %n
+  %add3 = add nsw i32 %mul2, %call
+  %idxprom4 = sext i32 %add3 to i64
+  %arrayidx5 = getelementptr inbounds float* %B, i64 %idxprom4
+  %4 = load float* %arrayidx5, align 4, !tbaa !0 ; B[i * n + id0]
+  %mul6 = fmul float %2, %4
+  %add7 = fadd float %res.024, %mul6
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %k
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add7, %for.body ]
+  %mul8 = mul nsw i32 %call1, %n
+  %add9 = add nsw i32 %mul8, %call
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds float* %C, i64 %idxprom10
+  store float %res.0.lcssa, float* %arrayidx11, align 4, !tbaa !0 ; C[id1 * n + id0] = accumulated sum
+  %.fca.0.insert = insertvalue %rtype undef, float* %C, 0 ; build the result as %rtype so it matches the declared return type
+  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %bytes_C, 1
+  ret %rtype %.fca.1.insert
+}
+
+; MatrixMulRoot: creates one 2D node running @matrixMul with dimensions (%WB, %HA),
+; then binds inputs 0-8 and outputs 0-1 by index. Its own return value is zeroinitializer.
+define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
+  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA)
   ; Bind Inputs
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5)
+  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0) ; h_A
+  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1) ; bytes_A
+  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2) ; h_B
+  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3) ; bytes_B
+  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4) ; h_C
+  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5) ; bytes_C
+  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6) ; WA = HB = k
+  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7) ; WB = WC = n
+  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8) ; HA = HC = m
   ; Bind Outputs
-  call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1)
+  call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0) ; C pointer (field 0 of %rtype)
+  call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1) ; bytes_C (field 1 of %rtype)
   ret %rtype zeroinitializer
 }
 
@@ -607,14 +661,25 @@ randomInit.exit41: ; preds = %for.body.i40
   %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
   %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
+  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
+  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
+  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
+  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
+  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
+
   store float* %0, float** %in.addr.h_A
   store i32 4194304, i32* %in.addr.bytes_A
   store float* %1, float** %in.addr.h_B
   store i32 4194304, i32* %in.addr.bytes_B
+  store float* %2, float** %in.addr.h_C
+  store i32 4194304, i32* %in.addr.bytes_C
+  store i32 1024, i32* %in.addr.WA
+  store i32 1024, i32* %in.addr.WB
+  store i32 1024, i32* %in.addr.HA
 
   ; Change type to i8* and VISC Launch call
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32)* @MatrixMulRoot to i8*), i8* %args)
+  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
   ;tail call void @computeMatrixMul(float* %0, i32 4194304, float* %1, i32 4194304, float* %2, i32 4194304)
 
   ; Wait for result
@@ -627,7 +692,7 @@ randomInit.exit41: ; preds = %for.body.i40
   ;%2 = extractvalue %rtype %out, 0
   %out.bytes_C = extractvalue %rtype %out, 1
 
-  ; -------------------------------- Completed VISC Launch Call Cod --------------------------------
+  ; -------------------------------- Completed VISC Launch Call --------------------------------
 
   br label %for.cond4.preheader.i
 