diff --git a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll index cd698d8e0b97127e5f75b9372017db7b9dd36e28..a767cca31e88c3a2dc4c63f7baf9ec2c94e887f5 100644 --- a/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll +++ b/llvm/test/VISC/parboil/benchmarks/mri-q/src/visc/visc_mri-q.ll @@ -271,7 +271,8 @@ declare noalias i8* @malloc(i64) #1 ; ------------------ VISC Intrinsics ------------------- ; Return Type of VISC computePhiMag and computeQ %rtype = type {} -%struct.arg.phiMag = type <{ float*, i64, float*, i64, float*, i64, i32, %rtype }> +%struct.arg.phiMag = type <{ float*, i64, float*, i64, float*, i64, i32, i64, i64 %rtype }> +%struct.arg.Q = type <{i32, i32, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, %struct.kValues*, i64, i64, i64, %rtype}> ; Function Attrs: nounwind declare i8* @llvm.visc.launch(i8*, i8*) #0 @@ -323,7 +324,7 @@ declare void @llvm.visc.bind.output(i8*, i32, i32) ; ----------------- VISC intrinsics end ------------------ ; Function Attrs: noinline nounwind uwtable -define void @computePhiMag_kernel(float* in %phiR, i64 %bytes_phiR, float* in %phiI, i64 %bytes_phiI, float* out %phiMag, i64 %bytes_phiMag, i32 %numK) #4 { +define %rtype @computePhiMag_kernel(float* in %phiR, i64 %bytes_phiR, float* in %phiI, i64 %bytes_phiI, float* out %phiMag, i64 %bytes_phiMag, i32 %numK) #4 { %1 = alloca float*, align 8 %2 = alloca i64, align 8 %3 = alloca float*, align 8 @@ -345,7 +346,7 @@ define void @computePhiMag_kernel(float* in %phiR, i64 %bytes_phiR, float* in %p ; ------------------------- VISC changes ------------------ ; Replace get_global_id call with call to getNode followed by getNumNodeInstances.x ; Replaced statement -- - ; -- %8 = call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) + ; -- %global_id = call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) %this_node = call i8* @llvm.visc.getNode() %L = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) %LLimit = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) @@ -354,11 +355,11 @@ define void @computePhiMag_kernel(float* in %phiR, i64 %bytes_phiR, float* in %p %G = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node) %tmp = mul i32 %G, %LLimit - %8 = add i32 %tmp, %L + %global_id = add i32 %tmp, %L - ;%8 = call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) + ;%global_id = call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) ; ---------------------- VISC changes End ------------------ - store i32 %8, i32* %indexK, align 4 + store i32 %global_id, i32* %indexK, align 4 %9 = load i32* %indexK, align 4 %10 = load i32* %7, align 4 %11 = icmp slt i32 %9, %10 @@ -392,16 +393,38 @@ define void @computePhiMag_kernel(float* in %phiR, i64 %bytes_phiR, float* in %p br label %34 ; <label>:34 ; preds = %12, %0 - ret void + ret %rtype undef } -define void @computePhiMagi_internal(float* in %phiR, i64 %bytes_phiR, float* in %phiI, i64 %bytes_phiI, float* out %phiMag, i64 %bytes_phiMag, i32 %numK, i64 %DimPhiMagBlock) { - +; ----------------- Compute Phi Mag internal node ---------------- +define %rtype @computePhiMag_internal(float* in %phiR, i64 %bytes_phiR, float* in %phiI, i64 %bytes_phiI, float* out %phiMag, i64 %bytes_phiMag, i32 %numK, i64 %DimPhiMagBlock) { + %kernel = call i8* @llvm.visc.createNode1D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32)* @computePhiMag_kernel to i8*), i64 %DimPhiMagBlock) + ; Bind Inputs + call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); phiR + call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_phiR + call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); phiI + call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_phiI + call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); phiMag + call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_phiMag + call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); numK + ; Bind Outputs + ret %rtype undef } -define void @computePhiMagi_root(float* in %phiR, i64 %bytes_phiR, float* in %phiI, i64 %bytes_phiI, float* out %phiMag, i64 %bytes_phiMag, i32 %numK, i64 %DimPhiMagBlock, i64 %DimPhiMagGrid) { - - +; ----------------- Compute Phi Mag root node ---------------- +define %rtype @computePhiMag_root(float* in %phiR, i64 %bytes_phiR, float* in %phiI, i64 %bytes_phiI, float* out %phiMag, i64 %bytes_phiMag, i32 %numK, i64 %DimPhiMagBlock, i64 %DimPhiMagGrid) { + %kernel = call i8* @llvm.visc.createNode1D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i64)* @computePhiMagi_internal to i8*), i64 %DimPhiMagGrid) + ; Bind Inputs + call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); phiR + call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_phiR + call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); phiI + call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_phiI + call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); phiMag + call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_phiMag + call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); numK + call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); DimPhiMagBlock + ; Bind Outputs + ret %rtype undef } ; Function Attrs: noinline nounwind uwtable @@ -449,12 +472,47 @@ define void @computePhiMag(i32 %numK, float* %phiR, float* %phiI, float* %phiMag %24 = load float** %4, align 8 %25 = load i64* %bytes_phi, align 8 %26 = load i32* %1, align 4 - call void @computePhiMag_kernel(float* %20, i64 %21, float* %22, i64 %23, float* %24, i64 %25, i32 %26) + ; ---------------------------------- Adding VISC Launch Call -------------------------------- + ; Replaced - call void @computePhiMag_kernel(float* %20, i64 %21, float* %22, i64 %23, float* %24, i64 %25, i32 %26) + ; Setting up launch input args + %in.addr = alloca %struct.arg.phiMag + + ; Store arguments + %in.addr.phiR = getelementptr %struct.arg* %in.addr, i32 0, i32 0 + %in.addr.bytes_phiR = getelementptr %struct.arg* %in.addr, i32 0, i32 1 + %in.addr.phiI = getelementptr %struct.arg* %in.addr, i32 0, i32 2 + %in.addr.bytes_phiI = getelementptr %struct.arg* %in.addr, i32 0, i32 3 + %in.addr.phiMag = getelementptr %struct.arg* %in.addr, i32 0, i32 4 + %in.addr.bytes_phiMag = getelementptr %struct.arg* %in.addr, i32 0, i32 5 + %in.addr.numK = getelementptr %struct.arg* %in.addr, i32 0, i32 6 + %in.addr.DimPhiMagBlock = getelementptr %struct.arg* %in.addr, i32 0, i32 7 + %in.addr.DimPhiMagGrid = getelementptr %struct.arg* %in.addr, i32 0, i32 8 + + store float* %20, float** %in.addr.phiR + store i64 %21, i64* %in.addr.bytes_phiR + store float* %22, float** %in.addr.phiI + store i64 %23, i64* %in.addr.bytes_phiI + store float* %24, float** %in.addr.phiMag + store i64 %25, i64* %in.addr.bytes_phiMag + store i32 %26, i32* %in.addr.numK + store i64 256, i64* %in.addr.DimPhiMagBlock + store i64 %16, i64* %in.addr.DimPhiMagGrid + + ; Change type to i8* and VISC launch call to computePhiMag_root + %args = bitcast %struct.arg.phiMag* %in.addr to i8* + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i64, i64)* @computePhiMag_root to i8*), i8* %args) + + ; Wait for result + call void @llvm.visc.wait(i8* %graphID) + + ; Get the result + ; -- Not required as all output is through side effects -- + ; -------------------------------- Completed VISC Launch Call -------------------------------- ret void } ; Function Attrs: noinline nounwind uwtable -define void @computeQ_kernel(i32 %numK, i32 %kGlobalIndex, float* %x, i64 %bytes_x, float* %y, i64 %bytes_y, float* %z, i64 %bytes_z, float* %Qr, i64 %bytes_Qr, float* %Qi, i64 %bytes_Qi, %struct.kValues* %ck, i64 %bytes_ck) #4 { +define %rtype @computeQ_kernel(i32 %numK, i32 %kGlobalIndex, float* %x, i64 %bytes_x, float* %y, i64 %bytes_y, float* %z, i64 %bytes_z, float* %Qr, i64 %bytes_Qr, float* %Qi, i64 %bytes_Qi, %struct.kValues* %ck, i64 %bytes_ck) #4 { %1 = alloca i32, align 4 %2 = alloca i32, align 4 %3 = alloca float*, align 8 @@ -508,9 +566,19 @@ define void @computeQ_kernel(i32 %numK, i32 %kGlobalIndex, float* %x, i64 %bytes br i1 %17, label %18, label %69 ; <label>:18 ; preds = %15 - %19 = call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) + ; ------------------------- VISC changes ------------------ + ; Replace get_group_id / get_local_id calls with calls to Graph query intrinsics + ; Replaced statement -- + ; %19 = call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) + ; %21 = call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) + %this_node = call i8* @llvm.visc.getNode() + %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node) + + %19 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node) + %21 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) + ; ---------------------- VISC changes End ------------------ + %20 = mul nsw i32 %19, 256 - %21 = call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) %22 = mul nsw i32 4, %21 %23 = add nsw i32 %20, %22 %24 = load i32* %tx, align 4 @@ -702,9 +770,21 @@ define void @computeQ_kernel(i32 %numK, i32 %kGlobalIndex, float* %x, i64 %bytes br i1 %165, label %166, label %193 ; <label>:166 ; preds = %163 - %167 = call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) + ; ------------------------- VISC changes ------------------ + ; Replace get_group_id / get_local_id calls with calls to Graph query intrinsics + ; Replaced statement -- + ; %167 = call i32 (i32, ...)* bitcast (i32 (...)* @get_group_id to i32 (i32, ...)*)(i32 0) + ; %169 = call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) + %this_node = call i8* @llvm.visc.getNode() + %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node) + + %167 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node) + %169 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) + ; ---------------------- VISC changes End ------------------ + + + %168 = mul nsw i32 %167, 256 - %169 = call i32 (i32, ...)* bitcast (i32 (...)* @get_local_id to i32 (i32, ...)*)(i32 0) %170 = mul nsw i32 4, %169 %171 = add nsw i32 %168, %170 %172 = load i32* %tx2, align 4 @@ -737,19 +817,62 @@ define void @computeQ_kernel(i32 %numK, i32 %kGlobalIndex, float* %x, i64 %bytes br label %163 ; <label>:193 ; preds = %163 - ret void + ret %rtype undef } -declare i32 @get_group_id(...) #1 - -declare i32 @get_local_id(...) #1 - ; Function Attrs: nounwind readnone declare double @cos(double) #5 ; Function Attrs: nounwind readnone declare double @sin(double) #5 + +; ----------------- Compute Q internal node ---------------- +define %rtype @computeQ_internal(i32 %numK, i32 %kGlobalIndex, float* %x, i64 %bytes_x, float* %y, i64 %bytes_y, float* %z, i64 %bytes_z, float* %Qr, i64 %bytes_Qr, float* %Qi, i64 %bytes_Qi, %struct.kValues* %ck, i64 %bytes_ck, i64 %DimQBlock) #4 { + %kernel = call i8* @llvm.visc.createNode1D(i8* bitcast (%rtype (i32, i32, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, %struct.kValues*, i64)* @computeQ_kernel to i8*), i64 %DimQBlock) + ; Bind Inputs + call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); numK + call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); kGlobalIndex + call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); x + call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_x + call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); y + call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_y + call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); z + call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); bytes_z + call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); Qr + call void @llvm.visc.bind.input(i8* %kernel, i32 9, i32 9); bytes_Qr + call void @llvm.visc.bind.input(i8* %kernel, i32 10, i32 10); Qi + call void @llvm.visc.bind.input(i8* %kernel, i32 11, i32 11); bytes_i + call void @llvm.visc.bind.input(i8* %kernel, i32 12, i32 12); ck + call void @llvm.visc.bind.input(i8* %kernel, i32 13, i32 13); bytes_ck + ; Bind Outputs + ret %rtype undef + +} + +; ----------------- Compute Q root node ---------------- +define %rtype @computeQ_root(i32 %numK, i32 %kGlobalIndex, float* %x, i64 %bytes_x, float* %y, i64 %bytes_y, float* %z, i64 %bytes_z, float* %Qr, i64 %bytes_Qr, float* %Qi, i64 %bytes_Qi, %struct.kValues* %ck, i64 %bytes_ck, i64 %DimQBlock, i64 %DimQGrid) #4 { + %kernel = call i8* @llvm.visc.createNode1D(i8* bitcast (%rtype (i32, i32, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, %struct.kValues*, i64, i64)* @computeQ_internal to i8*), i64 %DimQGrid) + ; Bind Inputs + call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); numK + call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); kGlobalIndex + call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); x + call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_x + call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); y + call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_y + call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); z + call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); bytes_z + call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); Qr + call void @llvm.visc.bind.input(i8* %kernel, i32 9, i32 9); bytes_Qr + call void @llvm.visc.bind.input(i8* %kernel, i32 10, i32 10); Qi + call void @llvm.visc.bind.input(i8* %kernel, i32 11, i32 11); bytes_i + call void @llvm.visc.bind.input(i8* %kernel, i32 12, i32 12); ck + call void @llvm.visc.bind.input(i8* %kernel, i32 13, i32 13); bytes_ck + call void @llvm.visc.bind.input(i8* %kernel, i32 14, i32 14); DimQBlock + ; Bind Outputs + ret %rtype undef +} + ; Function Attrs: noinline nounwind uwtable define void @computeQ(i32 %numK, i32 %numX, float* %x, float* %y, float* %z, %struct.kValues* %kVals, float* %Qr, float* %Qi) #4 { %1 = alloca i32, align 4