diff --git a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl new file mode 100644 index 0000000000000000000000000000000000000000..3f34ea5ef25943ac7eeb18eead429f70ec4cf807 --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl @@ -0,0 +1,176 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifndef LBM_KERNEL_CL +#define LBM_KERNEL_CL + +#include "layout_config.h" +#include "lbm_macros.h" +/******************************************************************************/ + +__kernel void performStreamCollide_kernel( __global float* srcGrid, __global float* dstGrid ) +{ + srcGrid += MARGIN; + dstGrid += MARGIN; + + + //Using some predefined macros here. Consider this the declaration + // and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z + + SWEEP_VAR + SWEEP_X = get_local_id(0); + SWEEP_Y = get_group_id(0); + SWEEP_Z = get_group_id(1); + + float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB; + float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ; + float tempSB, tempET, tempEB, tempWT, tempWB ; + + //Load all of the input fields + //This is a gather operation of the SCATTER preprocessor variable + // is undefined in layout_config.h, or a "local" read otherwise + tempC = SRC_C(srcGrid); + + tempN = SRC_N(srcGrid); + tempS = SRC_S(srcGrid); + tempE = SRC_E(srcGrid); + tempW = SRC_W(srcGrid); + tempT = SRC_T(srcGrid); + tempB = SRC_B(srcGrid); + + tempNE = SRC_NE(srcGrid); + tempNW = SRC_NW(srcGrid); + tempSE = SRC_SE(srcGrid); + tempSW = SRC_SW(srcGrid); + tempNT = SRC_NT(srcGrid); + tempNB = SRC_NB(srcGrid); + tempST = SRC_ST(srcGrid); + tempSB = SRC_SB(srcGrid); + tempET = SRC_ET(srcGrid); + tempEB = SRC_EB(srcGrid); + tempWT = SRC_WT(srcGrid); + tempWB = SRC_WB(srcGrid); + + //Test whether the cell is fluid or obstacle + if(as_uint(LOCAL(srcGrid,FLAGS)) & (OBSTACLE)) { + + //Swizzle the inputs: reflect any fluid coming into this cell + // back to where it came from + temp_swp = tempN ; tempN = tempS ; tempS = temp_swp ; + temp_swp = tempE ; tempE = tempW ; tempW = temp_swp; + temp_swp = tempT ; tempT = tempB ; tempB = temp_swp; + temp_swp = tempNE; tempNE = tempSW ; tempSW = temp_swp; + temp_swp = tempNW; tempNW = tempSE ; tempSE = temp_swp; + temp_swp = tempNT ; tempNT = tempSB ; tempSB = temp_swp; + temp_swp = tempNB ; tempNB = tempST ; tempST = temp_swp; + temp_swp = tempET ; tempET= tempWB ; tempWB = temp_swp; + temp_swp = tempEB ; tempEB = tempWT ; tempWT = temp_swp; + } + else { + + //The math meat of LBM: ignore for optimization + float ux, uy, uz, rho, u2; + float temp1, temp2, temp_base; + rho = tempC + tempN + + tempS + tempE + + tempW + tempT + + tempB + tempNE + + tempNW + tempSE + + tempSW + tempNT + + tempNB + tempST + + tempSB + tempET + + tempEB + tempWT + + tempWB; + + ux = + tempE - tempW + + tempNE - tempNW + + tempSE - tempSW + + tempET + tempEB + - tempWT - tempWB; + + uy = + tempN - tempS + + tempNE + tempNW + - tempSE - tempSW + + tempNT + tempNB + - tempST - tempSB; + + uz = + tempT - tempB + + tempNT - tempNB + + tempST - tempSB + + tempET - tempEB + + tempWT - tempWB; + + ux /= rho; + uy /= rho; + uz /= rho; + + if(as_uint(LOCAL(srcGrid,FLAGS)) & (ACCEL)) { + + ux = 0.005f; + uy = 0.002f; + uz = 0.000f; + } + + u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f; + temp_base = OMEGA*rho; + temp1 = DFL1*temp_base; + + //Put the output values for this cell in the shared memory + temp_base = OMEGA*rho; + temp1 = DFL1*temp_base; + temp2 = 1.0f-OMEGA; + tempC = temp2*tempC + temp1*( - u2); + temp1 = DFL2*temp_base; + tempN = temp2*tempN + temp1*( uy*(4.5f*uy + 3.0f) - u2); + tempS = temp2*tempS + temp1*( uy*(4.5f*uy - 3.0f) - u2); + tempT = temp2*tempT + temp1*( uz*(4.5f*uz + 3.0f) - u2); + tempB = temp2*tempB + temp1*( uz*(4.5f*uz - 3.0f) - u2); + tempE = temp2*tempE + temp1*( ux*(4.5f*ux + 3.0f) - u2); + tempW = temp2*tempW + temp1*( ux*(4.5f*ux - 3.0f) - u2); + temp1 = DFL3*temp_base; + tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2); + tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2); + tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2); + tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2); + tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2); + tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2); + tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2); + tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2); + tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2); + tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2); + tempWT = temp2*tempWT + temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2); + tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2); + } + + //Write the results computed above + //This is a scatter operation of the SCATTER preprocessor variable + // is defined in layout_config.h, or a "local" write otherwise + DST_C ( dstGrid ) = tempC; + + DST_N ( dstGrid ) = tempN; + DST_S ( dstGrid ) = tempS; + DST_E ( dstGrid ) = tempE; + DST_W ( dstGrid ) = tempW; + DST_T ( dstGrid ) = tempT; + DST_B ( dstGrid ) = tempB; + + DST_NE( dstGrid ) = tempNE; + DST_NW( dstGrid ) = tempNW; + DST_SE( dstGrid ) = tempSE; + DST_SW( dstGrid ) = tempSW; + DST_NT( dstGrid ) = tempNT; + DST_NB( dstGrid ) = tempNB; + DST_ST( dstGrid ) = tempST; + DST_SB( dstGrid ) = tempSB; + DST_ET( dstGrid ) = tempET; + DST_EB( dstGrid ) = tempEB; + DST_WT( dstGrid ) = tempWT; + DST_WB( dstGrid ) = tempWB; +} + +#endif // LBM_KERNEL_CL diff --git a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.nvptx.s b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.nvptx.s new file mode 100644 index 0000000000000000000000000000000000000000..7e130799aa9f47a3866d08596d832f5954c658ad --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.nvptx.s @@ -0,0 +1,261 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 3.1 +.target sm_20, texmode_independent +.address_size 32 + + // .globl performStreamCollide_kernel + +.entry performStreamCollide_kernel( + .param .u32 .ptr .global .align 4 performStreamCollide_kernel_param_0, + .param .u32 .ptr .global .align 4 performStreamCollide_kernel_param_1 +) +{ + .reg .pred %p<3>; + .reg .f32 %f<197>; + .reg .s32 %r<38>; + + ld.param.u32 %r5, [performStreamCollide_kernel_param_0]; + mov.u32 %r6, %tid.x; + ld.param.u32 %r4, [performStreamCollide_kernel_param_1]; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %ctaid.y; + shl.b32 %r9, %r7, 7; + add.s32 %r10, %r9, %r6; + mad.lo.s32 %r1, %r8, 15360, %r10; + add.s32 %r2, %r1, 30720; + shl.b32 %r11, %r2, 2; + add.s32 %r12, %r5, %r11; + ld.global.f32 %f1, [%r12]; + shl.b32 %r13, %r1, 2; + add.s32 %r14, %r13, %r5; + ld.global.f32 %f2, [%r14+9584128]; + ld.global.f32 %f3, [%r14+19046912]; + add.s32 %r15, %r10, -1; + mad.lo.s32 %r16, %r8, 15360, %r15; + shl.b32 %r17, %r16, 2; + add.s32 %r18, %r17, %r5; + ld.global.f32 %f4, [%r18+28508160]; + add.s32 %r19, %r10, 1; + mad.lo.s32 %r20, %r8, 15360, %r19; + shl.b32 %r21, %r20, 2; + add.s32 %r22, %r21, %r5; + ld.global.f32 %f5, [%r22+37969920]; + ld.global.f32 %f6, [%r14+47370240]; + mad.lo.s32 %r23, %r8, 15360, 15360; + add.s32 %r24, %r10, %r23; + shl.b32 %r25, %r24, 2; + add.s32 %r26, %r25, %r5; + ld.global.f32 %f7, [%r26+56893440]; + ld.global.f32 %f8, [%r14+66354684]; + ld.global.f32 %f9, [%r14+75816452]; + ld.global.f32 %f10, [%r14+85279228]; + ld.global.f32 %f11, [%r14+94740996]; + ld.global.f32 %f12, [%r14+104140288]; + ld.global.f32 %f13, [%r14+113724928]; + ld.global.f32 %f14, [%r14+123064832]; + ld.global.f32 %f15, [%r14+132649472]; + ld.global.f32 %f16, [%r14+141987836]; + add.s32 %r27, %r15, %r23; + shl.b32 %r28, %r27, 2; + add.s32 %r29, %r28, %r5; + ld.global.f32 %f17, [%r29+151511040]; + ld.global.f32 %f18, [%r14+160911364]; + add.s32 %r30, %r19, %r23; + shl.b32 %r31, %r30, 2; + add.s32 %r32, %r31, %r5; + ld.global.f32 %f19, [%r32+170434560]; + ld.global.u32 %r3, [%r14+179896320]; + { + .reg .b32 temp; + and.b32 temp, %r3, 1; + setp.b32.eq %p1, temp, 1; + } + @%p1 bra BB0_2; + add.f32 %f58, %f1, %f2; + add.f32 %f59, %f58, %f3; + add.f32 %f60, %f59, %f4; + add.f32 %f61, %f60, %f5; + add.f32 %f62, %f61, %f6; + add.f32 %f63, %f62, %f7; + add.f32 %f64, %f63, %f8; + add.f32 %f65, %f64, %f9; + add.f32 %f66, %f65, %f10; + add.f32 %f67, %f66, %f11; + add.f32 %f68, %f67, %f12; + add.f32 %f69, %f68, %f13; + add.f32 %f70, %f69, %f14; + add.f32 %f71, %f70, %f15; + add.f32 %f72, %f71, %f16; + add.f32 %f73, %f72, %f17; + add.f32 %f74, %f73, %f18; + add.f32 %f75, %f74, %f19; + sub.f32 %f76, %f4, %f5; + add.f32 %f77, %f76, %f8; + sub.f32 %f78, %f77, %f9; + add.f32 %f79, %f78, %f10; + sub.f32 %f80, %f79, %f11; + add.f32 %f81, %f80, %f16; + add.f32 %f82, %f81, %f17; + sub.f32 %f83, %f82, %f18; + sub.f32 %f84, %f83, %f19; + sub.f32 %f85, %f2, %f3; + add.f32 %f86, %f85, %f8; + add.f32 %f87, %f86, %f9; + sub.f32 %f88, %f87, %f10; + sub.f32 %f89, %f88, %f11; + add.f32 %f90, %f89, %f12; + add.f32 %f91, %f90, %f13; + sub.f32 %f92, %f91, %f14; + sub.f32 %f93, %f92, %f15; + sub.f32 %f94, %f6, %f7; + add.f32 %f95, %f94, %f12; + sub.f32 %f96, %f95, %f13; + add.f32 %f97, %f96, %f14; + sub.f32 %f98, %f97, %f15; + add.f32 %f99, %f98, %f16; + sub.f32 %f100, %f99, %f17; + add.f32 %f101, %f100, %f18; + sub.f32 %f102, %f101, %f19; + div.rn.f32 %f103, %f84, %f75; + div.rn.f32 %f104, %f93, %f75; + div.rn.f32 %f105, %f102, %f75; + and.b32 %r33, %r3, 2; + setp.eq.s32 %p2, %r33, 0; + selp.f32 %f106, %f103, 0f3BA3D70A, %p2; + selp.f32 %f107, %f104, 0f3B03126F, %p2; + selp.f32 %f108, %f105, 0f00000000, %p2; + mul.f32 %f109, %f107, %f107; + fma.rn.f32 %f110, %f106, %f106, %f109; + fma.rn.f32 %f111, %f108, %f108, %f110; + fma.rn.f32 %f112, %f111, 0f3FC00000, 0fBF800000; + mul.f32 %f113, %f75, 0f3FF9999A; + mul.f32 %f114, %f113, 0fBEAAAAAB; + mul.f32 %f115, %f114, %f112; + fma.rn.f32 %f20, %f1, 0fBF733334, %f115; + mul.f32 %f116, %f113, 0f3D638E39; + fma.rn.f32 %f117, %f107, 0f40900000, 0f40400000; + neg.f32 %f118, %f112; + fma.rn.f32 %f119, %f107, %f117, %f118; + mul.f32 %f120, %f116, %f119; + fma.rn.f32 %f21, %f2, 0fBF733334, %f120; + fma.rn.f32 %f121, %f107, 0f40900000, 0fC0400000; + fma.rn.f32 %f122, %f107, %f121, %f118; + mul.f32 %f123, %f116, %f122; + fma.rn.f32 %f2, %f3, 0fBF733334, %f123; + fma.rn.f32 %f124, %f108, 0f40900000, 0f40400000; + fma.rn.f32 %f125, %f108, %f124, %f118; + mul.f32 %f126, %f116, %f125; + fma.rn.f32 %f23, %f6, 0fBF733334, %f126; + fma.rn.f32 %f127, %f108, 0f40900000, 0fC0400000; + fma.rn.f32 %f128, %f108, %f127, %f118; + mul.f32 %f129, %f116, %f128; + fma.rn.f32 %f6, %f7, 0fBF733334, %f129; + fma.rn.f32 %f130, %f106, 0f40900000, 0f40400000; + fma.rn.f32 %f131, %f106, %f130, %f118; + mul.f32 %f132, %f116, %f131; + fma.rn.f32 %f25, %f4, 0fBF733334, %f132; + fma.rn.f32 %f133, %f106, 0f40900000, 0fC0400000; + fma.rn.f32 %f134, %f106, %f133, %f118; + mul.f32 %f135, %f116, %f134; + fma.rn.f32 %f4, %f5, 0fBF733334, %f135; + mul.f32 %f136, %f113, 0f3CE38E39; + add.f32 %f137, %f107, %f108; + fma.rn.f32 %f138, %f137, 0f40900000, 0f40400000; + fma.rn.f32 %f139, %f137, %f138, %f118; + mul.f32 %f140, %f136, %f139; + fma.rn.f32 %f27, %f12, 0fBF733334, %f140; + sub.f32 %f141, %f107, %f108; + fma.rn.f32 %f142, %f141, 0f40900000, 0f40400000; + fma.rn.f32 %f143, %f141, %f142, %f118; + mul.f32 %f144, %f136, %f143; + fma.rn.f32 %f28, %f13, 0fBF733334, %f144; + neg.f32 %f145, %f107; + sub.f32 %f146, %f108, %f107; + fma.rn.f32 %f147, %f146, 0f40900000, 0f40400000; + fma.rn.f32 %f148, %f146, %f147, %f118; + mul.f32 %f149, %f136, %f148; + fma.rn.f32 %f13, %f14, 0fBF733334, %f149; + sub.f32 %f150, %f145, %f108; + fma.rn.f32 %f151, %f150, 0f40900000, 0f40400000; + fma.rn.f32 %f152, %f150, %f151, %f118; + mul.f32 %f153, %f136, %f152; + fma.rn.f32 %f12, %f15, 0fBF733334, %f153; + add.f32 %f154, %f106, %f107; + fma.rn.f32 %f155, %f154, 0f40900000, 0f40400000; + fma.rn.f32 %f156, %f154, %f155, %f118; + mul.f32 %f157, %f136, %f156; + fma.rn.f32 %f31, %f8, 0fBF733334, %f157; + sub.f32 %f158, %f106, %f107; + fma.rn.f32 %f159, %f158, 0f40900000, 0f40400000; + fma.rn.f32 %f160, %f158, %f159, %f118; + mul.f32 %f161, %f136, %f160; + fma.rn.f32 %f32, %f10, 0fBF733334, %f161; + add.f32 %f162, %f106, %f108; + fma.rn.f32 %f163, %f162, 0f40900000, 0f40400000; + fma.rn.f32 %f164, %f162, %f163, %f118; + mul.f32 %f165, %f136, %f164; + fma.rn.f32 %f33, %f16, 0fBF733334, %f165; + sub.f32 %f166, %f106, %f108; + fma.rn.f32 %f167, %f166, 0f40900000, 0f40400000; + fma.rn.f32 %f168, %f166, %f167, %f118; + mul.f32 %f169, %f136, %f168; + fma.rn.f32 %f34, %f17, 0fBF733334, %f169; + neg.f32 %f170, %f106; + sub.f32 %f171, %f107, %f106; + fma.rn.f32 %f172, %f171, 0f40900000, 0f40400000; + fma.rn.f32 %f173, %f171, %f172, %f118; + mul.f32 %f174, %f136, %f173; + fma.rn.f32 %f10, %f9, 0fBF733334, %f174; + sub.f32 %f175, %f170, %f107; + fma.rn.f32 %f176, %f175, 0f40900000, 0f40400000; + fma.rn.f32 %f177, %f175, %f176, %f118; + mul.f32 %f178, %f136, %f177; + fma.rn.f32 %f8, %f11, 0fBF733334, %f178; + sub.f32 %f179, %f108, %f106; + fma.rn.f32 %f180, %f179, 0f40900000, 0f40400000; + fma.rn.f32 %f181, %f179, %f180, %f118; + mul.f32 %f182, %f136, %f181; + fma.rn.f32 %f17, %f18, 0fBF733334, %f182; + sub.f32 %f183, %f170, %f108; + fma.rn.f32 %f184, %f183, 0f40900000, 0f40400000; + fma.rn.f32 %f185, %f183, %f184, %f118; + mul.f32 %f186, %f136, %f185; + fma.rn.f32 %f16, %f19, 0fBF733334, %f186; + mov.f32 %f5, %f25; + mov.f32 %f7, %f23; + mov.f32 %f11, %f31; + mov.f32 %f9, %f32; + mov.f32 %f15, %f27; + mov.f32 %f14, %f28; + mov.f32 %f19, %f33; + mov.f32 %f18, %f34; + mov.f32 %f3, %f21; + mov.f32 %f1, %f20; +BB0_2: + add.s32 %r35, %r4, %r11; + st.global.f32 [%r35], %f1; + add.s32 %r37, %r13, %r4; + st.global.f32 [%r37+9584640], %f3; + st.global.f32 [%r37+19046400], %f2; + st.global.f32 [%r37+28508160], %f5; + st.global.f32 [%r37+37969920], %f4; + st.global.f32 [%r37+47431680], %f7; + st.global.f32 [%r37+56893440], %f6; + st.global.f32 [%r37+66355200], %f11; + st.global.f32 [%r37+75816960], %f10; + st.global.f32 [%r37+85278720], %f9; + st.global.f32 [%r37+94740480], %f8; + st.global.f32 [%r37+104202240], %f15; + st.global.f32 [%r37+113664000], %f14; + st.global.f32 [%r37+123125760], %f13; + st.global.f32 [%r37+132587520], %f12; + st.global.f32 [%r37+142049280], %f19; + st.global.f32 [%r37+151511040], %f18; + st.global.f32 [%r37+160972800], %f17; + st.global.f32 [%r37+170434560], %f16; + ret; +} + diff --git a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c index 433a17c29949213d0a47c3e725ab79b4dd6392e1..128acb0298e093c7315366ff231a95e47de95154 100644 --- a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c +++ b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c @@ -28,205 +28,207 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid; struct pb_TimerSet timers; int main( int nArgs, char* arg[] ) { - MAIN_Param param; - int t; - - OpenCL_Param prm; - - pb_InitializeTimerSet(&timers); - struct pb_Parameters* params; - params = pb_ReadParameters(&nArgs, arg); - - - static LBM_GridPtr TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - MAIN_parseCommandLine( nArgs, arg, ¶m, params ); - MAIN_printInfo( ¶m ); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - OpenCL_initialize(&prm); - MAIN_initialize( ¶m, &prm ); - - for( t = 1; t <= param.nTimeSteps; t++ ) { - pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid ); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid ); - - if( (t & 63) == 0 ) { - printf( "timestep: %i\n", t ); + MAIN_Param param; + int t; + + OpenCL_Param prm; + + pb_InitializeTimerSet(&timers); + struct pb_Parameters* params; + params = pb_ReadParameters(&nArgs, arg); + + + static LBM_GridPtr TEMP_srcGrid; + //Setup TEMP datastructures + LBM_allocateGrid( (float**) &TEMP_srcGrid ); + MAIN_parseCommandLine( nArgs, arg, ¶m, params ); + MAIN_printInfo( ¶m ); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_initialize(&prm); + MAIN_initialize( ¶m, &prm ); + + for( t = 1; t <= param.nTimeSteps; t++ ) { + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid ); + + if( (t & 63) == 0 ) { + printf( "timestep: %i\n", t ); #if 0 - CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); - LBM_showGridStatistics( *TEMP_srcGrid ); + CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid); + LBM_showGridStatistics( *TEMP_srcGrid ); #endif - } - } - - MAIN_finalize( ¶m, &prm ); + } + } - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + MAIN_finalize( ¶m, &prm ); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_freeGrid( (float**) &TEMP_srcGrid ); - pb_SwitchToTimer(&timers, pb_TimerID_NONE); - pb_PrintTimerSet(&timers); - pb_FreeParameters(params); - return 0; + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); + return 0; } /*############################################################################*/ void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) { - struct stat fileStat; - - if( nArgs < 2 ) { - printf( "syntax: lbm <time steps>\n" ); - exit( 1 ); - } - - param->nTimeSteps = atoi( arg[1] ); - - if( params->inpFiles[0] != NULL ) { - param->obstacleFilename = params->inpFiles[0]; - - if( stat( param->obstacleFilename, &fileStat ) != 0 ) { - printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", - param->obstacleFilename ); - exit( 1 ); - } - if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { - printf( "MAIN_parseCommandLine:\n" - "\tsize of file '%s' is %i bytes\n" - "\texpected size is %i bytes\n", - param->obstacleFilename, (int) fileStat.st_size, - SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); - exit( 1 ); - } - } - else param->obstacleFilename = NULL; - - param->resultFilename = params->outFile; + struct stat fileStat; + + if( nArgs < 2 ) { + printf( "syntax: lbm <time steps>\n" ); + exit( 1 ); + } + + param->nTimeSteps = atoi( arg[1] ); + + if( params->inpFiles[0] != NULL ) { + param->obstacleFilename = params->inpFiles[0]; + + if( stat( param->obstacleFilename, &fileStat ) != 0 ) { + printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n", + param->obstacleFilename ); + exit( 1 ); + } + if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) { + printf( "MAIN_parseCommandLine:\n" + "\tsize of file '%s' is %i bytes\n" + "\texpected size is %i bytes\n", + param->obstacleFilename, (int) fileStat.st_size, + SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ); + exit( 1 ); + } + } + else param->obstacleFilename = NULL; + + param->resultFilename = params->outFile; } /*############################################################################*/ void MAIN_printInfo( const MAIN_Param* param ) { - printf( "MAIN_printInfo:\n" - "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" - "\tnTimeSteps : %i\n" - "\tresult file : %s\n" - "\taction : %s\n" - "\tsimulation type: %s\n" - "\tobstacle file : %s\n\n", - SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, - param->nTimeSteps, param->resultFilename, - "store", "lid-driven cavity", - (param->obstacleFilename == NULL) ? "<none>" : - param->obstacleFilename ); + printf( "MAIN_printInfo:\n" + "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n" + "\tnTimeSteps : %i\n" + "\tresult file : %s\n" + "\taction : %s\n" + "\tsimulation type: %s\n" + "\tobstacle file : %s\n\n", + SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z, + param->nTimeSteps, param->resultFilename, + "store", "lid-driven cavity", + (param->obstacleFilename == NULL) ? "<none>" : + param->obstacleFilename ); } /*############################################################################*/ void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) { - static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); - LBM_allocateGrid( (float**) &TEMP_dstGrid ); - LBM_initializeGrid( TEMP_srcGrid ); - LBM_initializeGrid( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_IO); - if( param->obstacleFilename != NULL ) { - LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); - LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); - } - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); - LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - //Setup DEVICE datastructures - OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid ); - OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid ); - - //Initialize DEVICE datastructures - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid ); - OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid ); - - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); - - LBM_freeGrid( (float**) &TEMP_srcGrid ); - LBM_freeGrid( (float**) &TEMP_dstGrid ); + static LBM_Grid TEMP_srcGrid, TEMP_dstGrid; + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + //Setup TEMP datastructures + LBM_allocateGrid( (float**) &TEMP_srcGrid ); + LBM_allocateGrid( (float**) &TEMP_dstGrid ); + LBM_initializeGrid( TEMP_srcGrid ); + LBM_initializeGrid( TEMP_dstGrid ); + + pb_SwitchToTimer(&timers, pb_TimerID_IO); + if( param->obstacleFilename != NULL ) { + LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename ); + LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename ); + } + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_initializeSpecialCellsForLDC( TEMP_srcGrid ); + LBM_initializeSpecialCellsForLDC( TEMP_dstGrid ); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + //Setup DEVICE datastructures + OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid ); + OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid ); + + //Initialize DEVICE datastructures + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid ); + OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid ); + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics( TEMP_srcGrid ); + + LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid( (float**) &TEMP_dstGrid ); } /*############################################################################*/ void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) { - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_Grid TEMP_srcGrid; + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_Grid TEMP_srcGrid; - //Setup TEMP datastructures - LBM_allocateGrid( (float**) &TEMP_srcGrid ); + //Setup TEMP datastructures + LBM_allocateGrid( (float**) &TEMP_srcGrid ); - pb_SwitchToTimer(&timers, pb_TimerID_COPY); - OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); - pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - LBM_showGridStatistics( TEMP_srcGrid ); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + LBM_showGridStatistics( TEMP_srcGrid ); - LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); + LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE ); - LBM_freeGrid( (float**) &TEMP_srcGrid ); + LBM_freeGrid( (float**) &TEMP_srcGrid ); - pb_SwitchToTimer(&timers, visc_TimerID_SETUP); - OpenCL_LBM_freeGrid( OpenCL_srcGrid ); - OpenCL_LBM_freeGrid( OpenCL_dstGrid ); + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + OpenCL_LBM_freeGrid( OpenCL_srcGrid ); + OpenCL_LBM_freeGrid( OpenCL_dstGrid ); - clReleaseProgram(prm->clProgram); - clReleaseKernel(prm->clKernel); - clReleaseCommandQueue(prm->clCommandQueue); - clReleaseContext(prm->clContext); + clReleaseProgram(prm->clProgram); + clReleaseKernel(prm->clKernel); + clReleaseCommandQueue(prm->clCommandQueue); + clReleaseContext(prm->clContext); } void OpenCL_initialize(OpenCL_Param* prm) { - cl_int clStatus; - - clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL); - CHECK_ERROR("clGetPlatformIDs") + cl_int clStatus; + + clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL); + CHECK_ERROR("clGetPlatformIDs") + + prm->clCps[0] = CL_CONTEXT_PLATFORM; + prm->clCps[1] = (cl_context_properties)(prm->clPlatform); + prm->clCps[2] = 0; + + clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL); + CHECK_ERROR("clGetDeviceIDs") + + prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); + CHECK_ERROR("clCreateContextFromType") - prm->clCps[0] = CL_CONTEXT_PLATFORM; - prm->clCps[1] = (cl_context_properties)(prm->clPlatform); - prm->clCps[2] = 0; + prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + CHECK_ERROR("clCreateCommandQueue") - clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL); - CHECK_ERROR("clGetDeviceIDs") + pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); - prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus); - CHECK_ERROR("clCreateContextFromType") + //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; + //prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus); + //CHECK_ERROR("clCreateProgramWithSource") - prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); - CHECK_ERROR("clCreateCommandQueue") + //char clOptions[100]; + //sprintf(clOptions,"-I src/opencl_nvidia"); - pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue)); + //clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL); + //CHECK_ERROR("clBuildProgram") - const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")}; - prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus); - CHECK_ERROR("clCreateProgramWithSource") + //prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus); + //CHECK_ERROR("clCreateKernel") - char clOptions[100]; - sprintf(clOptions,"-I src/opencl_nvidia"); - - clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL); - CHECK_ERROR("clBuildProgram") - - prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus); - CHECK_ERROR("clCreateKernel") + //free((void*)clSource[0]); - free((void*)clSource[0]); + pb_CreateAndBuildKernelFromBinary("src/opencl_nvidia/kernel_offline.nvptx.s", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel); }