diff --git a/.gitignore b/.gitignore index 516108ddfe2c09b0e5d556e6b4ca96c8a9b3a72d..6298d7bbc1b72492a0d1da0892a78ff2c3f13b85 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -/target +**/target *.dot !paper_resources/*.dot *.bc @@ -14,3 +14,4 @@ .vscode *_env *.txt +*ncu-rep \ No newline at end of file diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index c9720273c03243d4874b27fc6c74f20fd21a6c33..07dd3ebfc551b84b20cea797421bf5e6846f30c8 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups; // because Fork basic block's init section already does gating write!( w, - "{}{} = (threadIdx.x % {}) / {};\n", + "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n", tabs, define_variable, use_thread_quota.unwrap(), - use_thread_quota.unwrap() / parallel_factor.unwrap() + use_thread_quota.unwrap() / parallel_factor.unwrap(), + divide, + modulo, )?; } } diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index e6db0345def31324243cdee2bdcb6b5cca5d9a7b..8bd3f735fa60cf4e0404edde28eb858e0ed7e581 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -1169,6 +1169,7 @@ pub fn fork_dim_merge( op: BinaryOperator::Rem, }); edit.sub_edit(tid, rem); + edit.sub_edit(tid, outer_tid); edit = edit.replace_all_uses(tid, rem)?; } else if tid_dim == inner_idx { let outer_tid = Node::ThreadID { @@ -1185,6 +1186,7 @@ pub fn fork_dim_merge( op: BinaryOperator::Div, }); edit.sub_edit(tid, div); + edit.sub_edit(tid, outer_tid); edit = edit.replace_all_uses(tid, div)?; } } @@ -1479,7 +1481,12 @@ fn fork_fusion( } // Perform the fusion. + let bottom_tids: Vec<_> = editor + .get_users(bottom_fork) + .filter(|id| nodes[id.idx()].is_thread_id()) + .collect(); editor.edit(|mut edit| { + edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?; if bottom_join_pred != bottom_fork { // If there is control flow in the bottom fork-join, stitch it into // the top fork-join. diff --git a/hercules_opt/src/interprocedural_sroa.rs b/hercules_opt/src/interprocedural_sroa.rs index c7ea58360bf09b2a8e221a19f66a9bfe81126516..a01b0f5553ef59aaf45fe6ccc2c3ee7bdc881a0b 100644 --- a/hercules_opt/src/interprocedural_sroa.rs +++ b/hercules_opt/src/interprocedural_sroa.rs @@ -85,7 +85,6 @@ pub fn interprocedural_sroa( param_nodes[idx].push(id); } } - println!("{}", editor.func().name); let success = editor.edit(|mut edit| { for (idx, ids) in param_nodes.into_iter().enumerate() { let new_indices = &old_param_type_map[idx]; diff --git a/hercules_opt/src/sroa.rs b/hercules_opt/src/sroa.rs index e658ff8818246b31b4dbf3c9dd7ad0372c59f3c6..2718f99d82321d6ddce59320ffc5ea2b9bbdc2cf 100644 --- a/hercules_opt/src/sroa.rs +++ b/hercules_opt/src/sroa.rs @@ -447,7 +447,7 @@ pub fn sroa( field_map.insert(node, generate_reads(editor, types[&node], node)); } Node::Constant { id } => { - field_map.insert(node, generate_constant_fields(editor, id)); + field_map.insert(node, generate_constant_fields(editor, id, node)); to_delete.push(node); } _ => { @@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID // Given a constant cnst adds node to the function which are the constant values of each field and // returns a list of pairs of indices and the node that holds that index -fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> { +fn generate_constant_fields( + editor: &mut FunctionEditor, + cnst: ConstantID, + old_node: NodeID, +) -> IndexTree<NodeID> { let cs: Option<Vec<ConstantID>> = if let Some(cs) = editor.get_constant(cnst).try_product_fields() { Some(cs.into()) @@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In if let Some(cs) = cs { let mut fields = vec![]; for c in cs { - fields.push(generate_constant_fields(editor, c)); + fields.push(generate_constant_fields(editor, c, old_node)); } IndexTree::Node(fields) } else { let mut node = None; editor.edit(|mut edit| { node = Some(edit.add_node(Node::Constant { id: cnst })); + edit.sub_edit(old_node, node.unwrap()); Ok(edit) }); IndexTree::Leaf(node.expect("Add node cannot fail")) diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn index 4d02b2cdb5f843774ebae9b873388e03c82071b0..931e78f8196eba77ff3f9015ab7f4561b1153c57 100644 --- a/juno_samples/cava/src/cava.jn +++ b/juno_samples/cava/src/cava.jn @@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( @image_loop for r = 0 to row { for c = 0 to col { @l2 let l2_dist : f32[num_ctrl_pts]; - for cp = 0 to num_ctrl_pts { + @cp_loop for cp = 0 to num_ctrl_pts { let v1 = input[0, r, c] - ctrl_pts[cp, 0]; let v2 = input[1, r, c] - ctrl_pts[cp, 1]; let v3 = input[2, r, c] - ctrl_pts[cp, 2]; @@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( @channel_loop for chan = 0 to CHAN { let chan_val : f32 = 0.0; - for cp = 0 to num_ctrl_pts { + @cp_loop for cp = 0 to num_ctrl_pts { chan_val += l2_dist[cp] * weights[cp, chan]; } diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch index 8f22b37d4c3a77a31e8d2467c8e6e130c281513f..6fc8adbb1fff30856978ce201af09fd2e02d6298 100644 --- a/juno_samples/cava/src/cpu.sch +++ b/juno_samples/cava/src/cpu.sch @@ -115,7 +115,7 @@ array-slf(fuse4); simpl!(fuse4); let par = fuse4@image_loop \ fuse4@channel_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); let fuse4_body = outline(split.cava_3.fj2); diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index bacfd3abca363a6dd93496c1adb23fa54c860b9f..0ef466c00d08f3edffe02bad9612ce23ebf14768 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -117,9 +117,9 @@ fixpoint { simpl!(fuse4); array-slf(fuse4); simpl!(fuse4); -//fork-tile[2, 0, false, true](fuse4@channel_loop); -//fork-split(fuse4@channel_loop); -//clean-monoid-reduces(fuse4); +fork-tile[2, 0, false, true](fuse4@channel_loop); +let out = fork-split(fuse4@channel_loop); +fork-unroll(out.cava_3.fj1); unforkify(fuse4@channel_loop); no-memset(fuse5@res1); @@ -133,6 +133,13 @@ simpl!(fuse5); array-slf(fuse5); simpl!(fuse5); +fork-tile[4, 1, false, true](fuse4); +fork-tile[8, 0, false, true](fuse4); +fork-interchange[1, 2](fuse4); +let split = fork-split(fuse4); +fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2); +fork-coalesce(split.cava_3.fj2); + delete-uncalled(*); simpl!(*); diff --git a/juno_samples/cava/src/lib.rs b/juno_samples/cava/src/lib.rs index 1810a24670d8fa7325b2ff353ba70756824a0269..47e3b1b3542217b9a17905d8a52ef431fd97541f 100644 --- a/juno_samples/cava/src/lib.rs +++ b/juno_samples/cava/src/lib.rs @@ -124,9 +124,9 @@ pub struct CavaInputs { #[clap(long = "output-verify", value_name = "PATH")] pub output_verify: Option<String>, pub cam_model: String, - #[clap(short, long)] + #[clap(long)] pub crop_rows: Option<usize>, - #[clap(short, long)] + #[clap(long)] pub crop_cols: Option<usize>, } diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index ec9e423dc4c160d08b61eaf45d2b75329886f94e..3e1321c517b55f72e529e8a6df46d9d55bf59efb 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -26,14 +26,14 @@ predication(gaussian_smoothing); simpl!(gaussian_smoothing); let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); fork-coalesce(gaussian_smoothing, gaussian_smoothing_body); simpl!(gaussian_smoothing, gaussian_smoothing_body); -no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); +no-memset(laplacian_estimate@res); fixpoint { forkify(laplacian_estimate); fork-guard-elim(laplacian_estimate); @@ -42,15 +42,15 @@ fixpoint { simpl!(laplacian_estimate); let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); -let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2; +let body = split._1_laplacian_estimate.fj2; let laplacian_estimate_body = outline(body); fork-coalesce(laplacian_estimate, laplacian_estimate_body); simpl!(laplacian_estimate, laplacian_estimate_body); -no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); +no-memset(zero_crossings@res); fixpoint { forkify(zero_crossings); fork-guard-elim(zero_crossings); @@ -59,10 +59,10 @@ fixpoint { simpl!(zero_crossings); let par = zero_crossings@image_loop \ zero_crossings@filter_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); -let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2; +let body = split._2_zero_crossings.fj2; let zero_crossings_body = outline(body); fork-coalesce(zero_crossings, zero_crossings_body); simpl!(zero_crossings, zero_crossings_body); @@ -86,7 +86,7 @@ fixpoint { simpl!(max_gradient); fork-dim-merge(max_gradient); simpl!(max_gradient); -fork-tile[16, 0, false, false](max_gradient); +fork-tile[32, 0, false, false](max_gradient); let split = fork-split(max_gradient); clean-monoid-reduces(max_gradient); let out = outline(split._4_max_gradient.fj1); @@ -105,7 +105,7 @@ fixpoint { predication(reject_zero_crossings); simpl!(reject_zero_crossings); fork-tile[4, 1, false, false](reject_zero_crossings); -fork-tile[4, 0, false, false](reject_zero_crossings); +fork-tile[8, 0, false, false](reject_zero_crossings); fork-interchange[1, 2](reject_zero_crossings); let split = fork-split(reject_zero_crossings); let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn index 58f364dc1cea77b27c83ba2340bfbd11fde07f31..3e49cb365b186037cf6f380c3aa6a4d3b483fb5b 100644 --- a/juno_samples/edge_detection/src/edge_detection.jn +++ b/juno_samples/edge_detection/src/edge_detection.jn @@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>( @image_loop for row = 0 to n { for col = 0 to m { - // Copy data for dilation filter - @shr1 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MIN_BR - else input[row + i - r, col + j - r]; - } - } - // Compute pixel of dilated image let dilated = MIN_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - dilated = max!(dilated, imageArea[i, j] * structure[i, j]); - } - } - - // Data copy for erotion filter - @shr2 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MAX_BR - else input[row + i - r, col + j - r]; + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MIN_BR + else input[row + i - r, col + j - r]; + dilated = max!(dilated, filter * structure[i, j]); } } @@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>( let eroded = MAX_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - eroded = min!(eroded, imageArea[i, j] * structure[i, j]); + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MAX_BR + else input[row + i - r, col + j - r]; + eroded = min!(eroded, filter * structure[i, j]); } } @@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>( @image_loop for row = 0 to n { for col = 0 to m { - // Data copy for dilation filter - @shr1 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MIN_BR - else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR - else MIN_BR; - } - } - // Compute the pixel of dilated image let dilated = MIN_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - dilated = max!(dilated, imageArea[i, j] * structure[i, j]); - } - } - - // Data copy for erotion filter - @shr2 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MAX_BR - else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR - else MIN_BR; + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MIN_BR + else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR + else MIN_BR; + dilated = max!(dilated, filter * structure[i, j]); } } @@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>( let eroded = MAX_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - eroded = min!(eroded, imageArea[i, j] * structure[i, j]); + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MAX_BR + else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR + else MIN_BR; + eroded = min!(eroded, filter * structure[i, j]); } } @@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>( let gx = 0; let gy = 0; - for i = 0 to sb { + @filter_loop for i = 0 to sb { for j = 0 to sb { let val = input[if row + i < sbr then 0 else if row + i - sbr > n - 1 then n - 1 diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index 7ee2904f7d1bb59780390360dbd9abc6b3934aba..666f6cef30f0c0ecedef57f02e50bc25d2b26b8f 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -26,22 +26,49 @@ predication(gaussian_smoothing); simpl!(gaussian_smoothing); predication(gaussian_smoothing); simpl!(gaussian_smoothing); +fork-dim-merge(gaussian_smoothing@filter_loop); +unforkify(gaussian_smoothing@filter_loop); +simpl!(gaussian_smoothing); -no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); +fork-dim-merge(gaussian_smoothing); +fork-tile[32, 0, false, true](gaussian_smoothing); +simpl!(gaussian_smoothing); +fork-split(gaussian_smoothing); +simpl!(gaussian_smoothing); + +no-memset(laplacian_estimate@res); fixpoint { forkify(laplacian_estimate); fork-guard-elim(laplacian_estimate); fork-coalesce(laplacian_estimate); } simpl!(laplacian_estimate); +fork-dim-merge(laplacian_estimate@filter_loop); +unforkify(laplacian_estimate@filter_loop); +simpl!(laplacian_estimate); -no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); +fork-dim-merge(laplacian_estimate); +fork-tile[32, 0, false, true](laplacian_estimate); +simpl!(laplacian_estimate); +fork-split(laplacian_estimate); +simpl!(laplacian_estimate); + +no-memset(zero_crossings@res); fixpoint { forkify(zero_crossings); fork-guard-elim(zero_crossings); fork-coalesce(zero_crossings); } simpl!(zero_crossings); +fork-dim-merge(zero_crossings@filter_loop); +unforkify(zero_crossings@filter_loop); +simpl!(zero_crossings); + +fork-dim-merge(zero_crossings); +fork-tile[32, 0, false, true](zero_crossings); +simpl!(zero_crossings); +fork-split(zero_crossings); +simpl!(zero_crossings); no-memset(gradient@res); fixpoint { @@ -53,6 +80,15 @@ predication(gradient); simpl!(gradient); predication(gradient); simpl!(gradient); +fork-dim-merge(gradient@filter_loop); +unforkify(gradient@filter_loop); +simpl!(gradient); + +fork-dim-merge(gradient); +fork-tile[32, 0, false, true](gradient); +simpl!(gradient); +fork-split(gradient); +simpl!(gradient); fixpoint { forkify(max_gradient); @@ -88,6 +124,12 @@ fixpoint { predication(reject_zero_crossings); simpl!(reject_zero_crossings); +fork-dim-merge(reject_zero_crossings); +fork-tile[32, 0, false, true](reject_zero_crossings); +simpl!(reject_zero_crossings); +fork-split(reject_zero_crossings); +simpl!(reject_zero_crossings); + async-call(edge_detection@le, edge_detection@zc); simpl!(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 4cf320a6d8f8d1245272c0855f567a1834939963..7a284a9a3bed0c1800366c64ceeca20265d845a6 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -12,6 +12,7 @@ macro simpl!(X) { simpl!(*); inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); +no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); delete-uncalled(*); simpl!(*); ip-sroa[true](*); @@ -24,8 +25,31 @@ fixpoint { fork-guard-elim(*); } simpl!(*); -no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); -parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop); +unforkify(compute_flux@inner_loop); + +fork-tile[32, 0, false, false](compute_step_factor); +let split = fork-split(compute_step_factor); +let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); +fork-coalesce(compute_step_factor, compute_step_factor_body); +simpl!(compute_step_factor, compute_step_factor_body); + +fork-tile[32, 0, false, false](compute_flux); +let split = fork-split(compute_flux); +let compute_flux_body = outline(split._6_compute_flux.fj1); +fork-coalesce(compute_flux, compute_flux_body); +simpl!(compute_flux, compute_flux_body); + +fork-tile[32, 0, false, false](time_step); +let split = fork-split(time_step); +let time_step_body = outline(split._7_time_step.fj1); +fork-coalesce(time_step, time_step_body); +simpl!(time_step, time_step_body); + +fork-tile[32, 0, false, false](copy_vars); +let split = fork-split(copy_vars); +let copy_vars_body = outline(split._8_copy_vars.fj1); +fork-coalesce(copy_vars, copy_vars_body); +simpl!(copy_vars, copy_vars_body); -unforkify(*); +unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch index 14eb690641107e7e807cd24b741ecf9098553891..518c656d99652dd9813e6aa5c8e8e84388836d47 100644 --- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch @@ -24,7 +24,38 @@ fixpoint { fork-guard-elim(*); } simpl!(*); +no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res); +unforkify(compute_flux@inner_loop); -unforkify(*); +fork-tile[32, 0, false, false](compute_step_factor); +let split = fork-split(compute_step_factor); +let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); +fork-coalesce(compute_step_factor, compute_step_factor_body); +simpl!(compute_step_factor, compute_step_factor_body); +fork-tile[32, 0, false, false](compute_flux_contributions); +let split = fork-split(compute_flux_contributions); +let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1); +fork-coalesce(compute_flux_contributions, compute_flux_contributions_body); +simpl!(compute_flux_contributions, compute_flux_contributions_body); + +fork-tile[32, 0, false, false](compute_flux); +let split = fork-split(compute_flux); +let compute_flux_body = outline(split._7_compute_flux.fj1); +fork-coalesce(compute_flux, compute_flux_body); +simpl!(compute_flux, compute_flux_body); + +fork-tile[32, 0, false, false](time_step); +let split = fork-split(time_step); +let time_step_body = outline(split._8_time_step.fj1); +fork-coalesce(time_step, time_step_body); +simpl!(time_step, time_step_body); + +fork-tile[32, 0, false, false](copy_vars); +let split = fork-split(copy_vars); +let copy_vars_body = outline(split._9_copy_vars.fj1); +fork-coalesce(copy_vars, copy_vars_body); +simpl!(copy_vars, copy_vars_body); + +unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/gpu_euler.sch b/juno_samples/rodinia/cfd/src/gpu_euler.sch index aed6115e7cf790a9274862b1ab2e4d099a194a32..3700f79d24aa22657020ace6a37931feb49048a4 100644 --- a/juno_samples/rodinia/cfd/src/gpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/gpu_euler.sch @@ -12,6 +12,7 @@ macro simpl!(X) { simpl!(*); inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); +no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); delete-uncalled(*); gpu(copy_vars, compute_step_factor, compute_flux, time_step); @@ -26,9 +27,18 @@ fixpoint { fork-guard-elim(*); } simpl!(*); -no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); -parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop); +unforkify(compute_flux@inner_loop); + +fork-tile[32, 0, false, true](compute_step_factor); +fork-split(compute_step_factor); + +fork-tile[32, 0, false, true](compute_flux); +fork-split(compute_flux); + +fork-tile[32, 0, false, true](time_step); +fork-split(time_step); + +fork-tile[32, 0, false, true](copy_vars); +fork-split(copy_vars); -unforkify(*); -float-collections(*); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch index d91f1b001ca1fe4b7ff544c3f8cab561490acd00..d6db675b0682490a84ba30408c3dd96993dce0f9 100644 --- a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch @@ -12,6 +12,7 @@ macro simpl!(X) { simpl!(*); inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step); +no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res); delete-uncalled(*); gpu(copy_vars, compute_step_factor, compute_flux_contributions, compute_flux, time_step); @@ -26,7 +27,21 @@ fixpoint { fork-guard-elim(*); } simpl!(*); +unforkify(compute_flux@inner_loop); + +fork-tile[32, 0, false, true](compute_step_factor); +fork-split(compute_step_factor); + +fork-tile[32, 0, false, true](compute_flux_contributions); +fork-split(compute_flux_contributions); + +fork-tile[32, 0, false, true](compute_flux); +fork-split(compute_flux); + +fork-tile[32, 0, false, true](time_step); +fork-split(time_step); + +fork-tile[32, 0, false, true](copy_vars); +fork-split(copy_vars); -unforkify(*); -float-collections(*); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs index 62ee59f4f285b3a829ad95a9a323811b3b3895b7..d61df4c5b7bb0c7915d8605debd72e375a5482b0 100644 --- a/juno_samples/rodinia/cfd/src/lib.rs +++ b/juno_samples/rodinia/cfd/src/lib.rs @@ -48,8 +48,7 @@ fn run_euler( let normals_z = HerculesImmBox::from(normals.z.as_slice()); let mut runner = runner!(euler); - let (density, momentum_x, momentum_y, momentum_z, energy) = - async_std::task::block_on(async { + let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async { runner .run( nelr as u64, @@ -123,8 +122,7 @@ fn run_pre_euler( let normals_z = HerculesImmBox::from(normals.z.as_slice()); let mut runner = runner!(pre_euler); - let (density, momentum_x, momentum_y, momentum_z, energy) = - async_std::task::block_on(async { + let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async { runner .run( nelr as u64, @@ -189,15 +187,30 @@ fn compare_floats(xs: &Variables, ys: &Variables) -> bool { let ys_energy = ys.energy.as_slice(); xs_density.len() == ys_density.len() - && xs_density.iter().zip(ys_density.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_density + .iter() + .zip(ys_density.iter()) + .all(|(x, y)| compare_float(*x, *y)) && xs_momentum_x.len() == ys_momentum_x.len() - && xs_momentum_x.iter().zip(ys_momentum_x.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_momentum_x + .iter() + .zip(ys_momentum_x.iter()) + .all(|(x, y)| compare_float(*x, *y)) && xs_momentum_y.len() == ys_momentum_y.len() - && xs_momentum_y.iter().zip(ys_momentum_y.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_momentum_y + .iter() + .zip(ys_momentum_y.iter()) + .all(|(x, y)| compare_float(*x, *y)) && xs_momentum_z.len() == ys_momentum_z.len() - && xs_momentum_z.iter().zip(ys_momentum_z.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_momentum_z + .iter() + .zip(ys_momentum_z.iter()) + .all(|(x, y)| compare_float(*x, *y)) && xs_energy.len() == ys_energy.len() - && xs_energy.iter().zip(ys_energy.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_energy + .iter() + .zip(ys_energy.iter()) + .all(|(x, y)| compare_float(*x, *y)) } pub fn cfd_harness(args: CFDInputs) { @@ -224,6 +237,7 @@ pub fn cfd_harness(args: CFDInputs) { } = read_domain_geometry(data_file, block_size); let variables = initialize_variables(nelr, &ff_variable); + println!("Running CFD with nelr = {}.", nelr); let res_juno = if pre_euler { run_pre_euler( diff --git a/juno_samples/rodinia/cfd/src/pre_euler.jn b/juno_samples/rodinia/cfd/src/pre_euler.jn index c200f2db9a2be4708c4cf1c90c60eef0b7d7a11e..979c2e9a16d3ff2d0b5c690af7645dbf10462139 100644 --- a/juno_samples/rodinia/cfd/src/pre_euler.jn +++ b/juno_samples/rodinia/cfd/src/pre_euler.jn @@ -58,7 +58,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 { } fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] { - let step_factors : f32[nelr]; + @res let step_factors : f32[nelr]; for i in 0..nelr { let density = variables.density[i]; @@ -109,10 +109,10 @@ fn compute_flux_contribution( fn compute_flux_contributions<nelr: usize>( variables: Variables::<nelr>, ) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) { - let fc_momentum_x: Momentum::<nelr>; - let fc_momentum_y: Momentum::<nelr>; - let fc_momentum_z: Momentum::<nelr>; - let fc_density_energy: Momentum::<nelr>; + @res let fc_momentum_x: Momentum::<nelr>; + @res let fc_momentum_y: Momentum::<nelr>; + @res let fc_momentum_z: Momentum::<nelr>; + @res let fc_density_energy: Momentum::<nelr>; for i in 0..nelr { let density_i = variables.density[i]; @@ -167,9 +167,9 @@ fn compute_flux<nelr: usize>( ff_fc_momentum_z: float3, ) -> Variables::<nelr> { const smoothing_coefficient : f32 = 0.2; - let fluxes: Variables::<nelr>; + @res let fluxes: Variables::<nelr>; - for i in 0..nelr { + @outer_loop for i in 0..nelr { let density_i = variables.density[i]; let momentum_i = float3 { x: variables.momentum.x[i], @@ -201,7 +201,7 @@ fn compute_flux<nelr: usize>( let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 }; let flux_i_density_energy : f32 = 0.0; - for j in 0..NNB { + @inner_loop for j in 0..NNB { let nb = elements_surrounding_elements[j, i]; let normal = float3 { x: normals.x[j, i], @@ -328,7 +328,7 @@ fn time_step<nelr: usize>( } fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> { - let result : Variables::<nelr>; + @res let result : Variables::<nelr>; for i in 0..nelr { result.density[i] = variables.density[i]; diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch index 289548f9e01cdf402a3e1b1057fa52d4029f6173..f736c0b776759596cfc93a5025fb4f280c3fc932 100644 --- a/juno_samples/rodinia/srad/src/gpu.sch +++ b/juno_samples/rodinia/srad/src/gpu.sch @@ -54,4 +54,10 @@ ip-sroa(*); sroa(*); simpl!(*); +fork-dim-merge(main_loops); +fork-tile[32, 0, false, true](main_loops); +dce(main_loops); +fork-split(main_loops); +simpl!(main_loops); + gcm(*); diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 456df2eda49b93a6c80327a090b6f6606ae711bb..62bdaf739b69b52a0b540c6a602e1e5beb632786 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -1090,7 +1090,9 @@ impl PassManager { let mut nvcc_process = Command::new("nvcc") .arg("-c") + .arg("-Xptxas") .arg("-O3") + .arg("-use_fast_math") .arg("-diag-suppress") .arg("177") .arg("-o")