From b1970233ee1f807a133e7badf7b7b3a4eda7f803 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 17:53:32 -0600 Subject: [PATCH 01/25] verify flag for cfd --- juno_samples/rodinia/cfd/src/lib.rs | 67 +++++++++++++++------------- juno_samples/rodinia/cfd/src/main.rs | 2 + 2 files changed, 38 insertions(+), 31 deletions(-) diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs index d61df4c5..a9800ed0 100644 --- a/juno_samples/rodinia/cfd/src/lib.rs +++ b/juno_samples/rodinia/cfd/src/lib.rs @@ -19,6 +19,8 @@ pub struct CFDInputs { pub block_size: usize, #[clap(short = None, long = Some("pre-euler"))] pub pre_euler: bool, + #[clap(short, long)] + pub verify: bool, } fn run_euler( @@ -219,6 +221,7 @@ pub fn cfd_harness(args: CFDInputs) { iterations, block_size, pre_euler, + verify, } = args; let FarFieldConditions { @@ -268,37 +271,39 @@ pub fn cfd_harness(args: CFDInputs) { &ff_fc_momentum_z, ) }; - let res_rust = if pre_euler { - rust_cfd::pre_euler( - nelr, - iterations, - variables, - areas.as_slice(), - elements_surrounding_elements.as_slice(), - &normals, - &ff_variable, - &ff_fc_density_energy, - &ff_fc_momentum_x, - &ff_fc_momentum_y, - &ff_fc_momentum_z, - ) - } else { - rust_cfd::euler( - nelr, - iterations, - variables, - areas.as_slice(), - elements_surrounding_elements.as_slice(), - &normals, - &ff_variable, - &ff_fc_density_energy, - &ff_fc_momentum_x, - &ff_fc_momentum_y, - &ff_fc_momentum_z, - ) - }; + if verify { + let res_rust = if pre_euler { + rust_cfd::pre_euler( + nelr, + iterations, + variables, + areas.as_slice(), + elements_surrounding_elements.as_slice(), + &normals, + &ff_variable, + &ff_fc_density_energy, + &ff_fc_momentum_x, + &ff_fc_momentum_y, + &ff_fc_momentum_z, + ) + } else { + rust_cfd::euler( + nelr, + iterations, + variables, + areas.as_slice(), + elements_surrounding_elements.as_slice(), + &normals, + &ff_variable, + &ff_fc_density_energy, + &ff_fc_momentum_x, + &ff_fc_momentum_y, + &ff_fc_momentum_z, + ) + }; - if !compare_floats(&res_juno, &res_rust) { - panic!("Mismatch in results"); + if !compare_floats(&res_juno, &res_rust) { + panic!("Mismatch in results"); + } } } diff --git a/juno_samples/rodinia/cfd/src/main.rs b/juno_samples/rodinia/cfd/src/main.rs index 277a3edb..2dd9a364 100644 --- a/juno_samples/rodinia/cfd/src/main.rs +++ b/juno_samples/rodinia/cfd/src/main.rs @@ -14,6 +14,7 @@ fn test_euler() { iterations: 1, block_size: 16, pre_euler: false, + verify: true, }); } @@ -24,5 +25,6 @@ fn test_pre_euler() { iterations: 1, block_size: 16, pre_euler: true, + verify: true, }); } -- GitLab From 54438766872b92e3ae3e279ae0ab142d3efedafd Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 18:41:31 -0600 Subject: [PATCH 02/25] some backprop opt --- juno_samples/rodinia/backprop/src/backprop.jn | 31 +++++---- juno_samples/rodinia/backprop/src/cpu.sch | 4 +- juno_samples/rodinia/backprop/src/gpu.sch | 63 +++++++++++++------ 3 files changed, 66 insertions(+), 32 deletions(-) diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn index 94c4334c..2ca57c9f 100644 --- a/juno_samples/rodinia/backprop/src/backprop.jn +++ b/juno_samples/rodinia/backprop/src/backprop.jn @@ -6,10 +6,9 @@ fn squash(x: f32) -> f32 { fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] { @res let result : f32[m + 1]; result[0] = 1.0; - @outer_loop for j in 1..=m { - let sum = 0.0; - @inner_loop for k in 0..=n { + let sum = weights[0, j] * vals[0]; + @inner_loop for k in 1..=n { sum += weights[k, j] * vals[k]; } result[j] = squash(sum); @@ -19,13 +18,16 @@ fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f } fn output_error<n: usize>(target: f32[n + 1], actual: f32[n + 1]) -> f32, f32[n + 1] { - let errsum = 0.0; - let delta : f32[n + 1]; - - for j in 1..=n { + @loop1 @res let delta : f32[n + 1]; + @loop1 delta[0] = 0.0; + @loop1 for j in 1..=n { let a = actual[j]; let t = target[j]; delta[j] = a * (1.0 - a) * (t - a); + } + + let errsum = 0.0; + @loop2 for j in 1..=n { errsum += abs!(delta[j]); } @@ -37,10 +39,9 @@ fn hidden_error<hidden_n, output_n: usize>( hidden_weights: f32[hidden_n + 1, output_n + 1], hidden_vals: f32[hidden_n + 1], ) -> f32, f32[hidden_n + 1] { - let errsum = 0.0; - let delta : f32[hidden_n + 1]; - - for j in 1..=hidden_n { + @loop1 @res let delta : f32[hidden_n + 1]; + @loop1 delta[0] = 0.0; + @loop1 for j in 1..=hidden_n { let h = hidden_vals[j]; let sum = 0.0; @@ -49,6 +50,10 @@ fn hidden_error<hidden_n, output_n: usize>( } delta[j] = h * (1.0 - h) * sum; + } + + let errsum = 0.0; + @loop2 for j in 1..=hidden_n { errsum += abs!(delta[j]); } @@ -89,8 +94,8 @@ fn backprop<input_n, hidden_n, output_n: usize>( let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights); let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights); - let out_err, out_delta = output_error::<output_n>(target, output_vals); - let hid_err, hid_delta = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals); + @output_error let out_err, out_delta = output_error::<output_n>(target, output_vals); + @hidden_error let hid_err, hid_delta = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals); let hidden_weights, hidden_prev_weights = adjust_weights::<hidden_n, output_n>(out_delta, hidden_vals, hidden_weights, hidden_prev_weights); diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index de34d660..661ec531 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -12,7 +12,7 @@ simpl!(*); inline(layer_forward); delete-uncalled(*); -no-memset(layer_forward@res); +no-memset(layer_forward@res, output_error@res, hidden_error@res); lift-dc-math(*); loop-bound-canon(*); simpl!(*); @@ -25,6 +25,8 @@ fixpoint { } reduce-slf(*); simpl!(*); +fork-interchange[0, 1](adjust_weights); +simpl!(*); fork-split(*); unforkify(*); diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch index 2011860d..d0be79db 100644 --- a/juno_samples/rodinia/backprop/src/gpu.sch +++ b/juno_samples/rodinia/backprop/src/gpu.sch @@ -1,24 +1,51 @@ -gvn(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} + +no-memset(layer_forward@res, output_error@res, hidden_error@res); phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +let output_loop1 = outline(output_error@loop1); +let output_loop2 = outline(output_error@loop2); +let hidden_loop1 = outline(hidden_error@loop1); +let hidden_loop2 = outline(hidden_error@loop2); +simpl!(*); +inline(layer_forward, backprop@output_error, backprop@hidden_error); +delete-uncalled(*); +gpu(layer_forward, output_loop1, output_loop2, hidden_loop1, hidden_loop2, adjust_weights); +const-inline(*); -let auto = auto-outline(backprop); -gpu(auto.backprop); +lift-dc-math(*); +loop-bound-canon(*); +simpl!(*); +lift-dc-math(*); +slf(*); +fixpoint { + forkify(*); + fork-guard-elim(*); + fork-coalesce(*); +} +reduce-slf(*); +simpl!(*); -inline(auto.backprop); -inline(auto.backprop); -delete-uncalled(*); +fork-tile[16, 0, false, true](layer_forward@inner_loop); +let out = fork-split(layer_forward@inner_loop); +clean-monoid-reduces(layer_forward); +simpl!(layer_forward); +let fission = fork-fission[out._1_layer_forward.fj0](layer_forward); +simpl!(layer_forward); -sroa[true](*); -dce(*); -float-collections(*); -reuse-products(*); -dce(*); +fork-dim-merge(adjust_weights); +simpl!(adjust_weights); +fork-extend[32](adjust_weights); +fork-tile[32, 0, false, true](adjust_weights); +fork-split(adjust_weights); +simpl!(adjust_weights); +xdot[true](*); gcm(*); - -- GitLab From f092ae383b280a6df9778e62198c4341d2f1e8ad Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 19:15:45 -0600 Subject: [PATCH 03/25] more backprop opt --- hercules_opt/src/fork_transforms.rs | 102 ++++++++++++++++++---- hercules_opt/src/simplify_cfg.rs | 15 +++- juno_samples/rodinia/backprop/src/gpu.sch | 7 +- 3 files changed, 106 insertions(+), 18 deletions(-) diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index 1c220b99..e1598463 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -319,12 +319,12 @@ pub fn fork_fission<'a>( .collect(); let mut created_forks = Vec::new(); - - // This does the reduction fission + + // This does the reduction fission for fork in forks { let join = fork_join_map[&fork.0]; - // FIXME: Don't make multiple forks for reduces that are in cycles with each other. + // FIXME: Don't make multiple forks for reduces that are in cycles with each other. let reduce_partition = default_reduce_partition(editor, fork.0, join); if !editor.func().labels[fork.0.idx()].contains(&fork_label) { @@ -332,14 +332,19 @@ pub fn fork_fission<'a>( } if editor.is_mutable(fork.0) { - created_forks = fork_reduce_fission_helper(editor, fork_join_map, reduce_partition, nodes_in_fork_joins, fork.0); + created_forks = fork_reduce_fission_helper( + editor, + fork_join_map, + reduce_partition, + nodes_in_fork_joins, + fork.0, + ); if created_forks.is_empty() { continue; } else { return created_forks; } } - } created_forks @@ -503,13 +508,17 @@ pub fn fork_reduce_fission_helper<'a>( let mut new_forks = Vec::new(); - let mut new_control_pred: NodeID = editor.get_uses(fork).filter(|n| editor.node(n).is_control()).next().unwrap(); + let mut new_control_pred: NodeID = editor + .get_uses(fork) + .filter(|n| editor.node(n).is_control()) + .next() + .unwrap(); let mut new_fork = NodeID::new(0); let mut new_join = NodeID::new(0); - let subgraph = &nodes_in_fork_joins[&fork]; - + let subgraph = &nodes_in_fork_joins[&fork]; + // Gets everything between fork & join that this reduce needs. (ALL CONTROL) editor.edit(|mut edit| { for reduce in reduce_partition { @@ -522,7 +531,7 @@ pub fn fork_reduce_fission_helper<'a>( new_fork = mapping[&fork]; new_forks.push(new_fork); new_join = mapping[&join]; - + // Atttach new_fork after control_pred let (old_control_pred, _) = edit.get_node(new_fork).try_fork().unwrap().clone(); edit = edit.replace_all_uses_where(old_control_pred, new_control_pred, |usee| { @@ -532,7 +541,7 @@ pub fn fork_reduce_fission_helper<'a>( // Replace uses of reduce edit = edit.replace_all_uses(reduce, mapping[&reduce])?; new_control_pred = new_join; - }; + } // Replace original join w/ new final join edit = edit.replace_all_uses_where(join, new_join, |_| true)?; @@ -1502,6 +1511,10 @@ fn fork_fusion( * element. This aides in parallelizing outer loops. Looks only at reduces with * the monoid reduce schedule, since that indicates a particular structure which * is annoying to check for again. + * + * Looks for would-be monoid reduces, if not for a gate on the reduction. + * Partially predicate the gated reduction to allow for a proper monoid + * reduction. */ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { for id in editor.node_ids() { @@ -1512,7 +1525,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { let Some((_, init, reduct)) = nodes[id.idx()].try_reduce() else { continue; }; - let out_uses: Vec<_> = editor.get_users(id).filter(|id| *id != reduct).collect(); + let out_users: Vec<_> = editor.get_users(id).filter(|id| *id != reduct).collect(); match nodes[reduct.idx()] { Node::Binary { @@ -1532,7 +1545,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { left: init, right: id, }); - for u in out_uses { + for u in out_users { edit.sub_edit(u, final_op); } edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) @@ -1555,7 +1568,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { left: init, right: id, }); - for u in out_uses { + for u in out_users { edit.sub_edit(u, final_op); } edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) @@ -1574,7 +1587,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { intrinsic: Intrinsic::Max, args: Box::new([init, id]), }); - for u in out_uses { + for u in out_users { edit.sub_edit(u, final_op); } edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) @@ -1593,7 +1606,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { intrinsic: Intrinsic::Min, args: Box::new([init, id]), }); - for u in out_uses { + for u in out_users { edit.sub_edit(u, final_op); } edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) @@ -1602,6 +1615,65 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { _ => {} } } + + for id in editor.node_ids() { + if !editor.func().schedules[id.idx()].contains(&Schedule::MonoidReduce) { + continue; + } + let nodes = &editor.func().nodes; + let Some((control, init, reduct)) = nodes[id.idx()].try_reduce() else { + continue; + }; + if let Node::Phi { + control: phi_control, + ref data, + } = nodes[reduct.idx()] + && data.len() == 2 + && data.contains(&id) + && let other = *data + .into_iter() + .filter(|other| **other != id) + .next() + .unwrap() + && let Node::Binary { + op: BinaryOperator::Add, + left, + right, + } = nodes[other.idx()] + && ((left == id) ^ (right == id)) + { + let gated_input = if left == id { right } else { left }; + let data = data.clone(); + editor.edit(|mut edit| { + let zero = edit.add_zero_constant(typing[id.idx()]); + let zero = edit.add_node(Node::Constant { id: zero }); + let phi = edit.add_node(Node::Phi { + control: phi_control, + data: data + .iter() + .map(|phi_use| if *phi_use == id { zero } else { gated_input }) + .collect(), + }); + let new_reduce_id = NodeID::new(edit.num_node_ids()); + let new_reduct_id = NodeID::new(edit.num_node_ids() + 1); + let new_reduce = Node::Reduce { + control, + init, + reduct: new_reduct_id, + }; + let new_add = Node::Binary { + op: BinaryOperator::Add, + left: new_reduce_id, + right: phi, + }; + let new_reduce = edit.add_node(new_reduce); + edit.add_node(new_add); + edit = edit.replace_all_uses(id, new_reduce)?; + edit = edit.delete_node(id)?; + Ok(edit) + }); + } + } } /* diff --git a/hercules_opt/src/simplify_cfg.rs b/hercules_opt/src/simplify_cfg.rs index cf39db2b..b13cf0c3 100644 --- a/hercules_opt/src/simplify_cfg.rs +++ b/hercules_opt/src/simplify_cfg.rs @@ -126,11 +126,24 @@ fn remove_useless_fork_joins( // Third, get rid of fork-joins. for (fork, join) in fork_join_map { - if editor.get_users(*fork).len() == 1 && editor.get_users(*join).len() == 1 { + if editor.get_users(*join).len() == 1 { let fork_use = get_uses(&editor.func().nodes[fork.idx()]).as_ref()[0]; let join_use = get_uses(&editor.func().nodes[join.idx()]).as_ref()[0]; + let tids: Vec<_> = editor + .get_users(*fork) + .filter(|id| editor.func().nodes[id.idx()].is_thread_id()) + .collect(); editor.edit(|mut edit| { + if !tids.is_empty() { + let u64_ty = edit.add_type(Type::UnsignedInteger64); + let zero = edit.add_zero_constant(u64_ty); + let zero = edit.add_node(Node::Constant { id: zero }); + for tid in tids { + edit = edit.replace_all_uses(tid, zero)?; + edit = edit.delete_node(tid)?; + } + } edit = edit.replace_all_uses(*join, join_use)?; edit = edit.replace_all_uses(*fork, fork_use)?; edit = edit.delete_node(*fork)?; diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch index d0be79db..f8cc84a3 100644 --- a/juno_samples/rodinia/backprop/src/gpu.sch +++ b/juno_samples/rodinia/backprop/src/gpu.sch @@ -33,7 +33,11 @@ fixpoint { reduce-slf(*); simpl!(*); -fork-tile[16, 0, false, true](layer_forward@inner_loop); +fork-extend[32](layer_forward@inner_loop); +clean-monoid-reduces(layer_forward); +simpl!(layer_forward); +fork-tile[32, 0, false, true](layer_forward@inner_loop); +clean-monoid-reduces(layer_forward); let out = fork-split(layer_forward@inner_loop); clean-monoid-reduces(layer_forward); simpl!(layer_forward); @@ -47,5 +51,4 @@ fork-tile[32, 0, false, true](adjust_weights); fork-split(adjust_weights); simpl!(adjust_weights); -xdot[true](*); gcm(*); -- GitLab From 3b1aa5d426c2c43032abef5a0505fc935da2eadc Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 21:19:31 -0600 Subject: [PATCH 04/25] parallelize main loop in bfs --- juno_samples/rodinia/bfs/src/cpu.sch | 13 +++++++++++-- juno_samples/rodinia/bfs/src/lib.rs | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index ae67fdd9..f94c473f 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -10,13 +10,15 @@ macro simpl!(X) { phi-elim(bfs); no-memset(bfs@cost); -outline(bfs@cost_init); +let init = outline(bfs@cost_init); let loop1 = outline(bfs@loop1); let loop2 = outline(bfs@loop2); +parallel-reduce(loop1); simpl!(*); predication(*); const-inline(*); +loop-bound-canon(*); simpl!(*); fixpoint { forkify(*); @@ -26,5 +28,12 @@ simpl!(*); predication(*); simpl!(*); -unforkify(*); +fork-tile[32, 0, false, false](loop1); +let split = fork-split(loop1); +let out = outline(split.bfs_1.fj1); +unforkify(out); +inline(bfs@loop1); +delete-uncalled(*); + +unforkify(init); gcm(*); diff --git a/juno_samples/rodinia/bfs/src/lib.rs b/juno_samples/rodinia/bfs/src/lib.rs index 218e9bb0..f6403540 100644 --- a/juno_samples/rodinia/bfs/src/lib.rs +++ b/juno_samples/rodinia/bfs/src/lib.rs @@ -19,6 +19,7 @@ pub struct BFSInputs { fn run_bfs(nodes: &[Node], source: u32, edges: &[u32]) -> Vec<i32> { let n = nodes.len() as u64; let m = edges.len() as u64; + println!("Running with {} nodes and {} edges.", n, m); let nodes = HerculesImmBox::from(nodes); let edges = HerculesImmBox::from(edges); -- GitLab From 2598806c43fe6e518657b2c6bffb866695fdcab5 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 21:50:22 -0600 Subject: [PATCH 05/25] work on bfs --- juno_samples/rodinia/bfs/src/bfs.jn | 11 ++++++----- juno_samples/rodinia/bfs/src/cpu.sch | 22 ++++++++++++---------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index 2534a89c..3d0280f1 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -44,11 +44,12 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] @loop2 for i in 0..n { stop = stop && !updated[i]; - if updated[i] { - mask[i] = true; - visited[i] = true; - updated[i] = false; - } + } + + @loop3 for i in 0..n { + mask[i] = mask[i] || updated[i]; + visited[i] = visited[i] || updated[i]; + updated[i] = false; } } diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index f94c473f..a33e361d 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -11,9 +11,9 @@ macro simpl!(X) { phi-elim(bfs); no-memset(bfs@cost); let init = outline(bfs@cost_init); -let loop1 = outline(bfs@loop1); -let loop2 = outline(bfs@loop2); -parallel-reduce(loop1); +let traverse = outline(bfs@loop1); +let collect = outline(bfs@loop2 | bfs@loop3); +parallel-reduce(traverse); simpl!(*); predication(*); @@ -27,13 +27,15 @@ fixpoint { simpl!(*); predication(*); simpl!(*); +reduce-slf(*); +simpl!(*); -fork-tile[32, 0, false, false](loop1); -let split = fork-split(loop1); -let out = outline(split.bfs_1.fj1); -unforkify(out); -inline(bfs@loop1); -delete-uncalled(*); +fixpoint { + forkify(collect); + fork-guard-elim(collect); +} +fork-fusion(collect); +simpl!(collect); -unforkify(init); +unforkify(init, traverse, collect); gcm(*); -- GitLab From 84d2978f414a3ba57f70bd359753a3184eae4f49 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 22:37:06 -0600 Subject: [PATCH 06/25] bfs w/ and reduction --- juno_samples/rodinia/bfs/src/gpu.sch | 50 +++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 6c4d027b..56489a23 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -10,14 +10,16 @@ macro simpl!(X) { phi-elim(bfs); no-memset(bfs@cost); -let cost_init = outline(bfs@cost_init); +let init = outline(bfs@cost_init); let loop1 = outline(bfs@loop1); let loop2 = outline(bfs@loop2); -gpu(loop1, loop2); +let loop3 = outline(bfs@loop3); +parallel-reduce(loop1); simpl!(*); predication(*); const-inline(*); +loop-bound-canon(*); simpl!(*); fixpoint { forkify(*); @@ -26,14 +28,44 @@ fixpoint { simpl!(*); predication(*); simpl!(*); - -unforkify(cost_init); -parallel-reduce(loop1); -forkify(*); -fork-guard-elim(*); -simpl!(*); -predication(*); reduce-slf(*); simpl!(*); +fork-tile[32, 0, false, true](loop1); +fork-split(loop1); +gpu(loop1); + +fixpoint { + forkify(loop2, loop3); + fork-guard-elim(loop2, loop3); +} + +simpl!(loop2, loop3); +fork-tile[32, 0, false, true](loop2, loop3); +let out = fork-split(loop2, loop3); +clean-monoid-reduces(loop2, loop3); +simpl!(loop2, loop3); +gpu(loop3); + +let fission1 = fork-fission[out.bfs_2.fj0](loop2); +simpl!(loop2); +fork-tile[32, 0, false, true](fission1.bfs_2.fj_bottom); +let out = fork-split(fission1.bfs_2.fj_bottom); +clean-monoid-reduces(loop2); +simpl!(loop2); +let fission2 = fork-fission[out.bfs_2.fj0](loop2); +simpl!(loop2); +fork-tile[32, 0, false, true](fission2.bfs_2.fj_bottom); +let out = fork-split(fission2.bfs_2.fj_bottom); +clean-monoid-reduces(loop2); +simpl!(loop2); +let top = outline(fission1.bfs_2.fj_top); +let middle = outline(fission2.bfs_2.fj_top); +let bottom = outline(out.bfs_2.fj0); +const-inline(loop2, top, middle, bottom); +no-memset(top, middle); +gpu(top, middle, bottom); +simpl!(loop2, top, middle, bottom); + +unforkify(init); gcm(*); -- GitLab From 76a82eaf47b1dba9bbd74c2aa540b0b0bfa2b6dc Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 22:38:53 -0600 Subject: [PATCH 07/25] ignore too small test bfs for now... --- juno_samples/rodinia/bfs/src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/juno_samples/rodinia/bfs/src/main.rs b/juno_samples/rodinia/bfs/src/main.rs index 0ad23b00..b0a74bbd 100644 --- a/juno_samples/rodinia/bfs/src/main.rs +++ b/juno_samples/rodinia/bfs/src/main.rs @@ -8,6 +8,7 @@ fn main() { } #[test] +#[ignore] fn bfs_test_4096() { bfs_harness(BFSInputs { input: "data/graph4096.txt".to_string(), -- GitLab From 375199f5369a1237f52edd82af8c0167219f2cce Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Mon, 3 Mar 2025 11:39:24 -0600 Subject: [PATCH 08/25] some tweaks --- hercules_opt/src/fork_transforms.rs | 6 ++++-- hercules_opt/src/gcm.rs | 14 +++++++++++--- hercules_opt/src/utils.rs | 18 ++++++++++++++++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index e1598463..6998f879 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -1533,7 +1533,8 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { left: _, right: _, } if (op == BinaryOperator::Add || op == BinaryOperator::Or) - && !is_zero(editor, init) => + && !is_zero(editor, init) + && !is_false(editor, init) => { editor.edit(|mut edit| { let zero = edit.add_zero_constant(typing[init.idx()]); @@ -1556,7 +1557,8 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { left: _, right: _, } if (op == BinaryOperator::Mul || op == BinaryOperator::And) - && !is_one(editor, init) => + && !is_one(editor, init) + && !is_true(editor, init) => { editor.edit(|mut edit| { let one = edit.add_one_constant(typing[init.idx()]); diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs index d950941a..4a6365c8 100644 --- a/hercules_opt/src/gcm.rs +++ b/hercules_opt/src/gcm.rs @@ -212,7 +212,8 @@ fn preliminary_fixups( let (_, init, _) = nodes[reduce.idx()].try_reduce().unwrap(); // Replace uses of the reduce in its cycle with the init. - let success = editor.edit(|edit| { + let success = editor.edit(|mut edit| { + edit = edit.add_schedule(init, Schedule::ParallelReduce)?; edit.replace_all_uses_where(reduce, init, |id| reduce_cycles[&reduce].contains(id)) }); assert!(success); @@ -870,7 +871,7 @@ fn spill_clones( // Step 2: filter edges (A, B) to just see edges where A uses B and A // mutates B. These are the edges that may require a spill. let mut spill_edges = edges.into_iter().filter(|(a, b)| { - mutating_writes(editor.func(), *a, objects).any(|id| id == *b) + (mutating_writes(editor.func(), *a, objects).any(|id| id == *b) || (get_uses(&editor.func().nodes[a.idx()]) .as_ref() .into_iter() @@ -890,7 +891,14 @@ fn spill_clones( data.contains(b) && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce) }) - .unwrap_or(false)) + .unwrap_or(false))) + && !editor.func().nodes[a.idx()] + .try_write() + .map(|(collect, _, _)| { + collect == *b + && editor.func().schedules[b.idx()].contains(&Schedule::ParallelReduce) + }) + .unwrap_or(false) }); // Step 3: if there is a spill edge, spill it and return true. Otherwise, diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs index b910a128..351abc2b 100644 --- a/hercules_opt/src/utils.rs +++ b/hercules_opt/src/utils.rs @@ -598,6 +598,24 @@ pub fn is_one(editor: &FunctionEditor, id: NodeID) -> bool { || nodes[id.idx()].is_undef() } +pub fn is_false(editor: &FunctionEditor, id: NodeID) -> bool { + let nodes = &editor.func().nodes; + nodes[id.idx()] + .try_constant() + .map(|id| editor.get_constant(id).is_false()) + .unwrap_or(false) + || nodes[id.idx()].is_undef() +} + +pub fn is_true(editor: &FunctionEditor, id: NodeID) -> bool { + let nodes = &editor.func().nodes; + nodes[id.idx()] + .try_constant() + .map(|id| editor.get_constant(id).is_true()) + .unwrap_or(false) + || nodes[id.idx()].is_undef() +} + pub fn is_largest(editor: &FunctionEditor, id: NodeID) -> bool { let nodes = &editor.func().nodes; nodes[id.idx()] -- GitLab From 4cd1d6108d7c37701d0136818cc643304837aa5f Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Mon, 3 Mar 2025 12:28:33 -0600 Subject: [PATCH 09/25] opt bfs --- juno_samples/rodinia/bfs/src/bfs.jn | 24 +++++++++------ juno_samples/rodinia/bfs/src/cpu.sch | 6 ++-- juno_samples/rodinia/bfs/src/gpu.sch | 46 ++++++---------------------- 3 files changed, 28 insertions(+), 48 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index 3d0280f1..f82d9d80 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -1,4 +1,11 @@ type Node = struct { edge_start: u32; num_edges: u32; }; +type StopProd = struct { stop: bool; }; + +fn make_stop_prod() -> StopProd { + let ret : StopProd; + ret.stop = true; + return ret; +} #[entry] fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] { @@ -23,8 +30,6 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] let updated: bool[n]; while !stop { - stop = true; - @loop1 for i in 0..n { if mask[i] { mask[i] = false; @@ -42,15 +47,16 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] } } + @make let stop_prod = make_stop_prod(); @loop2 for i in 0..n { - stop = stop && !updated[i]; - } - - @loop3 for i in 0..n { - mask[i] = mask[i] || updated[i]; - visited[i] = visited[i] || updated[i]; - updated[i] = false; + if updated[i] { + mask[i] = true; + visited[i] = true; + updated[i] = false; + stop_prod.stop = updated[i]; + } } + stop = stop_prod.stop; } return cost; diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index a33e361d..589b93b1 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -12,8 +12,7 @@ phi-elim(bfs); no-memset(bfs@cost); let init = outline(bfs@cost_init); let traverse = outline(bfs@loop1); -let collect = outline(bfs@loop2 | bfs@loop3); -parallel-reduce(traverse); +let collect = outline(bfs@loop2); simpl!(*); predication(*); @@ -29,12 +28,13 @@ predication(*); simpl!(*); reduce-slf(*); simpl!(*); +slf(*); +simpl!(*); fixpoint { forkify(collect); fork-guard-elim(collect); } -fork-fusion(collect); simpl!(collect); unforkify(init, traverse, collect); diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 56489a23..d5c8dee6 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -11,10 +11,11 @@ macro simpl!(X) { phi-elim(bfs); no-memset(bfs@cost); let init = outline(bfs@cost_init); -let loop1 = outline(bfs@loop1); -let loop2 = outline(bfs@loop2); -let loop3 = outline(bfs@loop3); -parallel-reduce(loop1); +let traverse = outline(bfs@loop1); +let collect = outline(bfs@loop2); +parallel-reduce(traverse, collect); +no-memset(make_stop_prod); +gpu(traverse, make_stop_prod, collect); simpl!(*); predication(*); @@ -31,41 +32,14 @@ simpl!(*); reduce-slf(*); simpl!(*); -fork-tile[32, 0, false, true](loop1); -fork-split(loop1); -gpu(loop1); - fixpoint { - forkify(loop2, loop3); - fork-guard-elim(loop2, loop3); + forkify(collect); + fork-guard-elim(collect); } +simpl!(collect); -simpl!(loop2, loop3); -fork-tile[32, 0, false, true](loop2, loop3); -let out = fork-split(loop2, loop3); -clean-monoid-reduces(loop2, loop3); -simpl!(loop2, loop3); -gpu(loop3); - -let fission1 = fork-fission[out.bfs_2.fj0](loop2); -simpl!(loop2); -fork-tile[32, 0, false, true](fission1.bfs_2.fj_bottom); -let out = fork-split(fission1.bfs_2.fj_bottom); -clean-monoid-reduces(loop2); -simpl!(loop2); -let fission2 = fork-fission[out.bfs_2.fj0](loop2); -simpl!(loop2); -fork-tile[32, 0, false, true](fission2.bfs_2.fj_bottom); -let out = fork-split(fission2.bfs_2.fj_bottom); -clean-monoid-reduces(loop2); -simpl!(loop2); -let top = outline(fission1.bfs_2.fj_top); -let middle = outline(fission2.bfs_2.fj_top); -let bottom = outline(out.bfs_2.fj0); -const-inline(loop2, top, middle, bottom); -no-memset(top, middle); -gpu(top, middle, bottom); -simpl!(loop2, top, middle, bottom); +fork-tile[32, 0, false, true](traverse, collect); +fork-split(traverse, collect); unforkify(init); gcm(*); -- GitLab From 7b6ba729fdeaf8e76a2bb2aa2631f931a4525070 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Mon, 3 Mar 2025 13:25:13 -0600 Subject: [PATCH 10/25] more bfs opt --- hercules_cg/src/gpu.rs | 18 +++++++++++------- juno_samples/rodinia/bfs/src/gpu.sch | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 07dd3ebf..3a00e547 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -354,6 +354,7 @@ impl GPUContext<'_> { write!( w, " +#define _CG_ABI_EXPERIMENTAL #include <assert.h> #include <stdio.h> #include <stddef.h> @@ -1294,7 +1295,7 @@ namespace cg = cooperative_groups; } if !is_primitive && state != KernelState::OutBlock { write!(w, "{}}}\n", tabs)?; - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "{}{}.sync();\n", tabs, cg_tile)?; *num_tabs -= 1; } if !is_primitive && state == KernelState::OutBlock { @@ -1311,6 +1312,7 @@ namespace cg = cooperative_groups; } if !is_primitive && (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false)) + && !self.function.schedules[id.idx()].contains(&Schedule::NoResetConstant) { let data_size = self.get_size(self.typing[id.idx()], None); write!( @@ -1320,7 +1322,8 @@ namespace cg = cooperative_groups; )?; write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; write!(w, "{}}}\n", tabs)?; - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + write!(w, "__syncthreads\n")?; } } // Dynamic constants emitted at top @@ -1595,7 +1598,7 @@ namespace cg = cooperative_groups; write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?; write!(w, "{}}}\n", tabs)?; } - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "{}{}.sync();\n", tabs, cg_tile)?; let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } @@ -1705,20 +1708,20 @@ namespace cg = cooperative_groups; }; write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id )?; let cg_tile_use = self.get_cg_tile(id, CGType::Use); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota )?; let available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota )?; if parallel_factor.is_none() { @@ -1780,7 +1783,8 @@ namespace cg = cooperative_groups; } let fork = self.join_fork_map.get(&id).unwrap(); let cg_tile_available = self.get_cg_tile(*fork, CGType::Available); - write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + //write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + write!(w_term, "\t__syncthreads;\n")?; } // If the Fork was parallelized, each thread or UsedPerId tile of // threads only runs one ThreadID, so we can jump straight to the diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index d5c8dee6..4e5c1f74 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -38,7 +38,7 @@ fixpoint { } simpl!(collect); -fork-tile[32, 0, false, true](traverse, collect); +fork-tile[1024, 0, false, true](traverse, collect); fork-split(traverse, collect); unforkify(init); -- GitLab From d303b1850dfd142871ee9cb819f0edff33334d6a Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Mon, 3 Mar 2025 13:35:02 -0600 Subject: [PATCH 11/25] fix syncthreads hack --- hercules_cg/src/gpu.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 3a00e547..1e6067a3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -562,8 +562,9 @@ namespace cg = cooperative_groups; * and writes. */ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { + write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; - write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?; + write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?; Ok(()) } @@ -1322,8 +1323,8 @@ namespace cg = cooperative_groups; )?; write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; write!(w, "{}}}\n", tabs)?; - //write!(w, "{}{}.sync();\n", tabs, cg_tile)?; - write!(w, "__syncthreads\n")?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "__syncthreads\n")?; } } // Dynamic constants emitted at top @@ -1783,8 +1784,8 @@ namespace cg = cooperative_groups; } let fork = self.join_fork_map.get(&id).unwrap(); let cg_tile_available = self.get_cg_tile(*fork, CGType::Available); - //write!(w_term, "\t{}.sync();\n", cg_tile_available)?; - write!(w_term, "\t__syncthreads;\n")?; + write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + //write!(w_term, "\t__syncthreads;\n")?; } // If the Fork was parallelized, each thread or UsedPerId tile of // threads only runs one ThreadID, so we can jump straight to the -- GitLab From 416ed7b1e6595eb1dd2313e4d46b7d38b4b6ff68 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Mon, 3 Mar 2025 15:55:56 -0600 Subject: [PATCH 12/25] Parallelize both main loops in srad --- juno_samples/rodinia/srad/src/cpu.sch | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 7b7a6c9e..8917f03d 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -40,10 +40,15 @@ let split = fork-split(loop2); let loop2_body = outline(split.srad_1.fj1); simpl!(loop2, loop2_body); -inline(srad@loop2); +fork-tile[32, 0, false, false](loop3); +let split = fork-split(loop3); +let loop3_body = outline(split.srad_2.fj1); +simpl!(loop3, loop3_body); + +inline(srad@loop2, srad@loop3); delete-uncalled(*); -fork-split(extract, compress, loop1, loop2_body, loop3); -unforkify(extract, compress, loop1, loop2_body, loop3); +fork-split(extract, compress, loop1, loop2_body, loop3_body); +unforkify(extract, compress, loop1, loop2_body, loop3_body); gcm(*); -- GitLab From b156fad632e6d0a7746b81c8bcf9e9dda0ef9977 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Mon, 3 Mar 2025 15:56:23 -0600 Subject: [PATCH 13/25] Parallelize backprop --- juno_samples/rodinia/backprop/src/backprop.jn | 4 +-- juno_samples/rodinia/backprop/src/cpu.sch | 27 +++++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn index 2ca57c9f..70894c17 100644 --- a/juno_samples/rodinia/backprop/src/backprop.jn +++ b/juno_samples/rodinia/backprop/src/backprop.jn @@ -69,8 +69,8 @@ fn adjust_weights<n, m: usize>( weights: f32[n + 1, m + 1], prev_weights: f32[n + 1, m + 1] ) -> f32[n + 1, m + 1], f32[n + 1, m + 1] { - for j in 1..=m { - for k in 0..=n { + @outer_loop for j in 1..=m { + @inner_loop for k in 0..=n { let new_dw = ETA * delta[j] * vals[k] + MOMENTUM * prev_weights[k, j]; weights[k, j] += new_dw; prev_weights[k, j] = new_dw; diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 661ec531..865cc1a2 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -28,6 +28,29 @@ simpl!(*); fork-interchange[0, 1](adjust_weights); simpl!(*); -fork-split(*); -unforkify(*); +infer-schedules(*); + +fork-tile[32, 0, false, true](layer_forward@outer_loop \ layer_forward@inner_loop); +let (forward_outer, forward_inner) = fork-reshape[[1], [0]](layer_forward@outer_loop \ layer_forward@inner_loop); + +fork-tile[32, 0, false, true](adjust_weights); +let (adjust_outer, adjust_inner) = fork-reshape[[1], [0, 2]](adjust_weights); + +let forward_body = outline(forward_inner); +let adjust_body = outline(adjust_inner); + +rename["output_error"](output_error); +rename["hidden_error"](hidden_error); + +let output_error_body = auto-outline(output_error).output_error; +let hidden_error_body = auto-outline(hidden_error).hidden_error; + +inline(backprop); +delete-uncalled(*); +const-inline(*); + +simpl!(*); +fork-split(forward_body, adjust_body, output_error_body, hidden_error_body); +unforkify(forward_body, adjust_body, output_error_body, hidden_error_body); + gcm(*); -- GitLab From 17d338421b68e66c3c6e784a6a2e3ea901373383 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Mon, 3 Mar 2025 16:11:19 -0600 Subject: [PATCH 14/25] Unparallelize backprop --- juno_samples/rodinia/backprop/src/cpu.sch | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 865cc1a2..6899523e 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -30,27 +30,11 @@ simpl!(*); infer-schedules(*); -fork-tile[32, 0, false, true](layer_forward@outer_loop \ layer_forward@inner_loop); -let (forward_outer, forward_inner) = fork-reshape[[1], [0]](layer_forward@outer_loop \ layer_forward@inner_loop); - -fork-tile[32, 0, false, true](adjust_weights); -let (adjust_outer, adjust_inner) = fork-reshape[[1], [0, 2]](adjust_weights); - -let forward_body = outline(forward_inner); -let adjust_body = outline(adjust_inner); - -rename["output_error"](output_error); -rename["hidden_error"](hidden_error); - -let output_error_body = auto-outline(output_error).output_error; -let hidden_error_body = auto-outline(hidden_error).hidden_error; - -inline(backprop); delete-uncalled(*); const-inline(*); simpl!(*); -fork-split(forward_body, adjust_body, output_error_body, hidden_error_body); -unforkify(forward_body, adjust_body, output_error_body, hidden_error_body); +fork-split(*); +unforkify(*); gcm(*); -- GitLab From be6c3a90a2d3f35f251915e216c81bb9a85f2449 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Mon, 3 Mar 2025 16:36:31 -0600 Subject: [PATCH 15/25] New backprop schedule --- juno_samples/rodinia/backprop/src/backprop.jn | 8 ++++---- juno_samples/rodinia/backprop/src/cpu.sch | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn index 70894c17..7851cf47 100644 --- a/juno_samples/rodinia/backprop/src/backprop.jn +++ b/juno_samples/rodinia/backprop/src/backprop.jn @@ -91,15 +91,15 @@ fn backprop<input_n, hidden_n, output_n: usize>( ) -> f32, f32, f32[input_n + 1, hidden_n + 1], f32[input_n + 1, hidden_n + 1], f32[hidden_n + 1, output_n + 1], f32[hidden_n + 1, output_n + 1] { - let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights); - let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights); + @forward_input let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights); + @forward_hidden let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights); @output_error let out_err, out_delta = output_error::<output_n>(target, output_vals); @hidden_error let hid_err, hid_delta = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals); - let hidden_weights, hidden_prev_weights + @adjust_hidden let hidden_weights, hidden_prev_weights = adjust_weights::<hidden_n, output_n>(out_delta, hidden_vals, hidden_weights, hidden_prev_weights); - let input_weights, input_prev_weights + @adjust_input let input_weights, input_prev_weights = adjust_weights::<input_n, hidden_n>(hid_delta, input_vals, input_weights, input_prev_weights); return out_err, hid_err, input_weights, input_prev_weights, hidden_weights, hidden_prev_weights; diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 6899523e..9a5f4d75 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -30,11 +30,24 @@ simpl!(*); infer-schedules(*); +// The first call to layer_forward can be parallelized by 16 (the size of the +// hidden layer) and the second can't be parallelized at all (the size of the +// output layer is 1) +inline(backprop@forward_input, backprop@forward_hidden); +let forward_input = outline(backprop@forward_input); +let forward_hidden = outline(backprop@forward_hidden); + +fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop); +let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop); +let forward_input = outline(inner); +inline(backprop@forward_input); + delete-uncalled(*); const-inline(*); simpl!(*); fork-split(*); -unforkify(*); +unforkify(output_error, hidden_error, adjust_weights, forward_hidden, forward_input); +simpl!(*); gcm(*); -- GitLab From 634e17e9bf1aa169b384c06d0f7fb3591a0afa10 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Mon, 3 Mar 2025 16:39:40 -0600 Subject: [PATCH 16/25] More parallelism for backprop --- juno_samples/rodinia/backprop/src/cpu.sch | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 9a5f4d75..3c7f7d5f 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -42,12 +42,25 @@ let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_i let forward_input = outline(inner); inline(backprop@forward_input); +// The first call to adjust_weights has total loop dimensions of 1 * 17, so not +// worth parallelizing (given that the body is trivial) +// The second call to adjust_weights has a total dimension of 16 * (input + 1) +// which is worth parallelizing, we'll do it by 16 +inline(backprop@adjust_hidden, backprop@adjust_input); +let adjust_hidden = outline(backprop@adjust_hidden); +let adjust_input = outline(backprop@adjust_input); + +fork-tile[16, 0, false, true](adjust_input); +let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input); +let adjust_input = outline(inner); +inline(backprop@adjust_input); + delete-uncalled(*); const-inline(*); simpl!(*); fork-split(*); -unforkify(output_error, hidden_error, adjust_weights, forward_hidden, forward_input); +unforkify(output_error, hidden_error, adjust_hidden, adjust_input, forward_hidden, forward_input); simpl!(*); gcm(*); -- GitLab From 53b31022651e397dbae56491c5b9447399b9af42 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Mon, 3 Mar 2025 16:41:00 -0600 Subject: [PATCH 17/25] Broken bfs schedule --- juno_samples/rodinia/bfs/src/bfs.jn | 2 +- juno_samples/rodinia/bfs/src/cpu.sch | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index f82d9d80..d6ec25f2 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -53,7 +53,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] mask[i] = true; visited[i] = true; updated[i] = false; - stop_prod.stop = updated[i]; + stop_prod.stop = updated[i]; } } stop = stop_prod.stop; diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 589b93b1..55d25095 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -37,5 +37,21 @@ fixpoint { } simpl!(collect); -unforkify(init, traverse, collect); +parallel-fork(traverse, collect); +parallel-reduce(traverse, collect); + +fork-tile[32, 0, false, true](init, traverse, collect); +let (outer, inner) = fork-reshape[[1], [0]](init); +let init_body = outline(inner); +let (outer, inner) = fork-reshape[[1], [0]](traverse); +let traverse_body = outline(inner); +let (outer, inner) = fork-reshape[[1], [0]](collect); +let collect_body = outline(inner); + +inline(bfs); +delete-uncalled(*); +const-inline(*); +simpl!(*); + +unforkify(init_body, traverse_body, collect_body); gcm(*); -- GitLab From bcd279ad741b348012a3fa7461957411d8b52e43 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Mon, 3 Mar 2025 17:19:36 -0600 Subject: [PATCH 18/25] Fix bfs --- juno_samples/rodinia/bfs/src/cpu.sch | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 55d25095..e7d88a29 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -48,10 +48,8 @@ let traverse_body = outline(inner); let (outer, inner) = fork-reshape[[1], [0]](collect); let collect_body = outline(inner); -inline(bfs); -delete-uncalled(*); const-inline(*); simpl!(*); unforkify(init_body, traverse_body, collect_body); -gcm(*); +gcm(*); \ No newline at end of file -- GitLab From 085d0a3a11b1ea232af70b03c58f17c0e28cb42e Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Mon, 3 Mar 2025 17:48:16 -0600 Subject: [PATCH 19/25] Parallel BFS schedule --- juno_samples/rodinia/bfs/src/cpu.sch | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index e7d88a29..648e2be9 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -48,8 +48,11 @@ let traverse_body = outline(inner); let (outer, inner) = fork-reshape[[1], [0]](collect); let collect_body = outline(inner); +inline(bfs@cost_init, bfs@loop1, bfs@loop2); +delete-uncalled(*); const-inline(*); -simpl!(*); unforkify(init_body, traverse_body, collect_body); -gcm(*); \ No newline at end of file +simpl!(*); +gcm(*); +xdot[true](bfs); -- GitLab From 31523c2ea072b90a56b521ac33455a0ec0f34ed0 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Mon, 3 Mar 2025 17:58:19 -0600 Subject: [PATCH 20/25] Parallel bfs --- juno_samples/rodinia/bfs/src/cpu.sch | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 648e2be9..e5100561 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -40,19 +40,23 @@ simpl!(collect); parallel-fork(traverse, collect); parallel-reduce(traverse, collect); -fork-tile[32, 0, false, true](init, traverse, collect); -let (outer, inner) = fork-reshape[[1], [0]](init); -let init_body = outline(inner); +fork-tile[32, 0, false, true](traverse, collect); let (outer, inner) = fork-reshape[[1], [0]](traverse); let traverse_body = outline(inner); let (outer, inner) = fork-reshape[[1], [0]](collect); let collect_body = outline(inner); -inline(bfs@cost_init, bfs@loop1, bfs@loop2); +let init_body = init; +// Following code seems to generate breaking RT code +//fork-tile[32, 0, false, true](init); +//let (outer, inner) = fork-reshape[[1], [0]](init); +//let init_body = outline(inner); +//inline(bfs@cost_init); + +inline(bfs@loop1, bfs@loop2); delete-uncalled(*); const-inline(*); unforkify(init_body, traverse_body, collect_body); simpl!(*); gcm(*); -xdot[true](bfs); -- GitLab From f5fe6b01eec57eeb770cfa273aebcf49d7cd0166 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Mon, 3 Mar 2025 22:51:55 -0600 Subject: [PATCH 21/25] Larger rodinia benches --- .../rodinia/backprop/benches/backprop_bench.rs | 4 ++-- juno_samples/rodinia/bfs/benches/bfs_bench.rs | 13 +++++++++++++ juno_samples/rodinia/cfd/benches/cfd_bench.rs | 13 +++++++------ juno_samples/rodinia/srad/benches/srad_bench.rs | 9 +++++---- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs index 17bdf6a7..98d582b5 100644 --- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs +++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs @@ -18,7 +18,7 @@ fn backprop_bench(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(7); - let input_n = 65536; + let input_n = 33554432; let hidden_n = 16; let output_n = 1; @@ -46,7 +46,7 @@ fn backprop_bench(c: &mut Criterion) { let mut input_prev_weights = HerculesMutBox::from(input_prev_weights.to_vec()); let mut hidden_prev_weights = HerculesMutBox::from(hidden_prev_weights.to_vec()); - group.bench_function("backprop bench", |b| { + group.bench_function("backprop bench large", |b| { b.iter(|| { async_std::task::block_on(async { r.run( diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs index bf39a0fc..dbe6db38 100644 --- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs +++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs @@ -9,6 +9,7 @@ use juno_bfs::graph_parser::*; fn bfs_bench(c: &mut Criterion) { let mut group = c.benchmark_group("bfs bench"); + group.sample_size(10); let mut r = runner!(bfs); @@ -35,6 +36,18 @@ fn bfs_bench(c: &mut Criterion) { async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); }) }); + + let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt"; + let (nodes, source, edges) = parse_graph(input.into()); + let n = nodes.len() as u64; + let m = edges.len() as u64; + let nodes = HerculesImmBox::from(&nodes as &[Node]); + let edges = HerculesImmBox::from(&edges as &[u32]); + group.bench_function("bfs bench 64M", |b| { + b.iter(|| { + async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); + }) + }); } criterion_group!(benches, bfs_bench); diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs index 5fc73db9..962c5249 100644 --- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs +++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs @@ -10,10 +10,11 @@ use juno_cfd::*; fn cfd_bench(c: &mut Criterion) { let mut group = c.benchmark_group("cfd bench"); + group.sample_size(10); let mut r = runner!(euler); - let data_file = "data/fvcorr.domn.097K".to_string(); - let iterations = 1; + let data_file = "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M".to_string(); + let iterations = 150; let block_size = 16; let FarFieldConditions { ff_variable, @@ -44,7 +45,7 @@ fn cfd_bench(c: &mut Criterion) { let normals_y = HerculesImmBox::from(normals.y.as_slice()); let normals_z = HerculesImmBox::from(normals.z.as_slice()); - group.bench_function("cfd bench euler", |b| { + group.bench_function("cfd bench euler large", |b| { b.iter(|| { async_std::task::block_on(async { r.run( @@ -84,8 +85,8 @@ fn cfd_bench(c: &mut Criterion) { }); let mut r = runner!(pre_euler); - let data_file = "data/fvcorr.domn.097K".to_string(); - let iterations = 1; + let data_file = "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M".to_string(); + let iterations = 150; let block_size = 16; let FarFieldConditions { ff_variable, @@ -116,7 +117,7 @@ fn cfd_bench(c: &mut Criterion) { let normals_y = HerculesImmBox::from(normals.y.as_slice()); let normals_z = HerculesImmBox::from(normals.z.as_slice()); - group.bench_function("cfd bench pre-euler", |b| { + group.bench_function("cfd bench pre-euler large", |b| { b.iter(|| { async_std::task::block_on(async { r.run( diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs index 6af13aae..23dc0643 100644 --- a/juno_samples/rodinia/srad/benches/srad_bench.rs +++ b/juno_samples/rodinia/srad/benches/srad_bench.rs @@ -9,12 +9,13 @@ use juno_srad::*; fn srad_bench(c: &mut Criterion) { let mut group = c.benchmark_group("srad bench"); + group.sample_size(10); let mut r = runner!(srad); - let niter = 100; + let niter = 30; let lambda = 0.5; - let nrows = 512; - let ncols = 512; + let nrows = 2048; + let ncols = 2048; let image = "data/image.pgm".to_string(); let Image { image: image_ori, @@ -24,7 +25,7 @@ fn srad_bench(c: &mut Criterion) { } = read_graphics(image); let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); let mut image_h = HerculesMutBox::from(image.clone()); - group.bench_function("srad bench", |b| { + group.bench_function("srad bench large", |b| { b.iter(|| { async_std::task::block_on(async { r.run( -- GitLab From cf91c1033bd3c6ed71e46ad93998daba178c307d Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 09:35:43 -0600 Subject: [PATCH 22/25] Create two benches in backprop, one old size, one new size --- .../backprop/benches/backprop_bench.rs | 87 ++++++++++--------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs index 98d582b5..492bce3b 100644 --- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs +++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs @@ -9,7 +9,8 @@ juno_build::juno!("backprop"); // We need this even though we don't use anything from the library because of // Rust build scripts only linking static libraries into the library, and not -// into the benchmark binary. Ugh! +// into the benchmark binary. Yuck! +#[allow(unused_imports)] use juno_backprop::*; fn backprop_bench(c: &mut Criterion) { @@ -18,52 +19,56 @@ fn backprop_bench(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(7); - let input_n = 33554432; - let hidden_n = 16; - let output_n = 1; + let mut bench = |name, input_n: usize| { + let hidden_n = 16; + let output_n = 1; - let mut input_vals = vec![0.0f32; input_n + 1]; - input_vals[0] = 1.0; + let mut input_vals = vec![0.0f32; input_n + 1]; + input_vals[0] = 1.0; - // For some reason the bpnn_randomize_row function used on target just sets it to 0.1 - let target = vec![0.1f32; output_n + 1]; + // For some reason the bpnn_randomize_row function used on target just sets it to 0.1 + let target = vec![0.1f32; output_n + 1]; - let input_weights = (0..(input_n + 1) * (hidden_n + 1)) - .map(|_| rng.random::<f32>()) - .collect::<Vec<_>>(); - let hidden_weights = (0..(hidden_n + 1) * (output_n + 1)) - .map(|_| rng.random::<f32>()) - .collect::<Vec<_>>(); + let input_weights = (0..(input_n + 1) * (hidden_n + 1)) + .map(|_| rng.random::<f32>()) + .collect::<Vec<_>>(); + let hidden_weights = (0..(hidden_n + 1) * (output_n + 1)) + .map(|_| rng.random::<f32>()) + .collect::<Vec<_>>(); - let input_prev_weights = vec![0.0; (input_n + 1) * (hidden_n + 1)]; - let hidden_prev_weights = vec![0.0; (hidden_n + 1) * (output_n + 1)]; + let input_prev_weights = vec![0.0; (input_n + 1) * (hidden_n + 1)]; + let hidden_prev_weights = vec![0.0; (hidden_n + 1) * (output_n + 1)]; - let mut r = runner!(backprop); - let input_vals = HerculesImmBox::from(&input_vals as &[f32]); - let target = HerculesImmBox::from(&target as &[f32]); - let mut input_weights = HerculesMutBox::from(input_weights.to_vec()); - let mut hidden_weights = HerculesMutBox::from(hidden_weights.to_vec()); - let mut input_prev_weights = HerculesMutBox::from(input_prev_weights.to_vec()); - let mut hidden_prev_weights = HerculesMutBox::from(hidden_prev_weights.to_vec()); + let mut r = runner!(backprop); + let input_vals = HerculesImmBox::from(&input_vals as &[f32]); + let target = HerculesImmBox::from(&target as &[f32]); + let mut input_weights = HerculesMutBox::from(input_weights.to_vec()); + let mut hidden_weights = HerculesMutBox::from(hidden_weights.to_vec()); + let mut input_prev_weights = HerculesMutBox::from(input_prev_weights.to_vec()); + let mut hidden_prev_weights = HerculesMutBox::from(hidden_prev_weights.to_vec()); - group.bench_function("backprop bench large", |b| { - b.iter(|| { - async_std::task::block_on(async { - r.run( - input_n as u64, - hidden_n as u64, - output_n as u64, - input_vals.to(), - input_weights.to(), - hidden_weights.to(), - target.to(), - input_prev_weights.to(), - hidden_prev_weights.to(), - ) - .await - }); - }) - }); + group.bench_function(name, |b| { + b.iter(|| { + async_std::task::block_on(async { + r.run( + input_n as u64, + hidden_n as u64, + output_n as u64, + input_vals.to(), + input_weights.to(), + hidden_weights.to(), + target.to(), + input_prev_weights.to(), + hidden_prev_weights.to(), + ) + .await + }); + }) + }); + }; + + bench("backprop bench small", 65536); + bench("backprop bench large", 33554432); } criterion_group!(benches, backprop_bench); -- GitLab From d5af9dea72cfa307940f6e1904032245c71d15f2 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 09:41:38 -0600 Subject: [PATCH 23/25] BFS bench add useful error messages --- juno_samples/rodinia/bfs/benches/bfs_bench.rs | 6 +++--- juno_samples/rodinia/bfs/src/graph_parser.rs | 12 +++++++----- juno_samples/rodinia/bfs/src/lib.rs | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs index dbe6db38..9c09c180 100644 --- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs +++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs @@ -14,7 +14,7 @@ fn bfs_bench(c: &mut Criterion) { let mut r = runner!(bfs); let input = "data/graph4096.txt"; - let (nodes, source, edges) = parse_graph(input.into()); + let (nodes, source, edges) = parse_graph(input.into()).unwrap(); let n = nodes.len() as u64; let m = edges.len() as u64; let nodes = HerculesImmBox::from(&nodes as &[Node]); @@ -26,7 +26,7 @@ fn bfs_bench(c: &mut Criterion) { }); let input = "data/graph65536.txt"; - let (nodes, source, edges) = parse_graph(input.into()); + let (nodes, source, edges) = parse_graph(input.into()).unwrap(); let n = nodes.len() as u64; let m = edges.len() as u64; let nodes = HerculesImmBox::from(&nodes as &[Node]); @@ -38,7 +38,7 @@ fn bfs_bench(c: &mut Criterion) { }); let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt"; - let (nodes, source, edges) = parse_graph(input.into()); + let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines."); let n = nodes.len() as u64; let m = edges.len() as u64; let nodes = HerculesImmBox::from(&nodes as &[Node]); diff --git a/juno_samples/rodinia/bfs/src/graph_parser.rs b/juno_samples/rodinia/bfs/src/graph_parser.rs index fecd2a3e..f761d8ea 100644 --- a/juno_samples/rodinia/bfs/src/graph_parser.rs +++ b/juno_samples/rodinia/bfs/src/graph_parser.rs @@ -11,16 +11,18 @@ pub struct Node { pub num_edges: u32, } -pub fn parse_graph(file: String) -> (Vec<Node>, u32, Vec<u32>) { - let mut file = File::open(file).expect("Error opening input file"); +pub fn parse_graph(file: String) -> Result<(Vec<Node>, u32, Vec<u32>), String> { + let mut file = File::open(file).map_err(|err| format!("Error opening input file: {}", err))?; let mut contents = String::new(); file.read_to_string(&mut contents) - .expect("Error reading input file"); + .map_err(|err| format!("Error reading input file: {}", err))?; let mut parser = nom::combinator::all_consuming(graph_parser); - let (_, result) = parser.parse(&contents).expect("Parser error"); + let (_, result) = parser + .parse(&contents) + .map_err(|err| format!("Parsing error: {}", err))?; - result + Ok(result) } fn graph_parser<'a>(text: &'a str) -> nom::IResult<&'a str, (Vec<Node>, u32, Vec<u32>)> { diff --git a/juno_samples/rodinia/bfs/src/lib.rs b/juno_samples/rodinia/bfs/src/lib.rs index f6403540..e48dfe22 100644 --- a/juno_samples/rodinia/bfs/src/lib.rs +++ b/juno_samples/rodinia/bfs/src/lib.rs @@ -36,7 +36,7 @@ fn run_bfs(nodes: &[Node], source: u32, edges: &[u32]) -> Vec<i32> { pub fn bfs_harness(args: BFSInputs) { let BFSInputs { input } = args; - let (nodes, source, edges) = parse_graph(input); + let (nodes, source, edges) = parse_graph(input).unwrap(); let costs_juno = run_bfs(&nodes, source, &edges); let costs_ref = rust_bfs::bfs(&nodes, source, &edges); -- GitLab From 65a5416a9e3989502542e03d4e7a4ff867008772 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 09:56:13 -0600 Subject: [PATCH 24/25] cleanup cfd bench --- juno_samples/rodinia/bfs/benches/bfs_bench.rs | 36 +-- juno_samples/rodinia/cfd/benches/cfd_bench.rs | 280 +++++++++--------- juno_samples/rodinia/cfd/src/lib.rs | 2 +- juno_samples/rodinia/cfd/src/setup.rs | 30 +- 4 files changed, 185 insertions(+), 163 deletions(-) diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs index 9c09c180..f87b4e09 100644 --- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs +++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs @@ -13,37 +13,37 @@ fn bfs_bench(c: &mut Criterion) { let mut r = runner!(bfs); - let input = "data/graph4096.txt"; - let (nodes, source, edges) = parse_graph(input.into()).unwrap(); - let n = nodes.len() as u64; - let m = edges.len() as u64; - let nodes = HerculesImmBox::from(&nodes as &[Node]); - let edges = HerculesImmBox::from(&edges as &[u32]); group.bench_function("bfs bench 4096", |b| { + let input = "data/graph4096.txt"; + let (nodes, source, edges) = parse_graph(input.into()).unwrap(); + let n = nodes.len() as u64; + let m = edges.len() as u64; + let nodes = HerculesImmBox::from(&nodes as &[Node]); + let edges = HerculesImmBox::from(&edges as &[u32]); b.iter(|| { async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); }) }); - let input = "data/graph65536.txt"; - let (nodes, source, edges) = parse_graph(input.into()).unwrap(); - let n = nodes.len() as u64; - let m = edges.len() as u64; - let nodes = HerculesImmBox::from(&nodes as &[Node]); - let edges = HerculesImmBox::from(&edges as &[u32]); group.bench_function("bfs bench 65536", |b| { + let input = "data/graph65536.txt"; + let (nodes, source, edges) = parse_graph(input.into()).unwrap(); + let n = nodes.len() as u64; + let m = edges.len() as u64; + let nodes = HerculesImmBox::from(&nodes as &[Node]); + let edges = HerculesImmBox::from(&edges as &[u32]); b.iter(|| { async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); }) }); - let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt"; - let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines."); - let n = nodes.len() as u64; - let m = edges.len() as u64; - let nodes = HerculesImmBox::from(&nodes as &[Node]); - let edges = HerculesImmBox::from(&edges as &[u32]); group.bench_function("bfs bench 64M", |b| { + let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt"; + let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines."); + let n = nodes.len() as u64; + let m = edges.len() as u64; + let nodes = HerculesImmBox::from(&nodes as &[Node]); + let edges = HerculesImmBox::from(&edges as &[u32]); b.iter(|| { async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); }) diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs index 962c5249..aa6d7727 100644 --- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs +++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs @@ -12,149 +12,161 @@ fn cfd_bench(c: &mut Criterion) { let mut group = c.benchmark_group("cfd bench"); group.sample_size(10); - let mut r = runner!(euler); - let data_file = "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M".to_string(); - let iterations = 150; - let block_size = 16; - let FarFieldConditions { - ff_variable, - ff_fc_momentum_x, - ff_fc_momentum_y, - ff_fc_momentum_z, - ff_fc_density_energy, - } = set_far_field_conditions(); - let GeometryData { - nelr, - areas, - elements_surrounding_elements, - normals, - } = read_domain_geometry(data_file, block_size); - let mut variables = initialize_variables(nelr, &ff_variable); + let mut euler_bench = |name, data_file, iterations| { + group.bench_function(name, |b| { + let mut r = runner!(euler); + let block_size = 16; + let FarFieldConditions { + ff_variable, + ff_fc_momentum_x, + ff_fc_momentum_y, + ff_fc_momentum_z, + ff_fc_density_energy, + } = set_far_field_conditions(); + let GeometryData { + nelr, + areas, + elements_surrounding_elements, + normals, + } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); + let mut variables = initialize_variables(nelr, &ff_variable); - let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); - let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); - let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); - let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); - let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); - let areas = HerculesImmBox::from(areas.as_slice()); - let elements_surrounding_elements = - HerculesImmBox::from(elements_surrounding_elements.as_slice()); + let areas = HerculesImmBox::from(areas.as_slice()); + let elements_surrounding_elements = + HerculesImmBox::from(elements_surrounding_elements.as_slice()); - let normals_x = HerculesImmBox::from(normals.x.as_slice()); - let normals_y = HerculesImmBox::from(normals.y.as_slice()); - let normals_z = HerculesImmBox::from(normals.z.as_slice()); + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); - group.bench_function("cfd bench euler large", |b| { - b.iter(|| { - async_std::task::block_on(async { - r.run( - nelr as u64, - iterations as u64, - v_density.to(), - v_momentum_x.to(), - v_momentum_y.to(), - v_momentum_z.to(), - v_energy.to(), - areas.to(), - elements_surrounding_elements.to(), - normals_x.to(), - normals_y.to(), - normals_z.to(), - ff_variable.density, - ff_variable.momentum.x, - ff_variable.momentum.y, - ff_variable.momentum.z, - ff_variable.energy, - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ff_fc_momentum_x.x, - ff_fc_momentum_x.y, - ff_fc_momentum_x.z, - ff_fc_momentum_y.x, - ff_fc_momentum_y.y, - ff_fc_momentum_y.z, - ff_fc_momentum_z.x, - ff_fc_momentum_z.y, - ff_fc_momentum_z.z, - ) - .await - }); - }) - }); + b.iter(|| { + async_std::task::block_on(async { + r.run( + nelr as u64, + iterations as u64, + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), + areas.to(), + elements_surrounding_elements.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, + ) + .await + }); + }) + }); + }; + euler_bench("cfd bench euler small", "data/fvcorr.domn.097K", 1); + euler_bench( + "cfd bench euler large", + "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M", + 150, + ); - let mut r = runner!(pre_euler); - let data_file = "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M".to_string(); - let iterations = 150; - let block_size = 16; - let FarFieldConditions { - ff_variable, - ff_fc_momentum_x, - ff_fc_momentum_y, - ff_fc_momentum_z, - ff_fc_density_energy, - } = set_far_field_conditions(); - let GeometryData { - nelr, - areas, - elements_surrounding_elements, - normals, - } = read_domain_geometry(data_file, block_size); - let mut variables = initialize_variables(nelr, &ff_variable); + let mut pre_euler_bench = |name, data_file, iterations| { + group.bench_function(name, |b| { + let mut r = runner!(pre_euler); + let block_size = 16; + let FarFieldConditions { + ff_variable, + ff_fc_momentum_x, + ff_fc_momentum_y, + ff_fc_momentum_z, + ff_fc_density_energy, + } = set_far_field_conditions(); + let GeometryData { + nelr, + areas, + elements_surrounding_elements, + normals, + } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); + let mut variables = initialize_variables(nelr, &ff_variable); - let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); - let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); - let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); - let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); - let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); - let areas = HerculesImmBox::from(areas.as_slice()); - let elements_surrounding_elements = - HerculesImmBox::from(elements_surrounding_elements.as_slice()); + let areas = HerculesImmBox::from(areas.as_slice()); + let elements_surrounding_elements = + HerculesImmBox::from(elements_surrounding_elements.as_slice()); - let normals_x = HerculesImmBox::from(normals.x.as_slice()); - let normals_y = HerculesImmBox::from(normals.y.as_slice()); - let normals_z = HerculesImmBox::from(normals.z.as_slice()); + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); - group.bench_function("cfd bench pre-euler large", |b| { - b.iter(|| { - async_std::task::block_on(async { - r.run( - nelr as u64, - iterations as u64, - v_density.to(), - v_momentum_x.to(), - v_momentum_y.to(), - v_momentum_z.to(), - v_energy.to(), - areas.to(), - elements_surrounding_elements.to(), - normals_x.to(), - normals_y.to(), - normals_z.to(), - ff_variable.density, - ff_variable.momentum.x, - ff_variable.momentum.y, - ff_variable.momentum.z, - ff_variable.energy, - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ff_fc_momentum_x.x, - ff_fc_momentum_x.y, - ff_fc_momentum_x.z, - ff_fc_momentum_y.x, - ff_fc_momentum_y.y, - ff_fc_momentum_y.z, - ff_fc_momentum_z.x, - ff_fc_momentum_z.y, - ff_fc_momentum_z.z, - ) - .await - }); - }) - }); + b.iter(|| { + async_std::task::block_on(async { + r.run( + nelr as u64, + iterations as u64, + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), + areas.to(), + elements_surrounding_elements.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, + ) + .await + }); + }) + }); + }; + pre_euler_bench("cfd bench pre-euler small", "data/fvcorr.domn.097K", 1); + pre_euler_bench( + "cfd bench pre-euler large", + "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M", + 150, + ); } criterion_group!(benches, cfd_bench); diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs index a9800ed0..3b1e21e6 100644 --- a/juno_samples/rodinia/cfd/src/lib.rs +++ b/juno_samples/rodinia/cfd/src/lib.rs @@ -237,7 +237,7 @@ pub fn cfd_harness(args: CFDInputs) { areas, elements_surrounding_elements, normals, - } = read_domain_geometry(data_file, block_size); + } = read_domain_geometry(data_file, block_size).unwrap(); let variables = initialize_variables(nelr, &ff_variable); println!("Running CFD with nelr = {}.", nelr); diff --git a/juno_samples/rodinia/cfd/src/setup.rs b/juno_samples/rodinia/cfd/src/setup.rs index 0da00251..669810d6 100644 --- a/juno_samples/rodinia/cfd/src/setup.rs +++ b/juno_samples/rodinia/cfd/src/setup.rs @@ -29,7 +29,7 @@ pub struct AlignedSlice<T> { impl<T> Debug for AlignedSlice<T> where - T: Debug + T: Debug, { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { write!(f, "{:?}", self.as_slice()) @@ -127,7 +127,15 @@ pub struct FarFieldConditions { } pub fn set_far_field_conditions() -> FarFieldConditions { - let mut ff_variable = Variable { density: 0.0, momentum: Float3 { x: 0.0, y: 0.0, z: 0.0 }, energy: 0.0 }; + let mut ff_variable = Variable { + density: 0.0, + momentum: Float3 { + x: 0.0, + y: 0.0, + z: 0.0, + }, + energy: 0.0, + }; let angle_of_attack = std::f32::consts::PI / 180.0 * deg_angle_of_attack; @@ -186,19 +194,21 @@ pub struct GeometryData { pub normals: Normals, } -pub fn read_domain_geometry<P>(path: P, block_size: usize) -> GeometryData +pub fn read_domain_geometry<P>(path: P, block_size: usize) -> Result<GeometryData, String> where P: AsRef<Path>, { - let mut file = File::open(path).expect("Error opening input file"); + let mut file = File::open(path).map_err(|err| format!("Error opening input file: {}", err))?; let mut contents = String::new(); file.read_to_string(&mut contents) - .expect("Error reading input file"); + .map_err(|err| format!("Error reading input file: {}", err))?; let mut parser = nom::combinator::all_consuming(|s| cfd_parser(block_size, s)); - let (_, result) = parser.parse(&contents).expect("Parser error"); + let (_, result) = parser + .parse(&contents) + .map_err(|err| format!("Parser error: {}", err))?; - result + Ok(result) } fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, GeometryData> { @@ -240,17 +250,17 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo let val = i32::from_str(val).unwrap(); let val = if neg { -val } else { val }; elements_surrounding_elements[i + j * nelr] = if val < 0 { -1 } else { val } - 1; // it's coming in with Fortran numbering - + let t = nom::character::complete::multispace0(text)?.0; let (t, val) = nom::number::complete::float(t)?; text = t; normals.x[i + j * nelr] = -val; - + let t = nom::character::complete::multispace0(text)?.0; let (t, val) = nom::number::complete::float(t)?; text = t; normals.y[i + j * nelr] = -val; - + let t = nom::character::complete::multispace0(text)?.0; let (t, val) = nom::number::complete::float(t)?; text = t; -- GitLab From 4cf9d708a9dddef41b1519cfe0cdec3da10b8d93 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 09:58:06 -0600 Subject: [PATCH 25/25] cleanup srad bench --- .../rodinia/srad/benches/srad_bench.rs | 62 ++++++++++--------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs index 23dc0643..bf0e4ad4 100644 --- a/juno_samples/rodinia/srad/benches/srad_bench.rs +++ b/juno_samples/rodinia/srad/benches/srad_bench.rs @@ -1,7 +1,7 @@ #![feature(concat_idents)] use criterion::{criterion_group, criterion_main, Criterion}; -use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox, HerculesMutBoxTo}; +use hercules_rt::{runner, HerculesMutBox, HerculesMutBoxTo}; juno_build::juno!("srad"); @@ -11,35 +11,37 @@ fn srad_bench(c: &mut Criterion) { let mut group = c.benchmark_group("srad bench"); group.sample_size(10); - let mut r = runner!(srad); - let niter = 30; - let lambda = 0.5; - let nrows = 2048; - let ncols = 2048; - let image = "data/image.pgm".to_string(); - let Image { - image: image_ori, - max, - rows: image_ori_rows, - cols: image_ori_cols, - } = read_graphics(image); - let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); - let mut image_h = HerculesMutBox::from(image.clone()); - group.bench_function("srad bench large", |b| { - b.iter(|| { - async_std::task::block_on(async { - r.run( - nrows as u64, - ncols as u64, - niter as u64, - image_h.to(), - max, - lambda, - ) - .await - }); - }) - }); + let mut bench = |name, niter, nrows, ncols| { + let mut r = runner!(srad); + let lambda = 0.5; + let image = "data/image.pgm".to_string(); + let Image { + image: image_ori, + max, + rows: image_ori_rows, + cols: image_ori_cols, + } = read_graphics(image); + let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); + let mut image_h = HerculesMutBox::from(image.clone()); + group.bench_function(name, |b| { + b.iter(|| { + async_std::task::block_on(async { + r.run( + nrows as u64, + ncols as u64, + niter as u64, + image_h.to(), + max, + lambda, + ) + .await + }); + }) + }); + }; + + bench("srad bench small", 100, 512, 512); + bench("srad bench large", 30, 2048, 2048); } criterion_group!(benches, srad_bench); -- GitLab