From 3f8acd3bb4d0b7388a74c7dc24e3d1e4ad9771b1 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 22:44:17 -0600 Subject: [PATCH 1/9] misc. progress for bfs gpu --- hercules_cg/src/rt.rs | 2 +- hercules_opt/src/gcm.rs | 70 ++++++++++++++++------------ juno_samples/rodinia/bfs/src/gpu.sch | 52 ++++++++++++++------- 3 files changed, 76 insertions(+), 48 deletions(-) diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 6981a3da..d94f0e19 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -938,7 +938,7 @@ impl<'a> RTContext<'a> { let dst_device = self.node_colors.0[&collect]; write!( block, - "::hercules_rt::__copy_{}_to_{}({}.byte_add({} as usize).0, {}.0, {});", + "::hercules_rt::__copy_{}_to_{}({}.byte_add({} as usize).0, {}.0, {} as usize);", src_device.name(), dst_device.name(), self.get_value(collect, bb, false), diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs index c612acac..3f326051 100644 --- a/hercules_opt/src/gcm.rs +++ b/hercules_opt/src/gcm.rs @@ -1304,37 +1304,49 @@ enum UTerm { fn unify( mut equations: VecDeque<(UTerm, UTerm)>, -) -> Result<BTreeMap<NodeID, Device>, BTreeMap<NodeID, Device>> { +) -> Result<BTreeMap<NodeID, Device>, (NodeID, NodeID)> { let mut theta = BTreeMap::new(); + // First, assign devices to nodes when a rule directly says to. + for _ in 0..equations.len() { + let (l, r) = equations.pop_front().unwrap(); + match (l, r) { + (UTerm::Node(n), UTerm::Device(d)) | (UTerm::Device(d), UTerm::Node(n)) => { + theta.insert(n, d); + } + _ => equations.push_back((l, r)), + } + } + + // Second, iterate the rest of the rules until... + // 1. The rules are exhausted. All the nodes have device assignments. + // 2. No progress is being made. Some nodes may not have device assignments. + // 3. An inconsistency has been found. The inconsistency is returned. let mut no_progress_iters = 0; while no_progress_iters <= equations.len() && let Some((l, r)) = equations.pop_front() { - match (l, r) { - (UTerm::Node(_), UTerm::Node(_)) => { - if l != r { - equations.push_back((l, r)); - } - no_progress_iters += 1; - } - (UTerm::Node(n), UTerm::Device(d)) | (UTerm::Device(d), UTerm::Node(n)) => { - theta.insert(n, d); - for (l, r) in equations.iter_mut() { - if *l == UTerm::Node(n) { - *l = UTerm::Device(d); - } - if *r == UTerm::Node(n) { - *r = UTerm::Device(d); - } + let (UTerm::Node(l), UTerm::Node(r)) = (l, r) else { + panic!(); + }; + + match (theta.get(&l), theta.get(&r)) { + (Some(ld), Some(rd)) => { + if ld != rd { + return Err((l, r)); + } else { + no_progress_iters = 0; } - no_progress_iters = 0; } - (UTerm::Device(d1), UTerm::Device(d2)) if d1 == d2 => { + (Some(d), None) | (None, Some(d)) => { + let d = *d; + theta.insert(l, d); + theta.insert(r, d); no_progress_iters = 0; } - _ => { - return Err(theta); + (None, None) => { + equations.push_back((UTerm::Node(l), UTerm::Node(r))); + no_progress_iters += 1; } } } @@ -1377,8 +1389,8 @@ fn color_nodes( } if !editor.get_type(typing[id.idx()]).is_primitive() => { // Every input to a phi needs to be on the same device. The // phi itself is also on this device. - for (l, r) in zip(data.into_iter(), data.into_iter().skip(1).chain(once(&id))) { - equations.push((UTerm::Node(*l), UTerm::Node(*r))); + for data in data { + equations.push((UTerm::Node(*data), UTerm::Node(id))); } } Node::Reduce { @@ -1394,7 +1406,7 @@ fn color_nodes( } if !editor.get_type(typing[id.idx()]).is_primitive() => { // Every input to the reduce, and the reduce itself, are on // the same device. - equations.push((UTerm::Node(first), UTerm::Node(second))); + equations.push((UTerm::Node(first), UTerm::Node(id))); equations.push((UTerm::Node(second), UTerm::Node(id))); } Node::Constant { id: _ } @@ -1533,12 +1545,12 @@ fn color_nodes( } Some(func_colors) } - Err(progress) => { + Err(inconsistency) => { // If unification failed, then there's some node using a node in - // `progress` that's expecting a different type than what it got. - // Pick one and add potentially inter-device copies on each def-use - // edge. We'll clean these up later. - let (id, _) = progress.into_iter().next().unwrap(); + // that's expecting a different type than what it got. Pick one and + // add potentially inter-device copies on each def-use edge. We'll + // clean these up later. + let id = inconsistency.0; let users: Vec<_> = editor.get_users(id).collect(); let success = editor.edit(|mut edit| { let cons = edit.add_zero_constant(typing[id.idx()]); diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 0a3f4d77..49d44b98 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -1,23 +1,39 @@ -gvn(*); -phi-elim(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} -let outline = auto-outline(bfs); -gpu(outline.bfs); +phi-elim(bfs); +no-memset(bfs@cost); +let cost_init = outline(bfs@cost_init); +let loop1 = outline(bfs@loop1); +let loop2 = outline(bfs@loop2); +gpu(cost_init, loop1, loop2); -ip-sroa(*); -sroa(*); -dce(*); -gvn(*); -phi-elim(*); -dce(*); +simpl!(*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); +predication(*); +simpl!(*); -//forkify(*); -infer-schedules(*); +parallel-reduce(loop1); +forkify(*); +fork-guard-elim(*); +simpl!(*); +predication(*); +reduce-slf(*); +simpl!(*); gcm(*); -fixpoint { - float-collections(*); - dce(*); - gcm(*); -} +xdot[true](*); -- GitLab From 2654f19895ca35203f8431d46613c5566e194e40 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 22:50:22 -0600 Subject: [PATCH 2/9] oh my --- juno_samples/rodinia/bfs/src/gpu.sch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 49d44b98..5be2d903 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -13,7 +13,7 @@ no-memset(bfs@cost); let cost_init = outline(bfs@cost_init); let loop1 = outline(bfs@loop1); let loop2 = outline(bfs@loop2); -gpu(cost_init, loop1, loop2); +gpu(loop1, loop2); simpl!(*); predication(*); @@ -35,5 +35,5 @@ predication(*); reduce-slf(*); simpl!(*); +unforkify(cost_init); gcm(*); -xdot[true](*); -- GitLab From ab1eafd464dd73b2e9f81633e42ede950766f600 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 28 Feb 2025 09:52:12 -0600 Subject: [PATCH 3/9] fix unify --- hercules_opt/src/gcm.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs index 3f326051..1dcedd97 100644 --- a/hercules_opt/src/gcm.rs +++ b/hercules_opt/src/gcm.rs @@ -1302,9 +1302,7 @@ enum UTerm { Device(Device), } -fn unify( - mut equations: VecDeque<(UTerm, UTerm)>, -) -> Result<BTreeMap<NodeID, Device>, (NodeID, NodeID)> { +fn unify(mut equations: VecDeque<(UTerm, UTerm)>) -> Result<BTreeMap<NodeID, Device>, NodeID> { let mut theta = BTreeMap::new(); // First, assign devices to nodes when a rule directly says to. @@ -1312,7 +1310,11 @@ fn unify( let (l, r) = equations.pop_front().unwrap(); match (l, r) { (UTerm::Node(n), UTerm::Device(d)) | (UTerm::Device(d), UTerm::Node(n)) => { - theta.insert(n, d); + if let Some(old_d) = theta.insert(n, d) + && old_d != d + { + return Err(n); + } } _ => equations.push_back((l, r)), } @@ -1333,7 +1335,7 @@ fn unify( match (theta.get(&l), theta.get(&r)) { (Some(ld), Some(rd)) => { if ld != rd { - return Err((l, r)); + return Err(l); } else { no_progress_iters = 0; } @@ -1545,12 +1547,11 @@ fn color_nodes( } Some(func_colors) } - Err(inconsistency) => { + Err(id) => { // If unification failed, then there's some node using a node in - // that's expecting a different type than what it got. Pick one and - // add potentially inter-device copies on each def-use edge. We'll - // clean these up later. - let id = inconsistency.0; + // that's expecting a different type than what it got. Add + // potentially inter-device copies on each def-use edge. We'll clean + // these up later. let users: Vec<_> = editor.get_users(id).collect(); let success = editor.edit(|mut edit| { let cons = edit.add_zero_constant(typing[id.idx()]); -- GitLab From 96c8dc524fe30a271e57a2f9d02f3759411767ad Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 28 Feb 2025 11:14:12 -0600 Subject: [PATCH 4/9] sequentializing channel loop in gamut in cava is beneficial --- juno_samples/cava/src/gpu.sch | 1 + 1 file changed, 1 insertion(+) diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index c8db124e..bacfd3ab 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -120,6 +120,7 @@ simpl!(fuse4); //fork-tile[2, 0, false, true](fuse4@channel_loop); //fork-split(fuse4@channel_loop); //clean-monoid-reduces(fuse4); +unforkify(fuse4@channel_loop); no-memset(fuse5@res1); no-memset(fuse5@res2); -- GitLab From 28ea01a1afcbb630b570e360693defb3e76c70d9 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 28 Feb 2025 13:24:48 -0600 Subject: [PATCH 5/9] Infer more indices as parallel --- hercules_opt/src/utils.rs | 18 ++++++++++++++++++ juno_samples/rodinia/backprop/src/backprop.jn | 4 ++-- juno_samples/rodinia/backprop/src/cpu.sch | 10 +++------- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs index e962b81d..b910a128 100644 --- a/hercules_opt/src/utils.rs +++ b/hercules_opt/src/utils.rs @@ -532,6 +532,24 @@ where let fork_thread_id_pairs = node_indices(indices).filter_map(|id| { if let Node::ThreadID { control, dimension } = nodes[id.idx()] { Some((control, dimension)) + } else if let Node::Binary { + op: BinaryOperator::Add, + left: tid, + right: cons, + } = nodes[id.idx()] + && let Node::ThreadID { control, dimension } = nodes[tid.idx()] + && (nodes[cons.idx()].is_constant() || nodes[cons.idx()].is_dynamic_constant()) + { + Some((control, dimension)) + } else if let Node::Binary { + op: BinaryOperator::Add, + left: cons, + right: tid, + } = nodes[id.idx()] + && let Node::ThreadID { control, dimension } = nodes[tid.idx()] + && (nodes[cons.idx()].is_constant() || nodes[cons.idx()].is_dynamic_constant()) + { + Some((control, dimension)) } else { None } diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn index 356bb3d9..94c4334c 100644 --- a/juno_samples/rodinia/backprop/src/backprop.jn +++ b/juno_samples/rodinia/backprop/src/backprop.jn @@ -7,9 +7,9 @@ fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f @res let result : f32[m + 1]; result[0] = 1.0; - for j in 1..=m { + @outer_loop for j in 1..=m { let sum = 0.0; - for k in 0..=n { + @inner_loop for k in 0..=n { sum += weights[k, j] * vals[k]; } result[j] = squash(sum); diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index d1fe8953..d59fd5f5 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -15,20 +15,16 @@ delete-uncalled(*); no-memset(layer_forward@res); lift-dc-math(*); loop-bound-canon(*); -dce(*); +simpl!(*); lift-dc-math(*); +slf(*); fixpoint { forkify(*); fork-guard-elim(*); fork-coalesce(*); } +simpl!(*); fork-split(*); -gvn(*); -phi-elim(*); -dce(*); unforkify(*); -gvn(*); -phi-elim(*); -dce(*); gcm(*); -- GitLab From 3565d96d31fa97f32892548167b359843fafb6dc Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 28 Feb 2025 14:28:50 -0600 Subject: [PATCH 6/9] Tweak edge multi-core --- juno_samples/edge_detection/src/cpu.sch | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index 4bd3254b..ec9e423d 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -86,7 +86,7 @@ fixpoint { simpl!(max_gradient); fork-dim-merge(max_gradient); simpl!(max_gradient); -fork-tile[8, 0, false, false](max_gradient); +fork-tile[16, 0, false, false](max_gradient); let split = fork-split(max_gradient); clean-monoid-reduces(max_gradient); let out = outline(split._4_max_gradient.fj1); @@ -104,11 +104,18 @@ fixpoint { } predication(reject_zero_crossings); simpl!(reject_zero_crossings); +fork-tile[4, 1, false, false](reject_zero_crossings); +fork-tile[4, 0, false, false](reject_zero_crossings); +fork-interchange[1, 2](reject_zero_crossings); +let split = fork-split(reject_zero_crossings); +let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); +fork-coalesce(reject_zero_crossings, reject_zero_crossings_body); +simpl!(reject_zero_crossings, reject_zero_crossings_body); async-call(edge_detection@le, edge_detection@zc); -fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings); -unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings); +fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body); +unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body); simpl!(*); -- GitLab From da2cf7557ad1b74556df070c76d16094b9f20256 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 28 Feb 2025 14:45:40 -0600 Subject: [PATCH 7/9] gcm moment to allow reduce-slf in CPU code without worry --- hercules_opt/src/gcm.rs | 7 +++++++ juno_samples/rodinia/srad/src/cpu.sch | 2 ++ 2 files changed, 9 insertions(+) diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs index 1dcedd97..2579e73e 100644 --- a/hercules_opt/src/gcm.rs +++ b/hercules_opt/src/gcm.rs @@ -883,6 +883,13 @@ fn spill_clones( init == *b && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce) }) + .unwrap_or(false) + && !editor.func().nodes[a.idx()] + .try_phi() + .map(|(_, data)| { + data.contains(b) + && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce) + }) .unwrap_or(false)) }); diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 2b45e8c9..a4cd4956 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -29,6 +29,8 @@ fixpoint { } simpl!(*); fork-interchange[0, 1](loop1); +reduce-slf(*); +simpl!(*); fork-split(*); unforkify(*); -- GitLab From 1b78bc2e9d2acce3e9b1eec8aed25489eb9aced5 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 28 Feb 2025 15:59:10 -0600 Subject: [PATCH 8/9] Refactor infer parallelreduce --- hercules_opt/src/schedule.rs | 46 ++++++----------------- juno_samples/rodinia/backprop/src/cpu.sch | 1 + 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/hercules_opt/src/schedule.rs b/hercules_opt/src/schedule.rs index 9bc7823e..10eca72e 100644 --- a/hercules_opt/src/schedule.rs +++ b/hercules_opt/src/schedule.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeSet, HashMap, HashSet}; +use std::iter::once; -use hercules_ir::def_use::*; use hercules_ir::ir::*; use crate::*; @@ -42,6 +42,10 @@ pub fn infer_parallel_reduce( fork_join_map: &HashMap<NodeID, NodeID>, reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>, ) { + let join_fork_map: HashMap<_, _> = fork_join_map + .into_iter() + .map(|(fork, join)| (*join, *fork)) + .collect(); for id in editor.node_ids() { let func = editor.func(); if !func.nodes[id.idx()].is_reduce() { @@ -98,40 +102,11 @@ pub fn infer_parallel_reduce( && *collect == last_reduce && !reduce_cycles[&last_reduce].contains(data) { - // If there is a Write-Reduce tight cycle, get the position indices. - let positions = indices - .iter() - .filter_map(|index| { - if let Index::Position(indices) = index { - Some(indices) - } else { - None - } - }) - .flat_map(|pos| pos.iter()); - - // Get the Forks corresponding to uses of bare ThreadIDs. - let fork_thread_id_pairs = positions.filter_map(|id| { - if let Node::ThreadID { control, dimension } = func.nodes[id.idx()] { - Some((control, dimension)) - } else { - None - } - }); - let mut forks = HashMap::<NodeID, Vec<usize>>::new(); - for (fork, dim) in fork_thread_id_pairs { - forks.entry(fork).or_default().push(dim); - } - - // Check if one of the Forks correspond to the Join associated with - // the Reduce being considered, and has all of its dimensions - // represented in the indexing. - let is_parallel = forks.into_iter().any(|(id, mut rep_dims)| { - rep_dims.sort(); - rep_dims.dedup(); - fork_join_map[&id] == first_control.unwrap() - && func.nodes[id.idx()].try_fork().unwrap().1.len() == rep_dims.len() - }); + let is_parallel = indices_parallel_over_forks( + editor, + indices, + once(join_fork_map[&first_control.unwrap()]), + ); if is_parallel { editor.edit(|edit| edit.add_schedule(id, Schedule::ParallelReduce)); @@ -145,6 +120,7 @@ pub fn infer_parallel_reduce( * operands must be the Reduce node, and all other operands must not be in the * Reduce node's cycle. */ +#[rustfmt::skip] pub fn infer_monoid_reduce( editor: &mut FunctionEditor, reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>, diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index d59fd5f5..de34d660 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -23,6 +23,7 @@ fixpoint { fork-guard-elim(*); fork-coalesce(*); } +reduce-slf(*); simpl!(*); fork-split(*); -- GitLab From eaf09eb1b58e1d159839aa0c4b474e7d62d5dd09 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 28 Feb 2025 16:31:57 -0600 Subject: [PATCH 9/9] hmm --- hercules_opt/src/gcm.rs | 4 ++-- juno_samples/rodinia/bfs/src/gpu.sch | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs index 2579e73e..d950941a 100644 --- a/hercules_opt/src/gcm.rs +++ b/hercules_opt/src/gcm.rs @@ -879,8 +879,8 @@ fn spill_clones( || editor.func().nodes[a.idx()].is_reduce()) && !editor.func().nodes[a.idx()] .try_reduce() - .map(|(_, init, _)| { - init == *b + .map(|(_, init, reduct)| { + (init == *b || reduct == *b) && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce) }) .unwrap_or(false) diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 5be2d903..6c4d027b 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -27,6 +27,7 @@ simpl!(*); predication(*); simpl!(*); +unforkify(cost_init); parallel-reduce(loop1); forkify(*); fork-guard-elim(*); @@ -35,5 +36,4 @@ predication(*); reduce-slf(*); simpl!(*); -unforkify(cost_init); gcm(*); -- GitLab