diff --git a/.gitignore b/.gitignore index 516108ddfe2c09b0e5d556e6b4ca96c8a9b3a72d..6298d7bbc1b72492a0d1da0892a78ff2c3f13b85 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -/target +**/target *.dot !paper_resources/*.dot *.bc @@ -14,3 +14,4 @@ .vscode *_env *.txt +*ncu-rep \ No newline at end of file diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index c9720273c03243d4874b27fc6c74f20fd21a6c33..07dd3ebfc551b84b20cea797421bf5e6846f30c8 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups; // because Fork basic block's init section already does gating write!( w, - "{}{} = (threadIdx.x % {}) / {};\n", + "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n", tabs, define_variable, use_thread_quota.unwrap(), - use_thread_quota.unwrap() / parallel_factor.unwrap() + use_thread_quota.unwrap() / parallel_factor.unwrap(), + divide, + modulo, )?; } } diff --git a/hercules_ir/src/typecheck.rs b/hercules_ir/src/typecheck.rs index 2a3f9fb1aa86dd092d048bf79b51aa118d2489b8..b2567b8f1313e97aeb70268b11847b6d70b1a2f5 100644 --- a/hercules_ir/src/typecheck.rs +++ b/hercules_ir/src/typecheck.rs @@ -716,8 +716,10 @@ fn typeflow( // Check number of run-time arguments. if inputs.len() - 1 != callee.param_types.len() { return Error(format!( - "Call node has {} inputs, but calls a function with {} parameters.", + "Call node in {} has {} inputs, but calls a function ({}) with {} parameters.", + function.name, inputs.len() - 1, + callee.name, callee.param_types.len(), )); } @@ -725,8 +727,10 @@ fn typeflow( // Check number of dynamic constant arguments. if dc_args.len() != callee.num_dynamic_constants as usize { return Error(format!( - "Call node references {} dynamic constants, but calls a function expecting {} dynamic constants.", + "Call node in {} references {} dynamic constants, but calls a function ({}) expecting {} dynamic constants.", + function.name, dc_args.len(), + callee.name, callee.num_dynamic_constants )); } diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index e6db0345def31324243cdee2bdcb6b5cca5d9a7b..8bd3f735fa60cf4e0404edde28eb858e0ed7e581 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -1169,6 +1169,7 @@ pub fn fork_dim_merge( op: BinaryOperator::Rem, }); edit.sub_edit(tid, rem); + edit.sub_edit(tid, outer_tid); edit = edit.replace_all_uses(tid, rem)?; } else if tid_dim == inner_idx { let outer_tid = Node::ThreadID { @@ -1185,6 +1186,7 @@ pub fn fork_dim_merge( op: BinaryOperator::Div, }); edit.sub_edit(tid, div); + edit.sub_edit(tid, outer_tid); edit = edit.replace_all_uses(tid, div)?; } } @@ -1479,7 +1481,12 @@ fn fork_fusion( } // Perform the fusion. + let bottom_tids: Vec<_> = editor + .get_users(bottom_fork) + .filter(|id| nodes[id.idx()].is_thread_id()) + .collect(); editor.edit(|mut edit| { + edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?; if bottom_join_pred != bottom_fork { // If there is control flow in the bottom fork-join, stitch it into // the top fork-join. diff --git a/hercules_opt/src/interprocedural_sroa.rs b/hercules_opt/src/interprocedural_sroa.rs index ad4ce19ede72c3fd373e67f1a51f46de00c47241..a01b0f5553ef59aaf45fe6ccc2c3ee7bdc881a0b 100644 --- a/hercules_opt/src/interprocedural_sroa.rs +++ b/hercules_opt/src/interprocedural_sroa.rs @@ -1,6 +1,3 @@ -use std::collections::HashMap; -use std::iter::zip; - use hercules_ir::ir::*; use crate::*; @@ -39,15 +36,30 @@ pub fn interprocedural_sroa( } let editor: &mut FunctionEditor = &mut editors[func_id.idx()]; + let param_types = &editor.func().param_types.to_vec(); let return_types = &editor.func().return_types.to_vec(); - // We determine the new return types of the function and track a map - // that tells us how the old return values are constructed from the - // new ones + // We determine the new param/return types of the function and track a + // map that tells us how the old param/return values are constructed + // from the new ones. + let mut new_param_types = vec![]; + let mut old_param_type_map = vec![]; let mut new_return_types = vec![]; let mut old_return_type_map = vec![]; let mut changed = false; + for par_typ in param_types.iter() { + if !can_sroa_type(editor, *par_typ) { + old_param_type_map.push(IndexTree::Leaf(new_param_types.len())); + new_param_types.push(*par_typ); + } else { + let (types, index) = sroa_type(editor, *par_typ, new_param_types.len()); + old_param_type_map.push(index); + new_param_types.extend(types); + changed = true; + } + } + for ret_typ in return_types.iter() { if !can_sroa_type(editor, *ret_typ) { old_return_type_map.push(IndexTree::Leaf(new_return_types.len())); @@ -60,25 +72,54 @@ pub fn interprocedural_sroa( } } - // If the return type is not changed by IP SROA, skip to the next function + // If the param/return types aren't changed by IP SROA, skip to the next + // function. if !changed { continue; } - // Now, modify each return in the current function and the return type - let return_nodes = editor - .func() - .nodes - .iter() - .enumerate() - .filter_map(|(idx, node)| { - if node.try_return().is_some() { - Some(NodeID::new(idx)) + // Modify each parameter in the current function and the param types. + let mut param_nodes: Vec<_> = vec![vec![]; param_types.len()]; + for id in editor.node_ids() { + if let Some(idx) = editor.func().nodes[id.idx()].try_parameter() { + param_nodes[idx].push(id); + } + } + let success = editor.edit(|mut edit| { + for (idx, ids) in param_nodes.into_iter().enumerate() { + let new_indices = &old_param_type_map[idx]; + let built = if let IndexTree::Leaf(new_idx) = new_indices { + edit.add_node(Node::Parameter { index: *new_idx }) } else { - None + let prod_ty = param_types[idx]; + let cons = edit.add_zero_constant(prod_ty); + let mut cons = edit.add_node(Node::Constant { id: cons }); + new_indices.for_each(|idx: &Vec<Index>, param_idx: &usize| { + let param = edit.add_node(Node::Parameter { index: *param_idx }); + cons = edit.add_node(Node::Write { + collect: cons, + data: param, + indices: idx.clone().into_boxed_slice(), + }); + }); + cons + }; + for id in ids { + edit = edit.replace_all_uses(id, built)?; + edit = edit.delete_node(id)?; } - }) - .collect::<Vec<_>>(); + } + + edit.set_param_types(new_param_types); + Ok(edit) + }); + assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument"); + + // Modify each return in the current function and the return types. + let return_nodes: Vec<_> = editor + .node_ids() + .filter(|id| editor.func().nodes[id.idx()].is_return()) + .collect(); let success = editor.edit(|mut edit| { for node in return_nodes { let Node::Return { control, data } = edit.get_node(node) else { @@ -114,17 +155,15 @@ pub fn interprocedural_sroa( } edit.set_return_types(new_return_types); - Ok(edit) }); assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument"); - // Finally, update calls of this function - // In particular, we actually don't have to update the call node at all but have to update - // its DataProjection users + // Finally, update calls of this function. for (caller, callsite) in callsites { let editor = &mut editors[caller.idx()]; assert!(editor.func_id() == caller); + let projs = editor.get_users(callsite).collect::<Vec<_>>(); for proj_id in projs { let Node::DataProjection { data: _, selection } = editor.node(proj_id) else { @@ -134,6 +173,40 @@ pub fn interprocedural_sroa( let typ = types[caller.idx()][proj_id.idx()]; replace_returned_value(editor, proj_id, typ, new_return_info, callsite); } + + let (control, callee, dc_args, args) = + editor.func().nodes[callsite.idx()].try_call().unwrap(); + let dc_args = dc_args.clone(); + let args = args.clone(); + let success = editor.edit(|mut edit| { + let mut new_args = vec![]; + for (idx, (data_id, update_info)) in + args.iter().zip(old_param_type_map.iter()).enumerate() + { + if let IndexTree::Leaf(new_idx) = update_info { + // Unchanged parameter value + assert!(new_args.len() == *new_idx); + new_args.push(*data_id); + } else { + // SROA'd parameter value + let reads = generate_reads_edit(&mut edit, param_types[idx], *data_id); + reads.zip(update_info).for_each(|_, (read_id, ret_idx)| { + assert!(new_args.len() == **ret_idx); + new_args.push(*read_id); + }); + } + } + let new_call = edit.add_node(Node::Call { + control, + function: callee, + dynamic_constants: dc_args, + args: new_args.into_boxed_slice(), + }); + edit = edit.replace_all_uses(callsite, new_call)?; + edit = edit.delete_node(callsite)?; + Ok(edit) + }); + assert!(success); } } } diff --git a/hercules_opt/src/sroa.rs b/hercules_opt/src/sroa.rs index e658ff8818246b31b4dbf3c9dd7ad0372c59f3c6..2718f99d82321d6ddce59320ffc5ea2b9bbdc2cf 100644 --- a/hercules_opt/src/sroa.rs +++ b/hercules_opt/src/sroa.rs @@ -447,7 +447,7 @@ pub fn sroa( field_map.insert(node, generate_reads(editor, types[&node], node)); } Node::Constant { id } => { - field_map.insert(node, generate_constant_fields(editor, id)); + field_map.insert(node, generate_constant_fields(editor, id, node)); to_delete.push(node); } _ => { @@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID // Given a constant cnst adds node to the function which are the constant values of each field and // returns a list of pairs of indices and the node that holds that index -fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> { +fn generate_constant_fields( + editor: &mut FunctionEditor, + cnst: ConstantID, + old_node: NodeID, +) -> IndexTree<NodeID> { let cs: Option<Vec<ConstantID>> = if let Some(cs) = editor.get_constant(cnst).try_product_fields() { Some(cs.into()) @@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In if let Some(cs) = cs { let mut fields = vec![]; for c in cs { - fields.push(generate_constant_fields(editor, c)); + fields.push(generate_constant_fields(editor, c, old_node)); } IndexTree::Node(fields) } else { let mut node = None; editor.edit(|mut edit| { node = Some(edit.add_node(Node::Constant { id: cnst })); + edit.sub_edit(old_node, node.unwrap()); Ok(edit) }); IndexTree::Leaf(node.expect("Add node cannot fail")) diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn index 4d02b2cdb5f843774ebae9b873388e03c82071b0..931e78f8196eba77ff3f9015ab7f4561b1153c57 100644 --- a/juno_samples/cava/src/cava.jn +++ b/juno_samples/cava/src/cava.jn @@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( @image_loop for r = 0 to row { for c = 0 to col { @l2 let l2_dist : f32[num_ctrl_pts]; - for cp = 0 to num_ctrl_pts { + @cp_loop for cp = 0 to num_ctrl_pts { let v1 = input[0, r, c] - ctrl_pts[cp, 0]; let v2 = input[1, r, c] - ctrl_pts[cp, 1]; let v3 = input[2, r, c] - ctrl_pts[cp, 2]; @@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( @channel_loop for chan = 0 to CHAN { let chan_val : f32 = 0.0; - for cp = 0 to num_ctrl_pts { + @cp_loop for cp = 0 to num_ctrl_pts { chan_val += l2_dist[cp] * weights[cp, chan]; } diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch index 8f22b37d4c3a77a31e8d2467c8e6e130c281513f..6fc8adbb1fff30856978ce201af09fd2e02d6298 100644 --- a/juno_samples/cava/src/cpu.sch +++ b/juno_samples/cava/src/cpu.sch @@ -115,7 +115,7 @@ array-slf(fuse4); simpl!(fuse4); let par = fuse4@image_loop \ fuse4@channel_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); let fuse4_body = outline(split.cava_3.fj2); diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index bacfd3abca363a6dd93496c1adb23fa54c860b9f..0ef466c00d08f3edffe02bad9612ce23ebf14768 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -117,9 +117,9 @@ fixpoint { simpl!(fuse4); array-slf(fuse4); simpl!(fuse4); -//fork-tile[2, 0, false, true](fuse4@channel_loop); -//fork-split(fuse4@channel_loop); -//clean-monoid-reduces(fuse4); +fork-tile[2, 0, false, true](fuse4@channel_loop); +let out = fork-split(fuse4@channel_loop); +fork-unroll(out.cava_3.fj1); unforkify(fuse4@channel_loop); no-memset(fuse5@res1); @@ -133,6 +133,13 @@ simpl!(fuse5); array-slf(fuse5); simpl!(fuse5); +fork-tile[4, 1, false, true](fuse4); +fork-tile[8, 0, false, true](fuse4); +fork-interchange[1, 2](fuse4); +let split = fork-split(fuse4); +fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2); +fork-coalesce(split.cava_3.fj2); + delete-uncalled(*); simpl!(*); diff --git a/juno_samples/cava/src/lib.rs b/juno_samples/cava/src/lib.rs index 1810a24670d8fa7325b2ff353ba70756824a0269..47e3b1b3542217b9a17905d8a52ef431fd97541f 100644 --- a/juno_samples/cava/src/lib.rs +++ b/juno_samples/cava/src/lib.rs @@ -124,9 +124,9 @@ pub struct CavaInputs { #[clap(long = "output-verify", value_name = "PATH")] pub output_verify: Option<String>, pub cam_model: String, - #[clap(short, long)] + #[clap(long)] pub crop_rows: Option<usize>, - #[clap(short, long)] + #[clap(long)] pub crop_cols: Option<usize>, } diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index ec9e423dc4c160d08b61eaf45d2b75329886f94e..3e1321c517b55f72e529e8a6df46d9d55bf59efb 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -26,14 +26,14 @@ predication(gaussian_smoothing); simpl!(gaussian_smoothing); let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); fork-coalesce(gaussian_smoothing, gaussian_smoothing_body); simpl!(gaussian_smoothing, gaussian_smoothing_body); -no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); +no-memset(laplacian_estimate@res); fixpoint { forkify(laplacian_estimate); fork-guard-elim(laplacian_estimate); @@ -42,15 +42,15 @@ fixpoint { simpl!(laplacian_estimate); let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); -let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2; +let body = split._1_laplacian_estimate.fj2; let laplacian_estimate_body = outline(body); fork-coalesce(laplacian_estimate, laplacian_estimate_body); simpl!(laplacian_estimate, laplacian_estimate_body); -no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); +no-memset(zero_crossings@res); fixpoint { forkify(zero_crossings); fork-guard-elim(zero_crossings); @@ -59,10 +59,10 @@ fixpoint { simpl!(zero_crossings); let par = zero_crossings@image_loop \ zero_crossings@filter_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); -let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2; +let body = split._2_zero_crossings.fj2; let zero_crossings_body = outline(body); fork-coalesce(zero_crossings, zero_crossings_body); simpl!(zero_crossings, zero_crossings_body); @@ -86,7 +86,7 @@ fixpoint { simpl!(max_gradient); fork-dim-merge(max_gradient); simpl!(max_gradient); -fork-tile[16, 0, false, false](max_gradient); +fork-tile[32, 0, false, false](max_gradient); let split = fork-split(max_gradient); clean-monoid-reduces(max_gradient); let out = outline(split._4_max_gradient.fj1); @@ -105,7 +105,7 @@ fixpoint { predication(reject_zero_crossings); simpl!(reject_zero_crossings); fork-tile[4, 1, false, false](reject_zero_crossings); -fork-tile[4, 0, false, false](reject_zero_crossings); +fork-tile[8, 0, false, false](reject_zero_crossings); fork-interchange[1, 2](reject_zero_crossings); let split = fork-split(reject_zero_crossings); let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn index 58f364dc1cea77b27c83ba2340bfbd11fde07f31..3e49cb365b186037cf6f380c3aa6a4d3b483fb5b 100644 --- a/juno_samples/edge_detection/src/edge_detection.jn +++ b/juno_samples/edge_detection/src/edge_detection.jn @@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>( @image_loop for row = 0 to n { for col = 0 to m { - // Copy data for dilation filter - @shr1 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MIN_BR - else input[row + i - r, col + j - r]; - } - } - // Compute pixel of dilated image let dilated = MIN_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - dilated = max!(dilated, imageArea[i, j] * structure[i, j]); - } - } - - // Data copy for erotion filter - @shr2 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MAX_BR - else input[row + i - r, col + j - r]; + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MIN_BR + else input[row + i - r, col + j - r]; + dilated = max!(dilated, filter * structure[i, j]); } } @@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>( let eroded = MAX_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - eroded = min!(eroded, imageArea[i, j] * structure[i, j]); + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MAX_BR + else input[row + i - r, col + j - r]; + eroded = min!(eroded, filter * structure[i, j]); } } @@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>( @image_loop for row = 0 to n { for col = 0 to m { - // Data copy for dilation filter - @shr1 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MIN_BR - else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR - else MIN_BR; - } - } - // Compute the pixel of dilated image let dilated = MIN_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - dilated = max!(dilated, imageArea[i, j] * structure[i, j]); - } - } - - // Data copy for erotion filter - @shr2 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MAX_BR - else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR - else MIN_BR; + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MIN_BR + else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR + else MIN_BR; + dilated = max!(dilated, filter * structure[i, j]); } } @@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>( let eroded = MAX_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - eroded = min!(eroded, imageArea[i, j] * structure[i, j]); + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MAX_BR + else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR + else MIN_BR; + eroded = min!(eroded, filter * structure[i, j]); } } @@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>( let gx = 0; let gy = 0; - for i = 0 to sb { + @filter_loop for i = 0 to sb { for j = 0 to sb { let val = input[if row + i < sbr then 0 else if row + i - sbr > n - 1 then n - 1 diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index 7ee2904f7d1bb59780390360dbd9abc6b3934aba..666f6cef30f0c0ecedef57f02e50bc25d2b26b8f 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -26,22 +26,49 @@ predication(gaussian_smoothing); simpl!(gaussian_smoothing); predication(gaussian_smoothing); simpl!(gaussian_smoothing); +fork-dim-merge(gaussian_smoothing@filter_loop); +unforkify(gaussian_smoothing@filter_loop); +simpl!(gaussian_smoothing); -no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); +fork-dim-merge(gaussian_smoothing); +fork-tile[32, 0, false, true](gaussian_smoothing); +simpl!(gaussian_smoothing); +fork-split(gaussian_smoothing); +simpl!(gaussian_smoothing); + +no-memset(laplacian_estimate@res); fixpoint { forkify(laplacian_estimate); fork-guard-elim(laplacian_estimate); fork-coalesce(laplacian_estimate); } simpl!(laplacian_estimate); +fork-dim-merge(laplacian_estimate@filter_loop); +unforkify(laplacian_estimate@filter_loop); +simpl!(laplacian_estimate); -no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); +fork-dim-merge(laplacian_estimate); +fork-tile[32, 0, false, true](laplacian_estimate); +simpl!(laplacian_estimate); +fork-split(laplacian_estimate); +simpl!(laplacian_estimate); + +no-memset(zero_crossings@res); fixpoint { forkify(zero_crossings); fork-guard-elim(zero_crossings); fork-coalesce(zero_crossings); } simpl!(zero_crossings); +fork-dim-merge(zero_crossings@filter_loop); +unforkify(zero_crossings@filter_loop); +simpl!(zero_crossings); + +fork-dim-merge(zero_crossings); +fork-tile[32, 0, false, true](zero_crossings); +simpl!(zero_crossings); +fork-split(zero_crossings); +simpl!(zero_crossings); no-memset(gradient@res); fixpoint { @@ -53,6 +80,15 @@ predication(gradient); simpl!(gradient); predication(gradient); simpl!(gradient); +fork-dim-merge(gradient@filter_loop); +unforkify(gradient@filter_loop); +simpl!(gradient); + +fork-dim-merge(gradient); +fork-tile[32, 0, false, true](gradient); +simpl!(gradient); +fork-split(gradient); +simpl!(gradient); fixpoint { forkify(max_gradient); @@ -88,6 +124,12 @@ fixpoint { predication(reject_zero_crossings); simpl!(reject_zero_crossings); +fork-dim-merge(reject_zero_crossings); +fork-tile[32, 0, false, true](reject_zero_crossings); +simpl!(reject_zero_crossings); +fork-split(reject_zero_crossings); +simpl!(reject_zero_crossings); + async-call(edge_detection@le, edge_detection@zc); simpl!(*); diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index d2813388e0e7a1d7bd1696ffbb641e629096e2c2..7bc2083cf07c063eccda5855fa4fed3bfca91f87 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -1,24 +1,11 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(not(feature = "cuda"))] - { - JunoCompiler::new() - .file_in_src("matmul.jn") - .unwrap() - .schedule_in_src("cpu.sch") - .unwrap() - .build() - .unwrap(); - } - #[cfg(feature = "cuda")] - { - JunoCompiler::new() - .file_in_src("matmul.jn") - .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - } + JunoCompiler::new() + .file_in_src("matmul.jn") + .unwrap() + .schedule_in_src("matmul.sch") + .unwrap() + .build() + .unwrap(); } diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch deleted file mode 100644 index 69f1811d08a5691de6dc10fa86f0720e416fb85a..0000000000000000000000000000000000000000 --- a/juno_samples/matmul/src/cpu.sch +++ /dev/null @@ -1,61 +0,0 @@ -macro optimize!(X) { - gvn(X); - phi-elim(X); - dce(X); - ip-sroa(X); - sroa(X); - dce(X); - gvn(X); - phi-elim(X); - dce(X); -} - -macro codegen-prep!(X) { - optimize!(X); - gcm(X); - float-collections(X); - dce(X); - gcm(X); -} - -macro forkify!(X) { - fixpoint { - forkify(X); - fork-guard-elim(X); - } -} - -macro fork-tile { - fork-tile[n, 0, false, true](X); -} - -macro parallelize!(X) { - parallel-fork(X); - parallel-reduce(X); -} - -macro unforkify!(X) { - fork-split(X); - unforkify(X); -} - -optimize!(*); -forkify!(*); -associative(matmul@outer); - -// Parallelize by computing output array as 16 chunks -let par = matmul@outer \ matmul@inner; -fork-tile; -let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par); -parallelize!(outer \ inner); - -let body = outline(inner); -cpu(body); - -// Tile for cache, assuming 64B cache lines -fork-tile; -let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body); - -reduce-slf(inner); -unforkify!(body); -codegen-prep!(*); diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch deleted file mode 100644 index 76808149e7b99dce97e43f0e936536d3d13b7417..0000000000000000000000000000000000000000 --- a/juno_samples/matmul/src/gpu.sch +++ /dev/null @@ -1,26 +0,0 @@ -phi-elim(*); - -forkify(*); -fork-guard-elim(*); -dce(*); - -fixpoint { - reduce-slf(*); - slf(*); - infer-schedules(*); -} -fork-coalesce(*); -infer-schedules(*); -dce(*); -rewrite(*); -fixpoint { - simplify-cfg(*); - dce(*); -} - -ip-sroa(*); -sroa(*); -dce(*); - -float-collections(*); -gcm(*); diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch new file mode 100644 index 0000000000000000000000000000000000000000..306997f58eb217f9ce301dc18e418c412e6df621 --- /dev/null +++ b/juno_samples/matmul/src/matmul.sch @@ -0,0 +1,81 @@ +macro optimize!(X) { + gvn(X); + phi-elim(X); + dce(X); + ip-sroa(X); + sroa(X); + dce(X); + gvn(X); + phi-elim(X); + dce(X); +} + +macro codegen-prep!(X) { + optimize!(X); + gcm(X); + float-collections(X); + dce(X); + gcm(X); +} + +macro forkify!(X) { + fixpoint { + forkify(X); + fork-guard-elim(X); + } +} + +macro fork-tile { + fork-tile[n, 0, false, true](X); +} + +macro parallelize!(X) { + parallel-fork(X); + parallel-reduce(X); +} + +macro unforkify!(X) { + fork-split(X); + unforkify(X); +} + +optimize!(*); +forkify!(*); + +if feature("cuda") { + fixpoint { + reduce-slf(*); + slf(*); + infer-schedules(*); + } + fork-coalesce(*); + infer-schedules(*); + dce(*); + rewrite(*); + fixpoint { + simplify-cfg(*); + dce(*); + } + + optimize!(*); + codegen-prep!(*); +} else { + associative(matmul@outer); + + // Parallelize by computing output array as 16 chunks + let par = matmul@outer \ matmul@inner; + fork-tile; + let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par); + parallelize!(outer \ inner); + + let body = outline(inner); + cpu(body); + + // Tile for cache, assuming 64B cache lines + fork-tile; + let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body); + + reduce-slf(inner); + unforkify!(body); + codegen-prep!(*); +} diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs index fd614b42a55488bfcda64a853105fd40e53ff7bc..5fc73db9d369be5ef9695f37ad1f39122facf991 100644 --- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs +++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs @@ -28,40 +28,55 @@ fn cfd_bench(c: &mut Criterion) { elements_surrounding_elements, normals, } = read_domain_geometry(data_file, block_size); - let mut variables = initialize_variables(nelr, ff_variable.as_slice()); - let mut variables = HerculesMutBox::from(variables.as_mut_slice()); + let mut variables = initialize_variables(nelr, &ff_variable); + + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let areas = HerculesImmBox::from(areas.as_slice()); let elements_surrounding_elements = HerculesImmBox::from(elements_surrounding_elements.as_slice()); - let normals = HerculesImmBox::from(normals.as_slice()); - let ff_variable = HerculesImmBox::from(ff_variable.as_slice()); - let ff_fc_density_energy = vec![ - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ]; - let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice()); - let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z]; - let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice()); - let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z]; - let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice()); - let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z]; - let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice()); + + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); + group.bench_function("cfd bench euler", |b| { b.iter(|| { async_std::task::block_on(async { r.run( nelr as u64, iterations as u64, - variables.to(), + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), areas.to(), elements_surrounding_elements.to(), - normals.to(), - ff_variable.to(), - ff_fc_density_energy.to(), - ff_fc_momentum_x.to(), - ff_fc_momentum_y.to(), - ff_fc_momentum_z.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, ) .await }); @@ -85,40 +100,55 @@ fn cfd_bench(c: &mut Criterion) { elements_surrounding_elements, normals, } = read_domain_geometry(data_file, block_size); - let mut variables = initialize_variables(nelr, ff_variable.as_slice()); - let mut variables = HerculesMutBox::from(variables.as_mut_slice()); + let mut variables = initialize_variables(nelr, &ff_variable); + + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let areas = HerculesImmBox::from(areas.as_slice()); let elements_surrounding_elements = HerculesImmBox::from(elements_surrounding_elements.as_slice()); - let normals = HerculesImmBox::from(normals.as_slice()); - let ff_variable = HerculesImmBox::from(ff_variable.as_slice()); - let ff_fc_density_energy = vec![ - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ]; - let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice()); - let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z]; - let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice()); - let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z]; - let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice()); - let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z]; - let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice()); + + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); + group.bench_function("cfd bench pre-euler", |b| { b.iter(|| { async_std::task::block_on(async { r.run( nelr as u64, iterations as u64, - variables.to(), + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), areas.to(), elements_surrounding_elements.to(), - normals.to(), - ff_variable.to(), - ff_fc_density_energy.to(), - ff_fc_momentum_x.to(), - ff_fc_momentum_y.to(), - ff_fc_momentum_z.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, ) .await }); diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 1244f80e54fdad43f58e5c5a5af44646b7a83e89..7a284a9a3bed0c1800366c64ceeca20265d845a6 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -12,10 +12,11 @@ macro simpl!(X) { simpl!(*); inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); +no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); delete-uncalled(*); simpl!(*); -ip-sroa[false](*); -sroa[false](*); +ip-sroa[true](*); +sroa[true](*); predication(*); const-inline(*); simpl!(*); @@ -24,8 +25,31 @@ fixpoint { fork-guard-elim(*); } simpl!(*); -no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); -parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop); +unforkify(compute_flux@inner_loop); + +fork-tile[32, 0, false, false](compute_step_factor); +let split = fork-split(compute_step_factor); +let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); +fork-coalesce(compute_step_factor, compute_step_factor_body); +simpl!(compute_step_factor, compute_step_factor_body); + +fork-tile[32, 0, false, false](compute_flux); +let split = fork-split(compute_flux); +let compute_flux_body = outline(split._6_compute_flux.fj1); +fork-coalesce(compute_flux, compute_flux_body); +simpl!(compute_flux, compute_flux_body); + +fork-tile[32, 0, false, false](time_step); +let split = fork-split(time_step); +let time_step_body = outline(split._7_time_step.fj1); +fork-coalesce(time_step, time_step_body); +simpl!(time_step, time_step_body); + +fork-tile[32, 0, false, false](copy_vars); +let split = fork-split(copy_vars); +let copy_vars_body = outline(split._8_copy_vars.fj1); +fork-coalesce(copy_vars, copy_vars_body); +simpl!(copy_vars, copy_vars_body); -unforkify(*); +unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch index 6329c5046e15ca0bce646f4a778dcf8c8d781656..518c656d99652dd9813e6aa5c8e8e84388836d47 100644 --- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch @@ -14,8 +14,8 @@ simpl!(*); inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step); delete-uncalled(*); simpl!(*); -ip-sroa[false](*); -sroa[false](*); +ip-sroa[true](*); +sroa[true](*); predication(*); const-inline(*); simpl!(*); @@ -24,7 +24,38 @@ fixpoint { fork-guard-elim(*); } simpl!(*); +no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res); +unforkify(compute_flux@inner_loop); -unforkify(*); +fork-tile[32, 0, false, false](compute_step_factor); +let split = fork-split(compute_step_factor); +let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); +fork-coalesce(compute_step_factor, compute_step_factor_body); +simpl!(compute_step_factor, compute_step_factor_body); +fork-tile[32, 0, false, false](compute_flux_contributions); +let split = fork-split(compute_flux_contributions); +let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1); +fork-coalesce(compute_flux_contributions, compute_flux_contributions_body); +simpl!(compute_flux_contributions, compute_flux_contributions_body); + +fork-tile[32, 0, false, false](compute_flux); +let split = fork-split(compute_flux); +let compute_flux_body = outline(split._7_compute_flux.fj1); +fork-coalesce(compute_flux, compute_flux_body); +simpl!(compute_flux, compute_flux_body); + +fork-tile[32, 0, false, false](time_step); +let split = fork-split(time_step); +let time_step_body = outline(split._8_time_step.fj1); +fork-coalesce(time_step, time_step_body); +simpl!(time_step, time_step_body); + +fork-tile[32, 0, false, false](copy_vars); +let split = fork-split(copy_vars); +let copy_vars_body = outline(split._9_copy_vars.fj1); +fork-coalesce(copy_vars, copy_vars_body); +simpl!(copy_vars, copy_vars_body); + +unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/gpu_euler.sch b/juno_samples/rodinia/cfd/src/gpu_euler.sch index 7f7ee42ca9af7e6026b304c7b27d00bbb9a3f035..3700f79d24aa22657020ace6a37931feb49048a4 100644 --- a/juno_samples/rodinia/cfd/src/gpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/gpu_euler.sch @@ -1,23 +1,44 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + crc(X); + slf(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(euler); -gpu(auto.euler); - -inline(auto.euler); -inline(auto.euler); +simpl!(*); +inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); +no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); delete-uncalled(*); +gpu(copy_vars, compute_step_factor, compute_flux, time_step); -sroa[false](auto.euler); -dce(*); -float-collections(*); -dce(*); +simpl!(*); +ip-sroa[true](*); +sroa[true](*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); +unforkify(compute_flux@inner_loop); -gcm(*); +fork-tile[32, 0, false, true](compute_step_factor); +fork-split(compute_step_factor); + +fork-tile[32, 0, false, true](compute_flux); +fork-split(compute_flux); +fork-tile[32, 0, false, true](time_step); +fork-split(time_step); + +fork-tile[32, 0, false, true](copy_vars); +fork-split(copy_vars); + +gcm(*); diff --git a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch index 33c46dabe92854850f55fe0f0101cfe022947cf0..d6db675b0682490a84ba30408c3dd96993dce0f9 100644 --- a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch @@ -1,23 +1,47 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); - -let auto = auto-outline(pre_euler); -gpu(auto.pre_euler); - -inline(auto.pre_euler); -inline(auto.pre_euler); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + crc(X); + slf(X); + dce(X); + infer-schedules(X); +} + +simpl!(*); +inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step); +no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res); delete-uncalled(*); +gpu(copy_vars, compute_step_factor, compute_flux_contributions, compute_flux, time_step); -sroa[false](auto.pre_euler); -dce(*); -float-collections(*); -dce(*); +simpl!(*); +ip-sroa[true](*); +sroa[true](*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); +unforkify(compute_flux@inner_loop); -gcm(*); +fork-tile[32, 0, false, true](compute_step_factor); +fork-split(compute_step_factor); + +fork-tile[32, 0, false, true](compute_flux_contributions); +fork-split(compute_flux_contributions); + +fork-tile[32, 0, false, true](compute_flux); +fork-split(compute_flux); +fork-tile[32, 0, false, true](time_step); +fork-split(time_step); + +fork-tile[32, 0, false, true](copy_vars); +fork-split(copy_vars); + +gcm(*); diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs index 39384c0d6a322f22a7c7b7ef7e51348d80a36107..d61df4c5b7bb0c7915d8605debd72e375a5482b0 100644 --- a/juno_samples/rodinia/cfd/src/lib.rs +++ b/juno_samples/rodinia/cfd/src/lib.rs @@ -24,123 +24,193 @@ pub struct CFDInputs { fn run_euler( nelr: usize, iterations: usize, - mut variables: AlignedSlice<f32>, + mut variables: Variables, areas: &[f32], elements_surrounding_elements: &[i32], - normals: &[f32], - ff_variable: &[f32], + normals: &Normals, + ff_variable: &Variable, ff_fc_density_energy: &Float3, ff_fc_momentum_x: &Float3, ff_fc_momentum_y: &Float3, ff_fc_momentum_z: &Float3, -) -> Vec<f32> { - let mut variables = HerculesMutBox::from(variables.as_mut_slice()); +) -> Variables { + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let areas = HerculesImmBox::from(areas); let elements_surrounding_elements = HerculesImmBox::from(elements_surrounding_elements); - let normals = HerculesImmBox::from(normals); - let ff_variable = HerculesImmBox::from(ff_variable); - - // TODO: Make hercules box handle structs, for now we'll copy into a vec - let ff_fc_density_energy = vec![ - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ]; - let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice()); - let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z]; - let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice()); - let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z]; - let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice()); - let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z]; - let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice()); - let mut runner = runner!(euler); + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); - HerculesMutBox::from(async_std::task::block_on(async { + let mut runner = runner!(euler); + let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async { runner .run( nelr as u64, iterations as u64, - variables.to(), + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), areas.to(), elements_surrounding_elements.to(), - normals.to(), - ff_variable.to(), - ff_fc_density_energy.to(), - ff_fc_momentum_x.to(), - ff_fc_momentum_y.to(), - ff_fc_momentum_z.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, ) .await - })) - .as_slice() - .to_vec() + }); + + Variables { + density: AlignedSlice::from_slice(HerculesMutBox::from(density).as_slice()), + momentum: Momentum { + x: AlignedSlice::from_slice(HerculesMutBox::from(momentum_x).as_slice()), + y: AlignedSlice::from_slice(HerculesMutBox::from(momentum_y).as_slice()), + z: AlignedSlice::from_slice(HerculesMutBox::from(momentum_z).as_slice()), + }, + energy: AlignedSlice::from_slice(HerculesMutBox::from(energy).as_slice()), + } } fn run_pre_euler( nelr: usize, iterations: usize, - mut variables: AlignedSlice<f32>, + mut variables: Variables, areas: &[f32], elements_surrounding_elements: &[i32], - normals: &[f32], - ff_variable: &[f32], + normals: &Normals, + ff_variable: &Variable, ff_fc_density_energy: &Float3, ff_fc_momentum_x: &Float3, ff_fc_momentum_y: &Float3, ff_fc_momentum_z: &Float3, -) -> Vec<f32> { - let mut variables = HerculesMutBox::from(variables.as_mut_slice()); +) -> Variables { + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let areas = HerculesImmBox::from(areas); let elements_surrounding_elements = HerculesImmBox::from(elements_surrounding_elements); - let normals = HerculesImmBox::from(normals); - let ff_variable = HerculesImmBox::from(ff_variable); - - // TODO: Make hercules box handle structs, for now we'll copy into a vec - let ff_fc_density_energy = vec![ - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ]; - let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice()); - let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z]; - let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice()); - let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z]; - let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice()); - let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z]; - let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice()); - - let mut runner = runner!(pre_euler); - let variables = variables.to(); + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); - HerculesMutBox::from(async_std::task::block_on(async { + let mut runner = runner!(pre_euler); + let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async { runner .run( nelr as u64, iterations as u64, - variables, + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), areas.to(), elements_surrounding_elements.to(), - normals.to(), - ff_variable.to(), - ff_fc_density_energy.to(), - ff_fc_momentum_x.to(), - ff_fc_momentum_y.to(), - ff_fc_momentum_z.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, ) .await - })) - .as_slice() - .to_vec() + }); + + Variables { + density: AlignedSlice::from_slice(HerculesMutBox::from(density).as_slice()), + momentum: Momentum { + x: AlignedSlice::from_slice(HerculesMutBox::from(momentum_x).as_slice()), + y: AlignedSlice::from_slice(HerculesMutBox::from(momentum_y).as_slice()), + z: AlignedSlice::from_slice(HerculesMutBox::from(momentum_z).as_slice()), + }, + energy: AlignedSlice::from_slice(HerculesMutBox::from(energy).as_slice()), + } } fn compare_float(x: f32, y: f32) -> bool { (x - y).abs() < 1e-5 } -fn compare_floats(xs: &[f32], ys: &[f32]) -> bool { - xs.len() == ys.len() && xs.iter().zip(ys.iter()).all(|(x, y)| compare_float(*x, *y)) +fn compare_floats(xs: &Variables, ys: &Variables) -> bool { + let xs_density = xs.density.as_slice(); + let xs_momentum_x = xs.momentum.x.as_slice(); + let xs_momentum_y = xs.momentum.y.as_slice(); + let xs_momentum_z = xs.momentum.z.as_slice(); + let xs_energy = xs.energy.as_slice(); + + let ys_density = ys.density.as_slice(); + let ys_momentum_x = ys.momentum.x.as_slice(); + let ys_momentum_y = ys.momentum.y.as_slice(); + let ys_momentum_z = ys.momentum.z.as_slice(); + let ys_energy = ys.energy.as_slice(); + + xs_density.len() == ys_density.len() + && xs_density + .iter() + .zip(ys_density.iter()) + .all(|(x, y)| compare_float(*x, *y)) + && xs_momentum_x.len() == ys_momentum_x.len() + && xs_momentum_x + .iter() + .zip(ys_momentum_x.iter()) + .all(|(x, y)| compare_float(*x, *y)) + && xs_momentum_y.len() == ys_momentum_y.len() + && xs_momentum_y + .iter() + .zip(ys_momentum_y.iter()) + .all(|(x, y)| compare_float(*x, *y)) + && xs_momentum_z.len() == ys_momentum_z.len() + && xs_momentum_z + .iter() + .zip(ys_momentum_z.iter()) + .all(|(x, y)| compare_float(*x, *y)) + && xs_energy.len() == ys_energy.len() + && xs_energy + .iter() + .zip(ys_energy.iter()) + .all(|(x, y)| compare_float(*x, *y)) } pub fn cfd_harness(args: CFDInputs) { @@ -151,8 +221,6 @@ pub fn cfd_harness(args: CFDInputs) { pre_euler, } = args; - assert!(block_size % 16 == 0, "Hercules expects all arrays to be 64-byte aligned, cfd uses structs of arrays that are annoying to deal with if the block_size is not a multiple of 16"); - let FarFieldConditions { ff_variable, ff_fc_momentum_x, @@ -168,7 +236,8 @@ pub fn cfd_harness(args: CFDInputs) { normals, } = read_domain_geometry(data_file, block_size); - let variables = initialize_variables(nelr, ff_variable.as_slice()); + let variables = initialize_variables(nelr, &ff_variable); + println!("Running CFD with nelr = {}.", nelr); let res_juno = if pre_euler { run_pre_euler( @@ -177,8 +246,8 @@ pub fn cfd_harness(args: CFDInputs) { variables.clone(), areas.as_slice(), elements_surrounding_elements.as_slice(), - normals.as_slice(), - ff_variable.as_slice(), + &normals, + &ff_variable, &ff_fc_density_energy, &ff_fc_momentum_x, &ff_fc_momentum_y, @@ -191,8 +260,8 @@ pub fn cfd_harness(args: CFDInputs) { variables.clone(), areas.as_slice(), elements_surrounding_elements.as_slice(), - normals.as_slice(), - ff_variable.as_slice(), + &normals, + &ff_variable, &ff_fc_density_energy, &ff_fc_momentum_x, &ff_fc_momentum_y, @@ -206,8 +275,8 @@ pub fn cfd_harness(args: CFDInputs) { variables, areas.as_slice(), elements_surrounding_elements.as_slice(), - normals.as_slice(), - ff_variable.as_slice(), + &normals, + &ff_variable, &ff_fc_density_energy, &ff_fc_momentum_x, &ff_fc_momentum_y, @@ -220,8 +289,8 @@ pub fn cfd_harness(args: CFDInputs) { variables, areas.as_slice(), elements_surrounding_elements.as_slice(), - normals.as_slice(), - ff_variable.as_slice(), + &normals, + &ff_variable, &ff_fc_density_energy, &ff_fc_momentum_x, &ff_fc_momentum_y, @@ -229,8 +298,7 @@ pub fn cfd_harness(args: CFDInputs) { ) }; - if !compare_floats(&res_juno, res_rust.as_slice()) { - assert_eq!(res_juno.len(), res_rust.as_slice().len()); + if !compare_floats(&res_juno, &res_rust) { panic!("Mismatch in results"); } } diff --git a/juno_samples/rodinia/cfd/src/pre_euler.jn b/juno_samples/rodinia/cfd/src/pre_euler.jn index c200f2db9a2be4708c4cf1c90c60eef0b7d7a11e..979c2e9a16d3ff2d0b5c690af7645dbf10462139 100644 --- a/juno_samples/rodinia/cfd/src/pre_euler.jn +++ b/juno_samples/rodinia/cfd/src/pre_euler.jn @@ -58,7 +58,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 { } fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] { - let step_factors : f32[nelr]; + @res let step_factors : f32[nelr]; for i in 0..nelr { let density = variables.density[i]; @@ -109,10 +109,10 @@ fn compute_flux_contribution( fn compute_flux_contributions<nelr: usize>( variables: Variables::<nelr>, ) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) { - let fc_momentum_x: Momentum::<nelr>; - let fc_momentum_y: Momentum::<nelr>; - let fc_momentum_z: Momentum::<nelr>; - let fc_density_energy: Momentum::<nelr>; + @res let fc_momentum_x: Momentum::<nelr>; + @res let fc_momentum_y: Momentum::<nelr>; + @res let fc_momentum_z: Momentum::<nelr>; + @res let fc_density_energy: Momentum::<nelr>; for i in 0..nelr { let density_i = variables.density[i]; @@ -167,9 +167,9 @@ fn compute_flux<nelr: usize>( ff_fc_momentum_z: float3, ) -> Variables::<nelr> { const smoothing_coefficient : f32 = 0.2; - let fluxes: Variables::<nelr>; + @res let fluxes: Variables::<nelr>; - for i in 0..nelr { + @outer_loop for i in 0..nelr { let density_i = variables.density[i]; let momentum_i = float3 { x: variables.momentum.x[i], @@ -201,7 +201,7 @@ fn compute_flux<nelr: usize>( let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 }; let flux_i_density_energy : f32 = 0.0; - for j in 0..NNB { + @inner_loop for j in 0..NNB { let nb = elements_surrounding_elements[j, i]; let normal = float3 { x: normals.x[j, i], @@ -328,7 +328,7 @@ fn time_step<nelr: usize>( } fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> { - let result : Variables::<nelr>; + @res let result : Variables::<nelr>; for i in 0..nelr { result.density[i] = variables.density[i]; diff --git a/juno_samples/rodinia/cfd/src/rust_cfd.rs b/juno_samples/rodinia/cfd/src/rust_cfd.rs index 42479df8eb2cd5cf86e70cafd212050a07d729b4..6d92c2b5501198513ab3e8bc7314df2eec3aff0f 100644 --- a/juno_samples/rodinia/cfd/src/rust_cfd.rs +++ b/juno_samples/rodinia/cfd/src/rust_cfd.rs @@ -58,19 +58,19 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 { (GAMMA * pressure / density).sqrt() } -fn compute_step_factor(nelr: usize, variables: &[f32], areas: &[f32]) -> Vec<f32> { +fn compute_step_factor(nelr: usize, variables: &Variables, areas: &[f32]) -> Vec<f32> { let mut step_factors = vec![0.0; nelr]; for i in 0..nelr { - let density = variables[i + VAR_DENSITY * nelr]; + let density = variables.density[i]; let momentum = Float3 { - x: variables[i + (VAR_MOMENTUM + 0) * nelr], - y: variables[i + (VAR_MOMENTUM + 1) * nelr], - z: variables[i + (VAR_MOMENTUM + 2) * nelr], + x: variables.momentum.x[i], + y: variables.momentum.y[i], + z: variables.momentum.z[i], }; - let density_energy = variables[i + VAR_DENSITY_ENERGY * nelr]; + let density_energy = variables.energy[i]; let velocity = compute_velocity(density, &momentum); let speed_sqd = compute_speed_sqd(&velocity); @@ -85,28 +85,36 @@ fn compute_step_factor(nelr: usize, variables: &[f32], areas: &[f32]) -> Vec<f32 fn compute_flux_euler( nelr: usize, - variables: &[f32], + variables: &Variables, elements_surrounding_elements: &[i32], - normals: &[f32], - ff_variable: &[f32], + normals: &Normals, + ff_variable: &Variable, ff_fc_momentum_x: &Float3, ff_fc_momentum_y: &Float3, ff_fc_momentum_z: &Float3, ff_fc_density_energy: &Float3, -) -> Vec<f32> { +) -> Variables { let smoothing_coefficient: f32 = 0.2; - let mut fluxes = vec![0.0; nelr * NVAR]; + let mut fluxes = Variables { + density: AlignedSlice::of_len(nelr), + momentum: Momentum { + x: AlignedSlice::of_len(nelr), + y: AlignedSlice::of_len(nelr), + z: AlignedSlice::of_len(nelr), + }, + energy: AlignedSlice::of_len(nelr), + }; for i in 0..nelr { - let density_i = variables[i + VAR_DENSITY * nelr]; + let density_i = variables.density[i]; let momentum_i = Float3 { - x: variables[i + (VAR_MOMENTUM + 0) * nelr], - y: variables[i + (VAR_MOMENTUM + 1) * nelr], - z: variables[i + (VAR_MOMENTUM + 2) * nelr], + x: variables.momentum.x[i], + y: variables.momentum.y[i], + z: variables.momentum.z[i], }; - let density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr]; + let density_energy_i = variables.energy[i]; let velocity_i = compute_velocity(density_i, &momentum_i); let speed_sqd_i = compute_speed_sqd(&velocity_i); @@ -134,9 +142,9 @@ fn compute_flux_euler( for j in 0..NNB { let nb = elements_surrounding_elements[i + j * nelr]; let normal = Float3 { - x: normals[i + (j + 0 * NNB) * nelr], - y: normals[i + (j + 1 * NNB) * nelr], - z: normals[i + (j + 2 * NNB) * nelr], + x: normals.x[i + j * nelr], + y: normals.y[i + j * nelr], + z: normals.z[i + j * nelr], }; let normal_len = (normal.x * normal.x + normal.y * normal.y + normal.z * normal.z).sqrt(); @@ -144,13 +152,13 @@ fn compute_flux_euler( if nb >= 0 { // legitimate neighbor let nb = nb as usize; - let density_nb = variables[nb + VAR_DENSITY * nelr]; + let density_nb = variables.density[nb]; let momentum_nb = Float3 { - x: variables[nb + (VAR_MOMENTUM + 0) * nelr], - y: variables[nb + (VAR_MOMENTUM + 1) * nelr], - z: variables[nb + (VAR_MOMENTUM + 2) * nelr], + x: variables.momentum.x[nb], + y: variables.momentum.y[nb], + z: variables.momentum.z[nb], }; - let density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr]; + let density_energy_nb = variables.energy[nb]; let velocity_nb = compute_velocity(density_nb, &momentum_nb); let speed_sqd_nb = compute_speed_sqd(&velocity_nb); let pressure_nb = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb); @@ -205,21 +213,21 @@ fn compute_flux_euler( } else if nb == -2 { // a far field boundary let factor = 0.5 * normal.x; - flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x); + flux_i_density += factor * (ff_variable.momentum.x + momentum_i.x); flux_i_density_energy += factor * (ff_fc_density_energy.x + fc_i_density_energy.x); flux_i_momentum.x += factor * (ff_fc_momentum_x.x + fc_i_momentum_x.x); flux_i_momentum.y += factor * (ff_fc_momentum_y.x + fc_i_momentum_y.x); flux_i_momentum.z += factor * (ff_fc_momentum_z.x + fc_i_momentum_z.x); let factor = 0.5 * normal.y; - flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y); + flux_i_density += factor * (ff_variable.momentum.y + momentum_i.y); flux_i_density_energy += factor * (ff_fc_density_energy.y + fc_i_density_energy.y); flux_i_momentum.x += factor * (ff_fc_momentum_x.y + fc_i_momentum_x.y); flux_i_momentum.y += factor * (ff_fc_momentum_y.y + fc_i_momentum_y.y); flux_i_momentum.z += factor * (ff_fc_momentum_z.y + fc_i_momentum_z.y); let factor = 0.5 * normal.z; - flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z); + flux_i_density += factor * (ff_variable.momentum.z + momentum_i.z); flux_i_density_energy += factor * (ff_fc_density_energy.z + fc_i_density_energy.z); flux_i_momentum.x += factor * (ff_fc_momentum_x.z + fc_i_momentum_x.z); flux_i_momentum.y += factor * (ff_fc_momentum_y.z + fc_i_momentum_y.z); @@ -227,11 +235,11 @@ fn compute_flux_euler( } } - fluxes[i + VAR_DENSITY * nelr] = flux_i_density; - fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x; - fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y; - fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z; - fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy; + fluxes.density[i] = flux_i_density; + fluxes.momentum.x[i] = flux_i_momentum.x; + fluxes.momentum.y[i] = flux_i_momentum.y; + fluxes.momentum.z[i] = flux_i_momentum.z; + fluxes.energy[i] = flux_i_density_energy; } fluxes @@ -239,7 +247,7 @@ fn compute_flux_euler( fn compute_flux_contributions( nelr: usize, - variables: &[f32], + variables: &Variables, ) -> (Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) { let mut fc_momentum_x = vec![0.0; nelr * NDIM]; let mut fc_momentum_y = vec![0.0; nelr * NDIM]; @@ -247,14 +255,14 @@ fn compute_flux_contributions( let mut fc_density_energy = vec![0.0; nelr * NDIM]; for i in 0..nelr { - let density_i = variables[i + VAR_DENSITY * nelr]; + let density_i = variables.density[i]; let momentum_i = Float3 { - x: variables[i + (VAR_MOMENTUM + 0) * nelr], - y: variables[i + (VAR_MOMENTUM + 1) * nelr], - z: variables[i + (VAR_MOMENTUM + 2) * nelr], + x: variables.momentum.x[i], + y: variables.momentum.y[i], + z: variables.momentum.z[i], }; - let density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr]; + let density_energy_i = variables.energy[i]; let velocity_i = compute_velocity(density_i, &momentum_i); let speed_sqd_i = compute_speed_sqd(&velocity_i); @@ -298,32 +306,40 @@ fn compute_flux_contributions( fn compute_flux_pre_euler( nelr: usize, - variables: &[f32], + variables: &Variables, elements_surrounding_elements: &[i32], - normals: &[f32], + normals: &Normals, fc_momentum_x: &[f32], fc_momentum_y: &[f32], fc_momentum_z: &[f32], fc_density_energy: &[f32], - ff_variable: &[f32], + ff_variable: &Variable, ff_fc_momentum_x: &Float3, ff_fc_momentum_y: &Float3, ff_fc_momentum_z: &Float3, ff_fc_density_energy: &Float3, -) -> Vec<f32> { +) -> Variables { let smoothing_coefficient: f32 = 0.2; - let mut fluxes = vec![0.0; nelr * NVAR]; + let mut fluxes = Variables { + density: AlignedSlice::of_len(nelr), + momentum: Momentum { + x: AlignedSlice::of_len(nelr), + y: AlignedSlice::of_len(nelr), + z: AlignedSlice::of_len(nelr), + }, + energy: AlignedSlice::of_len(nelr), + }; for i in 0..nelr { - let density_i = variables[i + VAR_DENSITY * nelr]; + let density_i = variables.density[i]; let momentum_i = Float3 { - x: variables[i + (VAR_MOMENTUM + 0) * nelr], - y: variables[i + (VAR_MOMENTUM + 1) * nelr], - z: variables[i + (VAR_MOMENTUM + 2) * nelr], + x: variables.momentum.x[i], + y: variables.momentum.y[i], + z: variables.momentum.z[i], }; - let density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr]; + let density_energy_i = variables.energy[i]; let velocity_i = compute_velocity(density_i, &momentum_i); let speed_sqd_i = compute_speed_sqd(&velocity_i); @@ -363,22 +379,22 @@ fn compute_flux_pre_euler( for j in 0..NNB { let nb = elements_surrounding_elements[i + j * nelr]; let normal = Float3 { - x: normals[i + (j + 0 * NNB) * nelr], - y: normals[i + (j + 1 * NNB) * nelr], - z: normals[i + (j + 2 * NNB) * nelr], + x: normals.x[i + j * nelr], + y: normals.y[i + j * nelr], + z: normals.z[i + j * nelr], }; let normal_len = (normal.x * normal.x + normal.y * normal.y + normal.z * normal.z).sqrt(); if nb >= 0 { let nb = nb as usize; - let density_nb = variables[nb + VAR_DENSITY * nelr]; + let density_nb = variables.density[nb]; let momentum_nb = Float3 { - x: variables[nb + (VAR_MOMENTUM + 0) * nelr], - y: variables[nb + (VAR_MOMENTUM + 1) * nelr], - z: variables[nb + (VAR_MOMENTUM + 2) * nelr], + x: variables.momentum.x[nb], + y: variables.momentum.y[nb], + z: variables.momentum.z[nb], }; - let density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr]; + let density_energy_nb = variables.energy[nb]; let velocity_nb = compute_velocity(density_nb, &momentum_nb); let speed_sqd_nb = compute_speed_sqd(&velocity_nb); let pressure_nb = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb); @@ -443,21 +459,21 @@ fn compute_flux_pre_euler( flux_i_momentum.z += normal.z * pressure_i; } else if nb == -2 { let factor = 0.5 * normal.x; - flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x); + flux_i_density += factor * (ff_variable.momentum.x + momentum_i.x); flux_i_density_energy += factor * (ff_fc_density_energy.x + fc_i_density_energy.x); flux_i_momentum.x += factor * (ff_fc_momentum_x.x + fc_i_momentum_x.x); flux_i_momentum.y += factor * (ff_fc_momentum_y.x + fc_i_momentum_y.x); flux_i_momentum.z += factor * (ff_fc_momentum_z.x + fc_i_momentum_z.x); let factor = 0.5 * normal.y; - flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y); + flux_i_density += factor * (ff_variable.momentum.y + momentum_i.y); flux_i_density_energy += factor * (ff_fc_density_energy.y + fc_i_density_energy.y); flux_i_momentum.x += factor * (ff_fc_momentum_x.y + fc_i_momentum_x.y); flux_i_momentum.y += factor * (ff_fc_momentum_y.y + fc_i_momentum_y.y); flux_i_momentum.z += factor * (ff_fc_momentum_z.y + fc_i_momentum_z.y); let factor = 0.5 * normal.z; - flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z); + flux_i_density += factor * (ff_variable.momentum.z + momentum_i.z); flux_i_density_energy += factor * (ff_fc_density_energy.z + fc_i_density_energy.z); flux_i_momentum.x += factor * (ff_fc_momentum_x.z + fc_i_momentum_x.z); flux_i_momentum.y += factor * (ff_fc_momentum_y.z + fc_i_momentum_y.z); @@ -465,11 +481,11 @@ fn compute_flux_pre_euler( } } - fluxes[i + VAR_DENSITY * nelr] = flux_i_density; - fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x; - fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y; - fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z; - fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy; + fluxes.density[i] = flux_i_density; + fluxes.momentum.x[i] = flux_i_momentum.x; + fluxes.momentum.y[i] = flux_i_momentum.y; + fluxes.momentum.z[i] = flux_i_momentum.z; + fluxes.energy[i] = flux_i_density_energy; } fluxes @@ -478,47 +494,42 @@ fn compute_flux_pre_euler( fn time_step( j: usize, nelr: usize, - old_variables: &[f32], - variables: &mut AlignedSlice<f32>, + old_variables: &Variables, + variables: &mut Variables, step_factors: &[f32], - fluxes: &[f32], + fluxes: &Variables, ) { for i in 0..nelr { let factor = step_factors[i] / (RK + 1 - j) as f32; - variables[i + VAR_DENSITY * nelr] = - old_variables[i + VAR_DENSITY * nelr] + factor * fluxes[i + VAR_DENSITY * nelr]; - variables[i + (VAR_MOMENTUM + 0) * nelr] = old_variables[i + (VAR_MOMENTUM + 0) * nelr] - + factor * fluxes[i + (VAR_MOMENTUM + 0) * nelr]; - variables[i + (VAR_MOMENTUM + 1) * nelr] = old_variables[i + (VAR_MOMENTUM + 1) * nelr] - + factor * fluxes[i + (VAR_MOMENTUM + 1) * nelr]; - variables[i + (VAR_MOMENTUM + 2) * nelr] = old_variables[i + (VAR_MOMENTUM + 2) * nelr] - + factor * fluxes[i + (VAR_MOMENTUM + 2) * nelr]; - variables[i + VAR_DENSITY_ENERGY * nelr] = old_variables[i + VAR_DENSITY_ENERGY * nelr] - + factor * fluxes[i + VAR_DENSITY_ENERGY * nelr]; + variables.density[i] = old_variables.density[i] + factor * fluxes.density[i]; + variables.momentum.x[i] = old_variables.momentum.x[i] + factor * fluxes.momentum.x[i]; + variables.momentum.y[i] = old_variables.momentum.y[i] + factor * fluxes.momentum.y[i]; + variables.momentum.z[i] = old_variables.momentum.z[i] + factor * fluxes.momentum.z[i]; + variables.energy[i] = old_variables.energy[i] + factor * fluxes.energy[i]; } } pub fn euler( nelr: usize, iterations: usize, - mut variables: AlignedSlice<f32>, + mut variables: Variables, areas: &[f32], elements_surrounding_elements: &[i32], - normals: &[f32], - ff_variable: &[f32], + normals: &Normals, + ff_variable: &Variable, ff_fc_density_energy: &Float3, ff_fc_momentum_x: &Float3, ff_fc_momentum_y: &Float3, ff_fc_momentum_z: &Float3, -) -> AlignedSlice<f32> { +) -> Variables { for i in 0..iterations { let old_variables = variables.clone(); - let step_factors = compute_step_factor(nelr, variables.as_slice(), areas); + let step_factors = compute_step_factor(nelr, &variables, areas); for j in 0..RK { let fluxes = compute_flux_euler( nelr, - variables.as_slice(), + &variables, elements_surrounding_elements, normals, ff_variable, @@ -530,7 +541,7 @@ pub fn euler( time_step( j, nelr, - old_variables.as_slice(), + &old_variables, &mut variables, &step_factors, &fluxes, @@ -544,26 +555,26 @@ pub fn euler( pub fn pre_euler( nelr: usize, iterations: usize, - mut variables: AlignedSlice<f32>, + mut variables: Variables, areas: &[f32], elements_surrounding_elements: &[i32], - normals: &[f32], - ff_variable: &[f32], + normals: &Normals, + ff_variable: &Variable, ff_fc_density_energy: &Float3, ff_fc_momentum_x: &Float3, ff_fc_momentum_y: &Float3, ff_fc_momentum_z: &Float3, -) -> AlignedSlice<f32> { +) -> Variables { for i in 0..iterations { let old_variables = variables.clone(); - let step_factors = compute_step_factor(nelr, variables.as_slice(), areas); + let step_factors = compute_step_factor(nelr, &variables, areas); for j in 0..RK { let (fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy) = - compute_flux_contributions(nelr, variables.as_slice()); + compute_flux_contributions(nelr, &variables); let fluxes = compute_flux_pre_euler( nelr, - variables.as_slice(), + &variables, elements_surrounding_elements, normals, &fc_momentum_x, @@ -579,7 +590,7 @@ pub fn pre_euler( time_step( j, nelr, - old_variables.as_slice(), + &old_variables, &mut variables, &step_factors, &fluxes, diff --git a/juno_samples/rodinia/cfd/src/setup.rs b/juno_samples/rodinia/cfd/src/setup.rs index b996f057c50c6eb404ea8a9db2d0d92dc738ac2a..0da0025162c919d2e420d6bbb0e69f98d83e52ff 100644 --- a/juno_samples/rodinia/cfd/src/setup.rs +++ b/juno_samples/rodinia/cfd/src/setup.rs @@ -1,10 +1,10 @@ +use std::fmt::{Debug, Error, Formatter}; use std::fs::File; use std::io::Read; +use std::ops::{Index, IndexMut}; use std::path::Path; use std::str::FromStr; -use std::ops::{Index, IndexMut}; - use nom::Parser; use crate::rust_cfd; @@ -14,10 +14,6 @@ pub const ff_mach: f32 = 1.2; pub const NDIM: usize = 3; pub const NNB: usize = 4; pub const RK: usize = 3; -pub const VAR_DENSITY: usize = 0; -pub const VAR_MOMENTUM: usize = 1; -pub const VAR_DENSITY_ENERGY: usize = VAR_MOMENTUM + NDIM; -pub const NVAR: usize = VAR_DENSITY_ENERGY + 1; pub const deg_angle_of_attack: f32 = 0.0; #[repr(align(64))] @@ -31,6 +27,15 @@ pub struct AlignedSlice<T> { length: usize, } +impl<T> Debug for AlignedSlice<T> +where + T: Debug +{ + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { + write!(f, "{:?}", self.as_slice()) + } +} + impl<T> AlignedSlice<T> where T: Default, @@ -53,6 +58,17 @@ where } } +impl<T> AlignedSlice<T> +where + T: Clone + Default, +{ + pub fn from_slice(slice: &[T]) -> Self { + let mut res = Self::of_len(slice.len()); + res.as_mut_slice().clone_from_slice(slice); + res + } +} + impl<T> AlignedSlice<T> { pub fn as_slice(&self) -> &[T] { &self.slice[self.offset..self.offset + self.length] @@ -90,15 +106,20 @@ where } } -#[repr(C)] pub struct Float3 { pub x: f32, pub y: f32, pub z: f32, } +pub struct Variable { + pub density: f32, + pub momentum: Float3, + pub energy: f32, +} + pub struct FarFieldConditions { - pub ff_variable: AlignedSlice<f32>, + pub ff_variable: Variable, pub ff_fc_momentum_x: Float3, pub ff_fc_momentum_y: Float3, pub ff_fc_momentum_z: Float3, @@ -106,14 +127,14 @@ pub struct FarFieldConditions { } pub fn set_far_field_conditions() -> FarFieldConditions { - let mut ff_variable: AlignedSlice<f32> = AlignedSlice::of_len(NVAR); + let mut ff_variable = Variable { density: 0.0, momentum: Float3 { x: 0.0, y: 0.0, z: 0.0 }, energy: 0.0 }; let angle_of_attack = std::f32::consts::PI / 180.0 * deg_angle_of_attack; - ff_variable[VAR_DENSITY] = 1.4; + ff_variable.density = 1.4; let ff_pressure = 1.0; - let ff_speed_of_sound = (GAMMA * ff_pressure / ff_variable[VAR_DENSITY]).sqrt(); + let ff_speed_of_sound = (GAMMA * ff_pressure / ff_variable.density).sqrt(); let ff_speed = ff_mach * ff_speed_of_sound; let ff_velocity = Float3 { @@ -122,23 +143,23 @@ pub fn set_far_field_conditions() -> FarFieldConditions { z: 0.0, }; - ff_variable[VAR_MOMENTUM + 0] = ff_variable[VAR_DENSITY] * ff_velocity.x; - ff_variable[VAR_MOMENTUM + 1] = ff_variable[VAR_DENSITY] * ff_velocity.y; - ff_variable[VAR_MOMENTUM + 2] = ff_variable[VAR_DENSITY] * ff_velocity.z; + ff_variable.momentum.x = ff_variable.density * ff_velocity.x; + ff_variable.momentum.y = ff_variable.density * ff_velocity.y; + ff_variable.momentum.z = ff_variable.density * ff_velocity.z; - ff_variable[VAR_DENSITY_ENERGY] = - ff_variable[VAR_DENSITY] * (0.5 * (ff_speed * ff_speed)) + (ff_pressure / (GAMMA - 1.0)); + ff_variable.energy = + ff_variable.density * (0.5 * (ff_speed * ff_speed)) + (ff_pressure / (GAMMA - 1.0)); let ff_momentum = Float3 { - x: ff_variable[VAR_MOMENTUM + 0], - y: ff_variable[VAR_MOMENTUM + 1], - z: ff_variable[VAR_MOMENTUM + 2], + x: ff_variable.momentum.x, + y: ff_variable.momentum.y, + z: ff_variable.momentum.z, }; let (ff_fc_momentum_x, ff_fc_momentum_y, ff_fc_momentum_z, ff_fc_density_energy) = rust_cfd::compute_flux_contribution( - ff_variable[VAR_DENSITY], + ff_variable.density, &ff_momentum, - ff_variable[VAR_DENSITY_ENERGY], + ff_variable.energy, ff_pressure, &ff_velocity, ); @@ -152,11 +173,17 @@ pub fn set_far_field_conditions() -> FarFieldConditions { } } +pub struct Normals { + pub x: AlignedSlice<f32>, + pub y: AlignedSlice<f32>, + pub z: AlignedSlice<f32>, +} + pub struct GeometryData { pub nelr: usize, pub areas: AlignedSlice<f32>, pub elements_surrounding_elements: AlignedSlice<i32>, - pub normals: AlignedSlice<f32>, + pub normals: Normals, } pub fn read_domain_geometry<P>(path: P, block_size: usize) -> GeometryData @@ -183,7 +210,11 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo let mut areas: AlignedSlice<f32> = AlignedSlice::of_len(nelr); let mut elements_surrounding_elements: AlignedSlice<i32> = AlignedSlice::of_len(nelr * NNB); - let mut normals: AlignedSlice<f32> = AlignedSlice::of_len(NDIM * NNB * nelr); + let mut normals = Normals { + x: AlignedSlice::of_len(NNB * nelr), + y: AlignedSlice::of_len(NNB * nelr), + z: AlignedSlice::of_len(NNB * nelr), + }; // read in data let mut text = text; @@ -209,14 +240,21 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo let val = i32::from_str(val).unwrap(); let val = if neg { -val } else { val }; elements_surrounding_elements[i + j * nelr] = if val < 0 { -1 } else { val } - 1; // it's coming in with Fortran numbering - - for k in 0..NDIM { - let t = nom::character::complete::multispace0(text)?.0; - let (t, val) = nom::number::complete::float(t)?; - text = t; - - normals[i + (j + k * NNB) * nelr] = -val; - } + + let t = nom::character::complete::multispace0(text)?.0; + let (t, val) = nom::number::complete::float(t)?; + text = t; + normals.x[i + j * nelr] = -val; + + let t = nom::character::complete::multispace0(text)?.0; + let (t, val) = nom::number::complete::float(t)?; + text = t; + normals.y[i + j * nelr] = -val; + + let t = nom::character::complete::multispace0(text)?.0; + let (t, val) = nom::number::complete::float(t)?; + text = t; + normals.z[i + j * nelr] = -val; } } @@ -227,9 +265,9 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo for j in 0..NNB { elements_surrounding_elements[i + j * nelr] = elements_surrounding_elements[last + j * nelr]; - for k in 0..NDIM { - normals[i + (j + k * NNB) * nelr] = normals[last + (j + k * NNB) * nelr]; - } + normals.x[i + j * nelr] = normals.x[last + j * nelr]; + normals.y[i + j * nelr] = normals.y[last + j * nelr]; + normals.z[i + j * nelr] = normals.z[last + j * nelr]; } } @@ -245,13 +283,37 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo )) } -pub fn initialize_variables(nelr: usize, ff_variable: &[f32]) -> AlignedSlice<f32> { - let mut variables = AlignedSlice::of_len(nelr * NVAR); +#[derive(Clone)] +pub struct Momentum { + pub x: AlignedSlice<f32>, + pub y: AlignedSlice<f32>, + pub z: AlignedSlice<f32>, +} + +#[derive(Clone)] +pub struct Variables { + pub density: AlignedSlice<f32>, + pub momentum: Momentum, + pub energy: AlignedSlice<f32>, +} + +pub fn initialize_variables(nelr: usize, ff_variable: &Variable) -> Variables { + let mut variables = Variables { + density: AlignedSlice::of_len(nelr), + momentum: Momentum { + x: AlignedSlice::of_len(nelr), + y: AlignedSlice::of_len(nelr), + z: AlignedSlice::of_len(nelr), + }, + energy: AlignedSlice::of_len(nelr), + }; for i in 0..nelr { - for j in 0..NVAR { - variables[i + j * nelr] = ff_variable[j]; - } + variables.density[i] = ff_variable.density; + variables.momentum.x[i] = ff_variable.momentum.x; + variables.momentum.y[i] = ff_variable.momentum.y; + variables.momentum.z[i] = ff_variable.momentum.z; + variables.energy[i] = ff_variable.energy; } variables diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch index 289548f9e01cdf402a3e1b1057fa52d4029f6173..f736c0b776759596cfc93a5025fb4f280c3fc932 100644 --- a/juno_samples/rodinia/srad/src/gpu.sch +++ b/juno_samples/rodinia/srad/src/gpu.sch @@ -54,4 +54,10 @@ ip-sroa(*); sroa(*); simpl!(*); +fork-dim-merge(main_loops); +fork-tile[32, 0, false, true](main_loops); +dce(main_loops); +fork-split(main_loops); +simpl!(main_loops); + gcm(*); diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs index 9d020c64ccef3b9c0a79694876c5b0ace606f938..8b68ed7111de9009458dbdc26b511e66ee13db58 100644 --- a/juno_scheduler/src/compile.rs +++ b/juno_scheduler/src/compile.rs @@ -22,6 +22,7 @@ pub enum ScheduleCompilerError { actual: usize, loc: Location, }, + SemanticError(String, Location), } impl fmt::Display for ScheduleCompilerError { @@ -46,6 +47,11 @@ impl fmt::Display for ScheduleCompilerError { "({}, {}) -- ({}, {}): Expected {} arguments, found {}", loc.0 .0, loc.0 .1, loc.1 .0, loc.1 .1, expected, actual ), + ScheduleCompilerError::SemanticError(msg, loc) => write!( + f, + "({}, {}) -- ({}, {}): {}", + loc.0 .0, loc.0 .1, loc.1 .0, loc.1 .1, msg, + ), } } } @@ -76,6 +82,8 @@ enum Appliable { // DeleteUncalled requires special handling because it changes FunctionIDs, so it is not // treated like a pass DeleteUncalled, + // Test whether a feature is enabled + Feature, Schedule(Schedule), Device(Device), } @@ -85,6 +93,8 @@ impl Appliable { fn is_valid_num_args(&self, num: usize) -> bool { match self { Appliable::Pass(pass) => pass.is_valid_num_args(num), + // Testing whether a feature is enabled takes the feature instead of a selection, so it + // has 0 arguments // Delete uncalled, Schedules, and devices do not take arguments _ => num == 0, } @@ -158,6 +168,8 @@ impl FromStr for Appliable { "serialize" => Ok(Appliable::Pass(ir::Pass::Serialize)), "write-predication" => Ok(Appliable::Pass(ir::Pass::WritePredication)), + "feature" => Ok(Appliable::Feature), + "print" => Ok(Appliable::Pass(ir::Pass::Print)), "cpu" | "llvm" => Ok(Appliable::Device(Device::LLVM)), @@ -275,6 +287,35 @@ fn compile_stmt( limit, }]) } + parser::Stmt::IfThenElse { + span: _, + cond, + thn, + els, + } => { + let cond = compile_exp_as_expr(cond, lexer, macrostab, macros)?; + + macros.open_scope(); + let thn = ir::ScheduleStmt::Block { + body: compile_ops_as_block(*thn, lexer, macrostab, macros)?, + }; + macros.close_scope(); + + macros.open_scope(); + let els = match els { + Some(els) => ir::ScheduleStmt::Block { + body: compile_ops_as_block(*els, lexer, macrostab, macros)?, + }, + None => ir::ScheduleStmt::Block { body: vec![] }, + }; + macros.close_scope(); + + Ok(vec![ir::ScheduleStmt::IfThenElse { + cond, + thn: Box::new(thn), + els: Box::new(els), + }]) + } parser::Stmt::MacroDecl { span: _, def } => { let parser::MacroDecl { name, @@ -380,6 +421,17 @@ fn compile_expr( on: selection, })) } + Appliable::Feature => match selection { + ir::Selector::Selection(mut args) if args.len() == 1 => { + Ok(ExprResult::Expr(ir::ScheduleExp::Feature { + feature: Box::new(args.pop().unwrap()), + })) + } + _ => Err(ScheduleCompilerError::SemanticError( + "feature requires exactly one argument as its selection".to_string(), + lexer.line_col(span), + )), + }, Appliable::Schedule(sched) => Ok(ExprResult::Stmt(ir::ScheduleStmt::AddSchedule { sched, on: selection, diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs index ab1495b816c99452560d03c0addf77a5aec18974..bacb4142c62df5b0bf974bcb1703400dd3caa02c 100644 --- a/juno_scheduler/src/ir.rs +++ b/juno_scheduler/src/ir.rs @@ -121,6 +121,9 @@ pub enum ScheduleExp { DeleteUncalled { on: Selector, }, + Feature { + feature: Box<ScheduleExp>, + }, Record { fields: Vec<(String, ScheduleExp)>, }, @@ -180,4 +183,9 @@ pub enum ScheduleStmt { device: Device, on: Selector, }, + IfThenElse { + cond: ScheduleExp, + thn: Box<ScheduleStmt>, + els: Box<ScheduleStmt>, + }, } diff --git a/juno_scheduler/src/lang.l b/juno_scheduler/src/lang.l index af154fce3d489b7b078607e52888d76d08f8e5fe..1f4f8723de4d950920d503a11c9145ad1d15d24a 100644 --- a/juno_scheduler/src/lang.l +++ b/juno_scheduler/src/lang.l @@ -20,12 +20,15 @@ \. "." apply "apply" +else "else" fixpoint "fixpoint" +if "if" let "let" macro "macro_keyword" on "on" set "set" target "target" +then "then" true "true" false "false" diff --git a/juno_scheduler/src/lang.y b/juno_scheduler/src/lang.y index 451f035b8122ac1a694567327a44776c9326fff2..55c82b9dc368e7cade9500fb852c9d48320be7d2 100644 --- a/juno_scheduler/src/lang.y +++ b/juno_scheduler/src/lang.y @@ -27,6 +27,10 @@ Stmt -> Stmt { Stmt::ExprStmt { span: $span, exp: $1 } } | 'fixpoint' FixpointLimit '{' Schedule '}' { Stmt::Fixpoint { span: $span, limit: $2, body: Box::new($4) } } + | 'if' Expr '{' Schedule '}' + { Stmt::IfThenElse { span: $span, cond: $2, thn: Box::new($4), els: None } } + | 'if' Expr '{' Schedule '}' 'else' '{' Schedule '}' + { Stmt::IfThenElse { span: $span, cond: $2, thn: Box::new($4), els: Some(Box::new($8)) } } | MacroDecl { Stmt::MacroDecl { span: $span, def: $1 } } ; @@ -163,6 +167,7 @@ pub enum Stmt { AssignStmt { span: Span, var: Span, rhs: Expr }, ExprStmt { span: Span, exp: Expr }, Fixpoint { span: Span, limit: FixpointLimit, body: Box<OperationList> }, + IfThenElse { span: Span, cond: Expr, thn: Box<OperationList>, els: Option<Box<OperationList>> }, MacroDecl { span: Span, def: MacroDecl }, } diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 456df2eda49b93a6c80327a090b6f6606ae711bb..4c981bd98ebd8eb605352cd3a78128c72e0e4ae9 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -16,6 +16,7 @@ use juno_utils::stringtab::StringTable; use std::cell::RefCell; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::env; use std::fmt; use std::fs::File; use std::io::Write; @@ -1090,7 +1091,9 @@ impl PassManager { let mut nvcc_process = Command::new("nvcc") .arg("-c") + .arg("-Xptxas") .arg("-O3") + .arg("-use_fast_math") .arg("-diag-suppress") .arg("177") .arg("-o") @@ -1197,6 +1200,23 @@ fn schedule_interpret( // were made Ok(i > 1) } + ScheduleStmt::IfThenElse { cond, thn, els } => { + let (cond, modified) = interp_expr(pm, cond, stringtab, env, functions)?; + let Value::Boolean { val: cond } = cond else { + return Err(SchedulerError::SemanticError( + "Condition must be a boolean value".to_string(), + )); + }; + let changed = schedule_interpret( + pm, + if cond { &*thn } else { &*els }, + stringtab, + env, + functions, + )?; + + Ok(modified || changed) + } ScheduleStmt::Block { body } => { let mut modified = false; env.open_scope(); @@ -1441,6 +1461,23 @@ fn interp_expr( changed, )) } + ScheduleExp::Feature { feature } => { + let (feature, modified) = interp_expr(pm, &*feature, stringtab, env, functions)?; + let Value::String { val } = feature else { + return Err(SchedulerError::SemanticError( + "Feature expects a single string argument (instead of a selection)".to_string(), + )); + }; + // To test for features, the scheduler needs to be invoked from a build script so that + // Cargo provides the enabled features via environment variables + let key = "CARGO_FEATURE_".to_string() + &val.to_uppercase().replace("-", "_"); + Ok(( + Value::Boolean { + val: env::var(key).is_ok(), + }, + modified, + )) + } ScheduleExp::Record { fields } => { let mut result = HashMap::new(); let mut changed = false;