Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • llvm/hercules
1 result
Show changes
Commits on Source (2)
Showing
with 246 additions and 109 deletions
/target
**/target
*.dot
!paper_resources/*.dot
*.bc
......@@ -14,3 +14,4 @@
.vscode
*_env
*.txt
*ncu-rep
\ No newline at end of file
......@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups;
// because Fork basic block's init section already does gating
write!(
w,
"{}{} = (threadIdx.x % {}) / {};\n",
"{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n",
tabs,
define_variable,
use_thread_quota.unwrap(),
use_thread_quota.unwrap() / parallel_factor.unwrap()
use_thread_quota.unwrap() / parallel_factor.unwrap(),
divide,
modulo,
)?;
}
}
......
......@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge(
op: BinaryOperator::Rem,
});
edit.sub_edit(tid, rem);
edit.sub_edit(tid, outer_tid);
edit = edit.replace_all_uses(tid, rem)?;
} else if tid_dim == inner_idx {
let outer_tid = Node::ThreadID {
......@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge(
op: BinaryOperator::Div,
});
edit.sub_edit(tid, div);
edit.sub_edit(tid, outer_tid);
edit = edit.replace_all_uses(tid, div)?;
}
}
......@@ -1479,7 +1481,12 @@ fn fork_fusion(
}
// Perform the fusion.
let bottom_tids: Vec<_> = editor
.get_users(bottom_fork)
.filter(|id| nodes[id.idx()].is_thread_id())
.collect();
editor.edit(|mut edit| {
edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?;
if bottom_join_pred != bottom_fork {
// If there is control flow in the bottom fork-join, stitch it into
// the top fork-join.
......
......@@ -85,7 +85,6 @@ pub fn interprocedural_sroa(
param_nodes[idx].push(id);
}
}
println!("{}", editor.func().name);
let success = editor.edit(|mut edit| {
for (idx, ids) in param_nodes.into_iter().enumerate() {
let new_indices = &old_param_type_map[idx];
......
......@@ -447,7 +447,7 @@ pub fn sroa(
field_map.insert(node, generate_reads(editor, types[&node], node));
}
Node::Constant { id } => {
field_map.insert(node, generate_constant_fields(editor, id));
field_map.insert(node, generate_constant_fields(editor, id, node));
to_delete.push(node);
}
_ => {
......@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID
// Given a constant cnst adds node to the function which are the constant values of each field and
// returns a list of pairs of indices and the node that holds that index
fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> {
fn generate_constant_fields(
editor: &mut FunctionEditor,
cnst: ConstantID,
old_node: NodeID,
) -> IndexTree<NodeID> {
let cs: Option<Vec<ConstantID>> =
if let Some(cs) = editor.get_constant(cnst).try_product_fields() {
Some(cs.into())
......@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In
if let Some(cs) = cs {
let mut fields = vec![];
for c in cs {
fields.push(generate_constant_fields(editor, c));
fields.push(generate_constant_fields(editor, c, old_node));
}
IndexTree::Node(fields)
} else {
let mut node = None;
editor.edit(|mut edit| {
node = Some(edit.add_node(Node::Constant { id: cnst }));
edit.sub_edit(old_node, node.unwrap());
Ok(edit)
});
IndexTree::Leaf(node.expect("Add node cannot fail"))
......
......@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
@image_loop for r = 0 to row {
for c = 0 to col {
@l2 let l2_dist : f32[num_ctrl_pts];
for cp = 0 to num_ctrl_pts {
@cp_loop for cp = 0 to num_ctrl_pts {
let v1 = input[0, r, c] - ctrl_pts[cp, 0];
let v2 = input[1, r, c] - ctrl_pts[cp, 1];
let v3 = input[2, r, c] - ctrl_pts[cp, 2];
......@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
@channel_loop for chan = 0 to CHAN {
let chan_val : f32 = 0.0;
for cp = 0 to num_ctrl_pts {
@cp_loop for cp = 0 to num_ctrl_pts {
chan_val += l2_dist[cp] * weights[cp, chan];
}
......
......@@ -115,7 +115,7 @@ array-slf(fuse4);
simpl!(fuse4);
let par = fuse4@image_loop \ fuse4@channel_loop;
fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par);
fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par);
let split = fork-split(par);
let fuse4_body = outline(split.cava_3.fj2);
......
......@@ -117,9 +117,9 @@ fixpoint {
simpl!(fuse4);
array-slf(fuse4);
simpl!(fuse4);
//fork-tile[2, 0, false, true](fuse4@channel_loop);
//fork-split(fuse4@channel_loop);
//clean-monoid-reduces(fuse4);
fork-tile[2, 0, false, true](fuse4@channel_loop);
let out = fork-split(fuse4@channel_loop);
fork-unroll(out.cava_3.fj1);
unforkify(fuse4@channel_loop);
no-memset(fuse5@res1);
......@@ -133,6 +133,13 @@ simpl!(fuse5);
array-slf(fuse5);
simpl!(fuse5);
fork-tile[4, 1, false, true](fuse4);
fork-tile[8, 0, false, true](fuse4);
fork-interchange[1, 2](fuse4);
let split = fork-split(fuse4);
fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
fork-coalesce(split.cava_3.fj2);
delete-uncalled(*);
simpl!(*);
......
......@@ -124,9 +124,9 @@ pub struct CavaInputs {
#[clap(long = "output-verify", value_name = "PATH")]
pub output_verify: Option<String>,
pub cam_model: String,
#[clap(short, long)]
#[clap(long)]
pub crop_rows: Option<usize>,
#[clap(short, long)]
#[clap(long)]
pub crop_cols: Option<usize>,
}
......
......@@ -26,14 +26,14 @@ predication(gaussian_smoothing);
simpl!(gaussian_smoothing);
let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par);
fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par);
let split = fork-split(par);
let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
simpl!(gaussian_smoothing, gaussian_smoothing_body);
no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
no-memset(laplacian_estimate@res);
fixpoint {
forkify(laplacian_estimate);
fork-guard-elim(laplacian_estimate);
......@@ -42,15 +42,15 @@ fixpoint {
simpl!(laplacian_estimate);
let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par);
fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par);
let split = fork-split(par);
let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2;
let body = split._1_laplacian_estimate.fj2;
let laplacian_estimate_body = outline(body);
fork-coalesce(laplacian_estimate, laplacian_estimate_body);
simpl!(laplacian_estimate, laplacian_estimate_body);
no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
no-memset(zero_crossings@res);
fixpoint {
forkify(zero_crossings);
fork-guard-elim(zero_crossings);
......@@ -59,10 +59,10 @@ fixpoint {
simpl!(zero_crossings);
let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par);
fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par);
let split = fork-split(par);
let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2;
let body = split._2_zero_crossings.fj2;
let zero_crossings_body = outline(body);
fork-coalesce(zero_crossings, zero_crossings_body);
simpl!(zero_crossings, zero_crossings_body);
......@@ -86,7 +86,7 @@ fixpoint {
simpl!(max_gradient);
fork-dim-merge(max_gradient);
simpl!(max_gradient);
fork-tile[16, 0, false, false](max_gradient);
fork-tile[32, 0, false, false](max_gradient);
let split = fork-split(max_gradient);
clean-monoid-reduces(max_gradient);
let out = outline(split._4_max_gradient.fj1);
......@@ -105,7 +105,7 @@ fixpoint {
predication(reject_zero_crossings);
simpl!(reject_zero_crossings);
fork-tile[4, 1, false, false](reject_zero_crossings);
fork-tile[4, 0, false, false](reject_zero_crossings);
fork-tile[8, 0, false, false](reject_zero_crossings);
fork-interchange[1, 2](reject_zero_crossings);
let split = fork-split(reject_zero_crossings);
let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
......
......@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>(
@image_loop for row = 0 to n {
for col = 0 to m {
// Copy data for dilation filter
@shr1 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else input[row + i - r, col + j - r];
}
}
// Compute pixel of dilated image
let dilated = MIN_BR;
@filter_loop for i = 0 to sz {
for j = 0 to sz {
dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
}
}
// Data copy for erotion filter
@shr2 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else input[row + i - r, col + j - r];
let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else input[row + i - r, col + j - r];
dilated = max!(dilated, filter * structure[i, j]);
}
}
......@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>(
let eroded = MAX_BR;
@filter_loop for i = 0 to sz {
for j = 0 to sz {
eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else input[row + i - r, col + j - r];
eroded = min!(eroded, filter * structure[i, j]);
}
}
......@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>(
@image_loop for row = 0 to n {
for col = 0 to m {
// Data copy for dilation filter
@shr1 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
}
}
// Compute the pixel of dilated image
let dilated = MIN_BR;
@filter_loop for i = 0 to sz {
for j = 0 to sz {
dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
}
}
// Data copy for erotion filter
@shr2 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
dilated = max!(dilated, filter * structure[i, j]);
}
}
......@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>(
let eroded = MAX_BR;
@filter_loop for i = 0 to sz {
for j = 0 to sz {
eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
eroded = min!(eroded, filter * structure[i, j]);
}
}
......@@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>(
let gx = 0;
let gy = 0;
for i = 0 to sb {
@filter_loop for i = 0 to sb {
for j = 0 to sb {
let val = input[if row + i < sbr then 0
else if row + i - sbr > n - 1 then n - 1
......
......@@ -26,22 +26,49 @@ predication(gaussian_smoothing);
simpl!(gaussian_smoothing);
predication(gaussian_smoothing);
simpl!(gaussian_smoothing);
fork-dim-merge(gaussian_smoothing@filter_loop);
unforkify(gaussian_smoothing@filter_loop);
simpl!(gaussian_smoothing);
no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
fork-dim-merge(gaussian_smoothing);
fork-tile[32, 0, false, true](gaussian_smoothing);
simpl!(gaussian_smoothing);
fork-split(gaussian_smoothing);
simpl!(gaussian_smoothing);
no-memset(laplacian_estimate@res);
fixpoint {
forkify(laplacian_estimate);
fork-guard-elim(laplacian_estimate);
fork-coalesce(laplacian_estimate);
}
simpl!(laplacian_estimate);
fork-dim-merge(laplacian_estimate@filter_loop);
unforkify(laplacian_estimate@filter_loop);
simpl!(laplacian_estimate);
no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
fork-dim-merge(laplacian_estimate);
fork-tile[32, 0, false, true](laplacian_estimate);
simpl!(laplacian_estimate);
fork-split(laplacian_estimate);
simpl!(laplacian_estimate);
no-memset(zero_crossings@res);
fixpoint {
forkify(zero_crossings);
fork-guard-elim(zero_crossings);
fork-coalesce(zero_crossings);
}
simpl!(zero_crossings);
fork-dim-merge(zero_crossings@filter_loop);
unforkify(zero_crossings@filter_loop);
simpl!(zero_crossings);
fork-dim-merge(zero_crossings);
fork-tile[32, 0, false, true](zero_crossings);
simpl!(zero_crossings);
fork-split(zero_crossings);
simpl!(zero_crossings);
no-memset(gradient@res);
fixpoint {
......@@ -53,6 +80,15 @@ predication(gradient);
simpl!(gradient);
predication(gradient);
simpl!(gradient);
fork-dim-merge(gradient@filter_loop);
unforkify(gradient@filter_loop);
simpl!(gradient);
fork-dim-merge(gradient);
fork-tile[32, 0, false, true](gradient);
simpl!(gradient);
fork-split(gradient);
simpl!(gradient);
fixpoint {
forkify(max_gradient);
......@@ -88,6 +124,12 @@ fixpoint {
predication(reject_zero_crossings);
simpl!(reject_zero_crossings);
fork-dim-merge(reject_zero_crossings);
fork-tile[32, 0, false, true](reject_zero_crossings);
simpl!(reject_zero_crossings);
fork-split(reject_zero_crossings);
simpl!(reject_zero_crossings);
async-call(edge_detection@le, edge_detection@zc);
simpl!(*);
......
......@@ -12,6 +12,7 @@ macro simpl!(X) {
simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
delete-uncalled(*);
simpl!(*);
ip-sroa[true](*);
......@@ -24,8 +25,31 @@ fixpoint {
fork-guard-elim(*);
}
simpl!(*);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
unforkify(compute_flux@inner_loop);
fork-tile[32, 0, false, false](compute_step_factor);
let split = fork-split(compute_step_factor);
let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
fork-coalesce(compute_step_factor, compute_step_factor_body);
simpl!(compute_step_factor, compute_step_factor_body);
fork-tile[32, 0, false, false](compute_flux);
let split = fork-split(compute_flux);
let compute_flux_body = outline(split._6_compute_flux.fj1);
fork-coalesce(compute_flux, compute_flux_body);
simpl!(compute_flux, compute_flux_body);
fork-tile[32, 0, false, false](time_step);
let split = fork-split(time_step);
let time_step_body = outline(split._7_time_step.fj1);
fork-coalesce(time_step, time_step_body);
simpl!(time_step, time_step_body);
fork-tile[32, 0, false, false](copy_vars);
let split = fork-split(copy_vars);
let copy_vars_body = outline(split._8_copy_vars.fj1);
fork-coalesce(copy_vars, copy_vars_body);
simpl!(copy_vars, copy_vars_body);
unforkify(*);
unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body);
gcm(*);
......@@ -24,7 +24,38 @@ fixpoint {
fork-guard-elim(*);
}
simpl!(*);
no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
unforkify(compute_flux@inner_loop);
unforkify(*);
fork-tile[32, 0, false, false](compute_step_factor);
let split = fork-split(compute_step_factor);
let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
fork-coalesce(compute_step_factor, compute_step_factor_body);
simpl!(compute_step_factor, compute_step_factor_body);
fork-tile[32, 0, false, false](compute_flux_contributions);
let split = fork-split(compute_flux_contributions);
let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
simpl!(compute_flux_contributions, compute_flux_contributions_body);
fork-tile[32, 0, false, false](compute_flux);
let split = fork-split(compute_flux);
let compute_flux_body = outline(split._7_compute_flux.fj1);
fork-coalesce(compute_flux, compute_flux_body);
simpl!(compute_flux, compute_flux_body);
fork-tile[32, 0, false, false](time_step);
let split = fork-split(time_step);
let time_step_body = outline(split._8_time_step.fj1);
fork-coalesce(time_step, time_step_body);
simpl!(time_step, time_step_body);
fork-tile[32, 0, false, false](copy_vars);
let split = fork-split(copy_vars);
let copy_vars_body = outline(split._9_copy_vars.fj1);
fork-coalesce(copy_vars, copy_vars_body);
simpl!(copy_vars, copy_vars_body);
unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body);
gcm(*);
......@@ -12,6 +12,7 @@ macro simpl!(X) {
simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
delete-uncalled(*);
gpu(copy_vars, compute_step_factor, compute_flux, time_step);
......@@ -26,9 +27,18 @@ fixpoint {
fork-guard-elim(*);
}
simpl!(*);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
unforkify(compute_flux@inner_loop);
fork-tile[32, 0, false, true](compute_step_factor);
fork-split(compute_step_factor);
fork-tile[32, 0, false, true](compute_flux);
fork-split(compute_flux);
fork-tile[32, 0, false, true](time_step);
fork-split(time_step);
fork-tile[32, 0, false, true](copy_vars);
fork-split(copy_vars);
unforkify(*);
float-collections(*);
gcm(*);
......@@ -12,6 +12,7 @@ macro simpl!(X) {
simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
delete-uncalled(*);
gpu(copy_vars, compute_step_factor, compute_flux_contributions, compute_flux, time_step);
......@@ -26,7 +27,21 @@ fixpoint {
fork-guard-elim(*);
}
simpl!(*);
unforkify(compute_flux@inner_loop);
fork-tile[32, 0, false, true](compute_step_factor);
fork-split(compute_step_factor);
fork-tile[32, 0, false, true](compute_flux_contributions);
fork-split(compute_flux_contributions);
fork-tile[32, 0, false, true](compute_flux);
fork-split(compute_flux);
fork-tile[32, 0, false, true](time_step);
fork-split(time_step);
fork-tile[32, 0, false, true](copy_vars);
fork-split(copy_vars);
unforkify(*);
float-collections(*);
gcm(*);
......@@ -48,8 +48,7 @@ fn run_euler(
let normals_z = HerculesImmBox::from(normals.z.as_slice());
let mut runner = runner!(euler);
let (density, momentum_x, momentum_y, momentum_z, energy) =
async_std::task::block_on(async {
let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
runner
.run(
nelr as u64,
......@@ -123,8 +122,7 @@ fn run_pre_euler(
let normals_z = HerculesImmBox::from(normals.z.as_slice());
let mut runner = runner!(pre_euler);
let (density, momentum_x, momentum_y, momentum_z, energy) =
async_std::task::block_on(async {
let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
runner
.run(
nelr as u64,
......@@ -189,15 +187,30 @@ fn compare_floats(xs: &Variables, ys: &Variables) -> bool {
let ys_energy = ys.energy.as_slice();
xs_density.len() == ys_density.len()
&& xs_density.iter().zip(ys_density.iter()).all(|(x, y)| compare_float(*x, *y))
&& xs_density
.iter()
.zip(ys_density.iter())
.all(|(x, y)| compare_float(*x, *y))
&& xs_momentum_x.len() == ys_momentum_x.len()
&& xs_momentum_x.iter().zip(ys_momentum_x.iter()).all(|(x, y)| compare_float(*x, *y))
&& xs_momentum_x
.iter()
.zip(ys_momentum_x.iter())
.all(|(x, y)| compare_float(*x, *y))
&& xs_momentum_y.len() == ys_momentum_y.len()
&& xs_momentum_y.iter().zip(ys_momentum_y.iter()).all(|(x, y)| compare_float(*x, *y))
&& xs_momentum_y
.iter()
.zip(ys_momentum_y.iter())
.all(|(x, y)| compare_float(*x, *y))
&& xs_momentum_z.len() == ys_momentum_z.len()
&& xs_momentum_z.iter().zip(ys_momentum_z.iter()).all(|(x, y)| compare_float(*x, *y))
&& xs_momentum_z
.iter()
.zip(ys_momentum_z.iter())
.all(|(x, y)| compare_float(*x, *y))
&& xs_energy.len() == ys_energy.len()
&& xs_energy.iter().zip(ys_energy.iter()).all(|(x, y)| compare_float(*x, *y))
&& xs_energy
.iter()
.zip(ys_energy.iter())
.all(|(x, y)| compare_float(*x, *y))
}
pub fn cfd_harness(args: CFDInputs) {
......@@ -224,6 +237,7 @@ pub fn cfd_harness(args: CFDInputs) {
} = read_domain_geometry(data_file, block_size);
let variables = initialize_variables(nelr, &ff_variable);
println!("Running CFD with nelr = {}.", nelr);
let res_juno = if pre_euler {
run_pre_euler(
......
......@@ -58,7 +58,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 {
}
fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] {
let step_factors : f32[nelr];
@res let step_factors : f32[nelr];
for i in 0..nelr {
let density = variables.density[i];
......@@ -109,10 +109,10 @@ fn compute_flux_contribution(
fn compute_flux_contributions<nelr: usize>(
variables: Variables::<nelr>,
) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) {
let fc_momentum_x: Momentum::<nelr>;
let fc_momentum_y: Momentum::<nelr>;
let fc_momentum_z: Momentum::<nelr>;
let fc_density_energy: Momentum::<nelr>;
@res let fc_momentum_x: Momentum::<nelr>;
@res let fc_momentum_y: Momentum::<nelr>;
@res let fc_momentum_z: Momentum::<nelr>;
@res let fc_density_energy: Momentum::<nelr>;
for i in 0..nelr {
let density_i = variables.density[i];
......@@ -167,9 +167,9 @@ fn compute_flux<nelr: usize>(
ff_fc_momentum_z: float3,
) -> Variables::<nelr> {
const smoothing_coefficient : f32 = 0.2;
let fluxes: Variables::<nelr>;
@res let fluxes: Variables::<nelr>;
for i in 0..nelr {
@outer_loop for i in 0..nelr {
let density_i = variables.density[i];
let momentum_i = float3 { x: variables.momentum.x[i],
......@@ -201,7 +201,7 @@ fn compute_flux<nelr: usize>(
let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 };
let flux_i_density_energy : f32 = 0.0;
for j in 0..NNB {
@inner_loop for j in 0..NNB {
let nb = elements_surrounding_elements[j, i];
let normal = float3 {
x: normals.x[j, i],
......@@ -328,7 +328,7 @@ fn time_step<nelr: usize>(
}
fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> {
let result : Variables::<nelr>;
@res let result : Variables::<nelr>;
for i in 0..nelr {
result.density[i] = variables.density[i];
......
......@@ -54,4 +54,10 @@ ip-sroa(*);
sroa(*);
simpl!(*);
fork-dim-merge(main_loops);
fork-tile[32, 0, false, true](main_loops);
dce(main_loops);
fork-split(main_loops);
simpl!(main_loops);
gcm(*);
......@@ -1090,7 +1090,9 @@ impl PassManager {
let mut nvcc_process = Command::new("nvcc")
.arg("-c")
.arg("-Xptxas")
.arg("-O3")
.arg("-use_fast_math")
.arg("-diag-suppress")
.arg("177")
.arg("-o")
......