Compare revisions

902f4dfc · 902f4dfc · 902f4dfc · 902f4dfc · 902f4dfc · 902f4dfc
--- a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
+gvn(*);
+dce(*);
+phi-elim(*);
+dce(*);
+crc(*);
+dce(*);
+slf(*);
+dce(*);
+
+let auto = auto-outline(pre_euler);
+gpu(auto.pre_euler);
+
+inline(auto.pre_euler);
+inline(auto.pre_euler);
+delete-uncalled(*);
+
+sroa[false](auto.pre_euler);
+dce(*);
+float-collections(*);
+dce(*);
+
+gcm(*);
+
--- a/juno_samples/rodinia/cfd/src/main.rs
+++ b/juno_samples/rodinia/cfd/src/main.rs
+#![feature(concat_idents)]
+mod rust_cfd;
+mod setup;
+
+use clap::Parser;
+
+use crate::setup::*;
+
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox, HerculesMutBoxTo};
+
+juno_build::juno!("euler");
+juno_build::juno!("pre_euler");
+
+#[derive(Parser)]
+#[clap(author, version, about, long_about = None)]
+struct CFDInputs {
+    data_file: String,
+    iterations: usize,
+    block_size: usize,
+    #[clap(short = None, long = Some("pre-euler"))]
+    pre_euler: bool,
+}
+
+fn run_euler(
+    nelr: usize,
+    iterations: usize,
+    mut variables: AlignedSlice<f32>,
+    areas: &[f32],
+    elements_surrounding_elements: &[i32],
+    normals: &[f32],
+    ff_variable: &[f32],
+    ff_fc_density_energy: &Float3,
+    ff_fc_momentum_x: &Float3,
+    ff_fc_momentum_y: &Float3,
+    ff_fc_momentum_z: &Float3,
+) -> Vec<f32> {
+    let mut variables = HerculesMutBox::from(variables.as_mut_slice());
+    let areas = HerculesImmBox::from(areas);
+    let elements_surrounding_elements = HerculesImmBox::from(elements_surrounding_elements);
+    let normals = HerculesImmBox::from(normals);
+    let ff_variable = HerculesImmBox::from(ff_variable);
+
+    // TODO: Make hercules box handle structs, for now we'll copy into a vec
+    let ff_fc_density_energy = vec![
+        ff_fc_density_energy.x,
+        ff_fc_density_energy.y,
+        ff_fc_density_energy.z,
+    ];
+    let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
+    let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
+    let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
+    let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
+    let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
+    let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
+    let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
+
+    let mut runner = runner!(euler);
+
+    HerculesMutBox::from(async_std::task::block_on(async {
+        runner
+            .run(
+                nelr as u64,
+                iterations as u64,
+                variables.to(),
+                areas.to(),
+                elements_surrounding_elements.to(),
+                normals.to(),
+                ff_variable.to(),
+                ff_fc_density_energy.to(),
+                ff_fc_momentum_x.to(),
+                ff_fc_momentum_y.to(),
+                ff_fc_momentum_z.to(),
+            )
+            .await
+    }))
+    .as_slice()
+    .to_vec()
+}
+
+fn run_pre_euler(
+    nelr: usize,
+    iterations: usize,
+    mut variables: AlignedSlice<f32>,
+    areas: &[f32],
+    elements_surrounding_elements: &[i32],
+    normals: &[f32],
+    ff_variable: &[f32],
+    ff_fc_density_energy: &Float3,
+    ff_fc_momentum_x: &Float3,
+    ff_fc_momentum_y: &Float3,
+    ff_fc_momentum_z: &Float3,
+) -> Vec<f32> {
+    let mut variables = HerculesMutBox::from(variables.as_mut_slice());
+    let areas = HerculesImmBox::from(areas);
+    let elements_surrounding_elements = HerculesImmBox::from(elements_surrounding_elements);
+    let normals = HerculesImmBox::from(normals);
+    let ff_variable = HerculesImmBox::from(ff_variable);
+
+    // TODO: Make hercules box handle structs, for now we'll copy into a vec
+    let ff_fc_density_energy = vec![
+        ff_fc_density_energy.x,
+        ff_fc_density_energy.y,
+        ff_fc_density_energy.z,
+    ];
+    let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
+    let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
+    let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
+    let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
+    let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
+    let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
+    let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
+
+    let mut runner = runner!(pre_euler);
+
+    let variables = variables.to();
+
+    HerculesMutBox::from(async_std::task::block_on(async {
+        runner
+            .run(
+                nelr as u64,
+                iterations as u64,
+                variables,
+                areas.to(),
+                elements_surrounding_elements.to(),
+                normals.to(),
+                ff_variable.to(),
+                ff_fc_density_energy.to(),
+                ff_fc_momentum_x.to(),
+                ff_fc_momentum_y.to(),
+                ff_fc_momentum_z.to(),
+            )
+            .await
+    }))
+    .as_slice()
+    .to_vec()
+}
+
+fn compare_float(x: f32, y: f32) -> bool {
+    (x - y).abs() < 1e-5
+}
+
+fn compare_floats(xs: &[f32], ys: &[f32]) -> bool {
+    xs.len() == ys.len() && xs.iter().zip(ys.iter()).all(|(x, y)| compare_float(*x, *y))
+}
+
+fn cfd_harness(args: CFDInputs) {
+    let CFDInputs {
+        data_file,
+        iterations,
+        block_size,
+        pre_euler,
+    } = args;
+
+    assert!(block_size % 8 == 0, "Hercules expects all arrays to be 32-byte aligned, cfd uses structs of arrays that are annoying to deal with if the block_size is not a multiple of 8");
+
+    let FarFieldConditions {
+        ff_variable,
+        ff_fc_momentum_x,
+        ff_fc_momentum_y,
+        ff_fc_momentum_z,
+        ff_fc_density_energy,
+    } = set_far_field_conditions();
+
+    let GeometryData {
+        nelr,
+        areas,
+        elements_surrounding_elements,
+        normals,
+    } = read_domain_geometry(data_file, block_size);
+
+    let variables = initialize_variables(nelr, ff_variable.as_slice());
+
+    let res_juno = if pre_euler {
+        run_pre_euler(
+            nelr,
+            iterations,
+            variables.clone(),
+            areas.as_slice(),
+            elements_surrounding_elements.as_slice(),
+            normals.as_slice(),
+            ff_variable.as_slice(),
+            &ff_fc_density_energy,
+            &ff_fc_momentum_x,
+            &ff_fc_momentum_y,
+            &ff_fc_momentum_z,
+        )
+    } else {
+        run_euler(
+            nelr,
+            iterations,
+            variables.clone(),
+            areas.as_slice(),
+            elements_surrounding_elements.as_slice(),
+            normals.as_slice(),
+            ff_variable.as_slice(),
+            &ff_fc_density_energy,
+            &ff_fc_momentum_x,
+            &ff_fc_momentum_y,
+            &ff_fc_momentum_z,
+        )
+    };
+    let res_rust = if pre_euler {
+        rust_cfd::pre_euler(
+            nelr,
+            iterations,
+            variables,
+            areas.as_slice(),
+            elements_surrounding_elements.as_slice(),
+            normals.as_slice(),
+            ff_variable.as_slice(),
+            &ff_fc_density_energy,
+            &ff_fc_momentum_x,
+            &ff_fc_momentum_y,
+            &ff_fc_momentum_z,
+        )
+    } else {
+        rust_cfd::euler(
+            nelr,
+            iterations,
+            variables,
+            areas.as_slice(),
+            elements_surrounding_elements.as_slice(),
+            normals.as_slice(),
+            ff_variable.as_slice(),
+            &ff_fc_density_energy,
+            &ff_fc_momentum_x,
+            &ff_fc_momentum_y,
+            &ff_fc_momentum_z,
+        )
+    };
+
+    if !compare_floats(&res_juno, res_rust.as_slice()) {
+        assert_eq!(res_juno.len(), res_rust.as_slice().len());
+        panic!("Mismatch in results");
+    }
+}
+
+fn main() {
+    let args = CFDInputs::parse();
+    cfd_harness(args);
+}
+
+#[test]
+fn test_euler() {
+    cfd_harness(CFDInputs {
+        data_file: "data/fvcorr.domn.097K".to_string(),
+        iterations: 1,
+        block_size: 8,
+        pre_euler: false,
+    });
+}
+
+#[test]
+fn test_pre_euler() {
+    cfd_harness(CFDInputs {
+        data_file: "data/fvcorr.domn.097K".to_string(),
+        iterations: 1,
+        block_size: 8,
+        pre_euler: true,
+    });
+}
--- a/juno_samples/rodinia/cfd/src/pre_euler.jn
+++ b/juno_samples/rodinia/cfd/src/pre_euler.jn
+// The Rodinia version of this kernel reshapes a lot of the arrays, in particular
+// it stores Variables and Momentum in an Array of Structs form rather than
+// Struct of Arrays.
+// It stores normals as float3[nelr, NNB]
+// It also stores elements_surrounding_elements as i32[nelr, NNB] rather
+// than i32[NNB, i32]
+// The HPVM version does not make this change, and neither am I going to, but
+// we should consider whether this transformation has performance benefits in
+// this case and if so consider implementing the necessary transformations to
+// achieve it without changing the code
+
+const NNB : usize = 4;
+
+type Normals<nelr: usize> = struct {
+  x: f32[NNB, nelr],
+  y: f32[NNB, nelr],
+  z: f32[NNB, nelr],
+};
+
+type Momentum<nelr: usize> = struct {
+  x: f32[nelr],
+  y: f32[nelr],
+  z: f32[nelr],
+};
+
+type Variables<nelr: usize> = struct {
+  density: f32[nelr],
+  momentum: Momentum::<nelr>,
+  energy: f32[nelr],
+};
+
+type float3 = struct { x: f32, y: f32, z: f32 };
+
+type Variable = struct {
+  density: f32,
+  momentum: float3,
+  energy: f32,
+};
+
+fn compute_velocity(density: f32, momentum: float3) -> float3 {
+  return float3 { x: momentum.x / density,
+                  y: momentum.y / density,
+                  z: momentum.z / density };
+}
+
+fn compute_speed_sqd(velocity: float3) -> f32 {
+  return velocity.x * velocity.x + velocity.y * velocity.y + velocity.z * velocity.z;
+}
+
+const GAMMA : f32 = 1.4;
+
+fn compute_pressure(density: f32, density_energy: f32, speed_sqd: f32) -> f32 {
+  return (GAMMA - 1.0) * (density_energy - 0.5 * density * speed_sqd);
+}
+
+fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 {
+  return sqrt!(GAMMA * pressure / density);
+}
+
+fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] {
+  let step_factors : f32[nelr];
+
+  for i in 0..nelr {
+    let density = variables.density[i];
+
+    let momentum : float3;
+    momentum.x = variables.momentum.x[i];
+    momentum.y = variables.momentum.y[i];
+    momentum.z = variables.momentum.z[i];
+
+    let density_energy = variables.energy[i];
+
+    let velocity       = compute_velocity(density, momentum);
+    let speed_sqd      = compute_speed_sqd(velocity);
+    let pressure       = compute_pressure(density, density_energy, speed_sqd);
+    let speed_of_sound = compute_speed_of_sound(density, pressure);
+
+    step_factors[i] = 0.5 / (sqrt!(areas[i]) * (sqrt!(speed_sqd) + speed_of_sound));
+  }
+
+  return step_factors;
+}
+
+fn compute_flux_contribution(
+  density: f32,
+  momentum: float3,
+  density_energy: f32,
+  pressure: f32,
+  velocity: float3,
+) -> (float3, float3, float3, float3) {
+  let fc_momentum_x = float3 { x: velocity.x * momentum.x + pressure,
+                               y: velocity.x * momentum.y,
+                               z: velocity.x * momentum.z };
+  let fc_momentum_y = float3 { x: fc_momentum_x.y,
+                               y: velocity.y * momentum.y + pressure,
+                               z: velocity.y * momentum.z };
+  let fc_momentum_z = float3 { x: fc_momentum_x.z,
+                               y: fc_momentum_y.z,
+                               z: velocity.z * momentum.z + pressure };
+  
+  let de_p = density_energy + pressure;
+  let fc_density_energy = float3 { x: velocity.x * de_p,
+                                   y: velocity.y * de_p,
+                                   z: velocity.z * de_p };
+
+  return (fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy);
+}
+
+fn compute_flux_contributions<nelr: usize>(
+  variables: Variables::<nelr>,
+) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) {
+  let fc_momentum_x: Momentum::<nelr>;
+  let fc_momentum_y: Momentum::<nelr>;
+  let fc_momentum_z: Momentum::<nelr>;
+  let fc_density_energy: Momentum::<nelr>;
+
+  for i in 0..nelr {
+    let density_i = variables.density[i];
+
+    let momentum_i = float3 { x: variables.momentum.x[i],
+                              y: variables.momentum.y[i],
+                              z: variables.momentum.z[i] };
+
+    let density_energy_i = variables.energy[i];
+
+    let velocity_i       = compute_velocity(density_i, momentum_i);
+    let speed_sqd_i      = compute_speed_sqd(velocity_i);
+    let speed_i          = sqrt!(speed_sqd_i);
+    let pressure_i       = compute_pressure(density_i, density_energy_i, speed_sqd_i);
+    let speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
+
+    let (fc_i_momentum_x, fc_i_momentum_y, fc_i_momentum_z, fc_i_density_energy)
+      = compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i);
+
+    fc_momentum_x.x[i] = fc_i_momentum_x.x;
+    fc_momentum_x.y[i] = fc_i_momentum_x.y;
+    fc_momentum_x.z[i] = fc_i_momentum_x.z;
+
+    fc_momentum_y.x[i] = fc_i_momentum_y.x;
+    fc_momentum_y.y[i] = fc_i_momentum_y.y;
+    fc_momentum_y.z[i] = fc_i_momentum_y.z;
+
+    fc_momentum_z.x[i] = fc_i_momentum_z.x;
+    fc_momentum_z.y[i] = fc_i_momentum_z.y;
+    fc_momentum_z.z[i] = fc_i_momentum_z.z;
+
+    fc_density_energy.x[i] = fc_i_density_energy.x;
+    fc_density_energy.y[i] = fc_i_density_energy.y;
+    fc_density_energy.z[i] = fc_i_density_energy.z;
+  }
+
+  return (fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy);
+}
+
+fn compute_flux<nelr: usize>(
+  variables: Variables::<nelr>,
+  elements_surrounding_elements: i32[NNB, nelr],
+  normals: Normals::<nelr>,
+  ff_variable: Variable,
+  fc_momentum_x: Momentum::<nelr>,
+  fc_momentum_y: Momentum::<nelr>,
+  fc_momentum_z: Momentum::<nelr>,
+  fc_density_energy: Momentum::<nelr>,
+  ff_fc_density_energy: float3,
+  ff_fc_momentum_x: float3,
+  ff_fc_momentum_y: float3,
+  ff_fc_momentum_z: float3,
+) -> Variables::<nelr> {
+  const smoothing_coefficient : f32 = 0.2;
+  let fluxes: Variables::<nelr>;
+
+  for i in 0..nelr {
+    let density_i = variables.density[i];
+
+    let momentum_i = float3 { x: variables.momentum.x[i],
+                              y: variables.momentum.y[i],
+                              z: variables.momentum.z[i] };
+
+    let density_energy_i = variables.energy[i];
+
+    let velocity_i       = compute_velocity(density_i, momentum_i);
+    let speed_sqd_i      = compute_speed_sqd(velocity_i);
+    let speed_i          = sqrt!(speed_sqd_i);
+    let pressure_i       = compute_pressure(density_i, density_energy_i, speed_sqd_i);
+    let speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
+
+    let fc_i_momentum_x = float3 { x: fc_momentum_x.x[i],
+                                   y: fc_momentum_x.y[i],
+                                   z: fc_momentum_x.z[i] };
+    let fc_i_momentum_y = float3 { x: fc_momentum_y.x[i],
+                                   y: fc_momentum_y.y[i],
+                                   z: fc_momentum_y.z[i] };
+    let fc_i_momentum_z = float3 { x: fc_momentum_z.x[i],
+                                   y: fc_momentum_z.y[i],
+                                   z: fc_momentum_z.z[i] };
+    let fc_i_density_energy = float3 { x: fc_density_energy.x[i],
+                                       y: fc_density_energy.y[i],
+                                       z: fc_density_energy.z[i] };
+
+    let flux_i_density : f32 = 0;
+    let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 };
+    let flux_i_density_energy : f32 = 0.0;
+
+    for j in 0..NNB {
+      let nb = elements_surrounding_elements[j, i];
+      let normal = float3 {
+        x: normals.x[j, i],
+        y: normals.y[j, i],
+        z: normals.z[j, i],
+      };
+      let normal_len = sqrt!(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);
+
+      if nb >= 0 { // a legitimate neighbor
+        let nb = nb as usize;
+        let density_nb = variables.density[nb];
+        let momentum_nb = float3 {
+          x: variables.momentum.x[nb],
+          y: variables.momentum.y[nb],
+          z: variables.momentum.z[nb],
+        };
+        let density_energy_nb = variables.energy[nb];
+        let velocity_nb       = compute_velocity(density_nb, momentum_nb);
+        let speed_sqd_nb      = compute_speed_sqd(velocity_nb);
+        let pressure_nb       = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
+        let speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
+
+        let fc_nb_momentum_x = float3 { x: fc_momentum_x.x[nb],
+                                        y: fc_momentum_x.y[nb],
+                                        z: fc_momentum_x.z[nb] };
+        let fc_nb_momentum_y = float3 { x: fc_momentum_y.x[nb],
+                                        y: fc_momentum_y.y[nb],
+                                        z: fc_momentum_y.z[nb] };
+        let fc_nb_momentum_z = float3 { x: fc_momentum_z.x[nb],
+                                        y: fc_momentum_z.y[nb],
+                                        z: fc_momentum_z.z[nb] };
+        let fc_nb_density_energy = float3 { x: fc_density_energy.x[nb],
+                                            y: fc_density_energy.y[nb],
+                                            z: fc_density_energy.z[nb] };
+
+        // artificial viscosity
+        let factor = -normal_len * smoothing_coefficient * 0.5
+                   * (speed_i + sqrt!(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb);
+        flux_i_density += factor * (density_i - density_nb);
+        flux_i_density_energy += factor * (density_energy_i - density_energy_nb);
+        flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x);
+        flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y);
+        flux_i_momentum.z += factor * (momentum_i.z - momentum_nb.z);
+
+        // accumulate cell-centered fluxes
+        let factor = 0.5 * normal.x;
+        flux_i_density += factor * (momentum_nb.x + momentum_i.x);
+        flux_i_density_energy += factor * (fc_nb_density_energy.x + fc_i_density_energy.x);
+        flux_i_momentum.x += factor * (fc_nb_momentum_x.x + fc_i_momentum_x.x);
+        flux_i_momentum.y += factor * (fc_nb_momentum_y.x + fc_i_momentum_y.x);
+        flux_i_momentum.z += factor * (fc_nb_momentum_z.x + fc_i_momentum_z.x);
+
+        let factor = 0.5 * normal.y;
+        flux_i_density += factor * (momentum_nb.y + momentum_i.y);
+        flux_i_density_energy += factor * (fc_nb_density_energy.y + fc_i_density_energy.y);
+        flux_i_momentum.x += factor * (fc_nb_momentum_x.y + fc_i_momentum_x.y);
+        flux_i_momentum.y += factor * (fc_nb_momentum_y.y + fc_i_momentum_y.y);
+        flux_i_momentum.z += factor * (fc_nb_momentum_z.y + fc_i_momentum_z.y);
+
+        let factor = 0.5 * normal.z;
+        flux_i_density += factor * (momentum_nb.z + momentum_i.z);
+        flux_i_density_energy += factor * (fc_nb_density_energy.z + fc_i_density_energy.z);
+        flux_i_momentum.x += factor * (fc_nb_momentum_x.z + fc_i_momentum_x.z);
+        flux_i_momentum.y += factor * (fc_nb_momentum_y.z + fc_i_momentum_y.z);
+        flux_i_momentum.z += factor * (fc_nb_momentum_z.z + fc_i_momentum_z.z);
+      } else if nb == -1 { // a wing boundary
+        flux_i_momentum.x += normal.x * pressure_i;
+        flux_i_momentum.y += normal.y * pressure_i;
+        flux_i_momentum.z += normal.z * pressure_i;
+      } else if nb == -2 { // a far field boundary
+        let factor = 0.5 * normal.x;
+        flux_i_density += factor * (ff_variable.momentum.x + momentum_i.x);
+        flux_i_density_energy += factor * (ff_fc_density_energy.x + fc_i_density_energy.x);
+        flux_i_momentum.x += factor * (ff_fc_momentum_x.x + fc_i_momentum_x.x);
+        flux_i_momentum.y += factor * (ff_fc_momentum_y.x + fc_i_momentum_y.x);
+        flux_i_momentum.z += factor * (ff_fc_momentum_z.x + fc_i_momentum_z.x);
+
+        let factor = 0.5 * normal.y;
+        flux_i_density += factor * (ff_variable.momentum.y + momentum_i.y);
+        flux_i_density_energy += factor * (ff_fc_density_energy.y + fc_i_density_energy.y);
+        flux_i_momentum.x += factor * (ff_fc_momentum_x.y + fc_i_momentum_x.y);
+        flux_i_momentum.y += factor * (ff_fc_momentum_y.y + fc_i_momentum_y.y);
+        flux_i_momentum.z += factor * (ff_fc_momentum_z.y + fc_i_momentum_z.y);
+
+        let factor = 0.5 * normal.z;
+        flux_i_density += factor * (ff_variable.momentum.y + momentum_i.z);
+        flux_i_density_energy += factor * (ff_fc_density_energy.z + fc_i_density_energy.z);
+        flux_i_momentum.x += factor * (ff_fc_momentum_x.z + fc_i_momentum_x.z);
+        flux_i_momentum.y += factor * (ff_fc_momentum_y.z + fc_i_momentum_y.z);
+        flux_i_momentum.z += factor * (ff_fc_momentum_z.z + fc_i_momentum_z.z);
+      }
+    }
+
+    fluxes.density[i] = flux_i_density;
+    fluxes.momentum.x[i] = flux_i_momentum.x;
+    fluxes.momentum.y[i] = flux_i_momentum.y;
+    fluxes.momentum.z[i] = flux_i_momentum.z;
+    fluxes.energy[i] = flux_i_density_energy;
+  }
+
+  return fluxes;
+}
+
+const RK : usize = 3;
+
+fn time_step<nelr: usize>(
+  j: usize,
+  old_variables: Variables::<nelr>,
+  variables: Variables::<nelr>,
+  step_factors: f32[nelr],
+  fluxes: Variables::<nelr>,
+) -> Variables::<nelr> {
+  for i in 0..nelr {
+    let factor = step_factors[i] / (RK + 1 - j) as f32;
+
+    variables.density[i]    = old_variables.density[i]    + factor * fluxes.density[i];
+    variables.momentum.x[i] = old_variables.momentum.x[i] + factor * fluxes.momentum.x[i];
+    variables.momentum.y[i] = old_variables.momentum.y[i] + factor * fluxes.momentum.y[i];
+    variables.momentum.z[i] = old_variables.momentum.z[i] + factor * fluxes.momentum.z[i];
+    variables.energy[i]     = old_variables.energy[i]     + factor * fluxes.energy[i];
+  }
+
+  return variables;
+}
+
+fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> {
+  let result : Variables::<nelr>;
+
+  for i in 0..nelr {
+    result.density[i] = variables.density[i];
+    result.momentum.x[i] = variables.momentum.x[i];
+    result.momentum.y[i] = variables.momentum.y[i];
+    result.momentum.z[i] = variables.momentum.z[i];
+    result.energy[i] = variables.energy[i];
+  }
+
+  return result;
+}
+
+#[entry]
+fn pre_euler<nelr: usize>(
+  iterations: usize,
+  variables: Variables::<nelr>,
+  areas: f32[nelr],
+  elements_surrounding_elements: i32[NNB, nelr],
+  normals: Normals::<nelr>,
+  ff_variable: Variable,
+  ff_fc_density_energy: float3,
+  ff_fc_momentum_x: float3,
+  ff_fc_momentum_y: float3,
+  ff_fc_momentum_z: float3,
+) -> Variables::<nelr> {
+  for i in 0..iterations {
+    let old_variables = copy_vars::<nelr>(variables);
+    let step_factors = compute_step_factor::<nelr>(variables, areas);
+
+    for j in 0..RK {
+      let (fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy)
+        = compute_flux_contributions::<nelr>(variables);
+      let fluxes = compute_flux::<nelr>(variables, elements_surrounding_elements,
+                                        normals, ff_variable, fc_momentum_x,
+                                        fc_momentum_y, fc_momentum_z, fc_density_energy,
+                                        ff_fc_density_energy, ff_fc_momentum_x,
+                                        ff_fc_momentum_y, ff_fc_momentum_z);
+      variables = time_step::<nelr>(j, old_variables, variables, step_factors, fluxes);
+    }
+  }
+
+  return variables;
+}
--- a/juno_samples/rodinia/cfd/src/rust_cfd.rs
+++ b/juno_samples/rodinia/cfd/src/rust_cfd.rs
--- a/juno_samples/rodinia/cfd/src/setup.rs
+++ b/juno_samples/rodinia/cfd/src/setup.rs
+use std::fs::File;
+use std::io::Read;
+use std::path::Path;
+use std::str::FromStr;
+
+use std::ops::{Index, IndexMut};
+
+use nom::Parser;
+
+use crate::rust_cfd;
+
+pub const GAMMA: f32 = 1.4;
+pub const ff_mach: f32 = 1.2;
+pub const NDIM: usize = 3;
+pub const NNB: usize = 4;
+pub const RK: usize = 3;
+pub const VAR_DENSITY: usize = 0;
+pub const VAR_MOMENTUM: usize = 1;
+pub const VAR_DENSITY_ENERGY: usize = VAR_MOMENTUM + NDIM;
+pub const NVAR: usize = VAR_DENSITY_ENERGY + 1;
+pub const deg_angle_of_attack: f32 = 0.0;
+
+#[repr(align(32))]
+struct Alignment([u8; 32]);
+
+// An aligned slice is stored as a boxed slice and an offset number of elements
+// that we skip over to get the desired alignment (of 32 bytes)
+pub struct AlignedSlice<T> {
+    slice: Box<[T]>,
+    offset: usize,
+    length: usize,
+}
+
+impl<T> AlignedSlice<T>
+where
+    T: Default,
+{
+    pub fn of_len(len: usize) -> Self {
+        // The maximum number of elements that may be waisted in getting the alignment we need is
+        // (32 - alignment of T) / size of T
+        let extra_elements = (32 - std::mem::align_of::<T>()) / std::mem::size_of::<T>();
+        let slice: Box<[T]> = (0..len + extra_elements)
+            .map(|_| Default::default())
+            .collect();
+        let offset = unsafe { slice.align_to::<Alignment>().0.len() };
+        assert!(offset <= extra_elements);
+
+        AlignedSlice {
+            slice,
+            offset,
+            length: len,
+        }
+    }
+}
+
+impl<T> AlignedSlice<T> {
+    pub fn as_slice(&self) -> &[T] {
+        &self.slice[self.offset..self.offset + self.length]
+    }
+
+    pub fn as_mut_slice(&mut self) -> &mut [T] {
+        &mut self.slice[self.offset..self.offset + self.length]
+    }
+}
+
+impl<T> Index<usize> for AlignedSlice<T> {
+    type Output = T;
+
+    fn index(&self, index: usize) -> &Self::Output {
+        &self.slice[index + self.offset]
+    }
+}
+
+impl<T> IndexMut<usize> for AlignedSlice<T> {
+    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
+        &mut self.slice[index + self.offset]
+    }
+}
+
+// TODO: I don't think Default should be needed here, but it would require some
+// other constructor probably one that uses Clone an an element rather than Default
+impl<T> Clone for AlignedSlice<T>
+where
+    T: Clone + Default,
+{
+    fn clone(&self) -> Self {
+        let mut new_slice = Self::of_len(self.length);
+        new_slice.as_mut_slice().clone_from_slice(self.as_slice());
+        new_slice
+    }
+}
+
+#[repr(C)]
+pub struct Float3 {
+    pub x: f32,
+    pub y: f32,
+    pub z: f32,
+}
+
+pub struct FarFieldConditions {
+    pub ff_variable: AlignedSlice<f32>,
+    pub ff_fc_momentum_x: Float3,
+    pub ff_fc_momentum_y: Float3,
+    pub ff_fc_momentum_z: Float3,
+    pub ff_fc_density_energy: Float3,
+}
+
+pub fn set_far_field_conditions() -> FarFieldConditions {
+    let mut ff_variable: AlignedSlice<f32> = AlignedSlice::of_len(NVAR);
+
+    let angle_of_attack = std::f32::consts::PI / 180.0 * deg_angle_of_attack;
+
+    ff_variable[VAR_DENSITY] = 1.4;
+
+    let ff_pressure = 1.0;
+    let ff_speed_of_sound = (GAMMA * ff_pressure / ff_variable[VAR_DENSITY]).sqrt();
+    let ff_speed = ff_mach * ff_speed_of_sound;
+
+    let ff_velocity = Float3 {
+        x: ff_speed * angle_of_attack.cos(),
+        y: ff_speed * angle_of_attack.sin(),
+        z: 0.0,
+    };
+
+    ff_variable[VAR_MOMENTUM + 0] = ff_variable[VAR_DENSITY] * ff_velocity.x;
+    ff_variable[VAR_MOMENTUM + 1] = ff_variable[VAR_DENSITY] * ff_velocity.y;
+    ff_variable[VAR_MOMENTUM + 2] = ff_variable[VAR_DENSITY] * ff_velocity.z;
+
+    ff_variable[VAR_DENSITY_ENERGY] =
+        ff_variable[VAR_DENSITY] * (0.5 * (ff_speed * ff_speed)) + (ff_pressure / (GAMMA - 1.0));
+
+    let ff_momentum = Float3 {
+        x: ff_variable[VAR_MOMENTUM + 0],
+        y: ff_variable[VAR_MOMENTUM + 1],
+        z: ff_variable[VAR_MOMENTUM + 2],
+    };
+    let (ff_fc_momentum_x, ff_fc_momentum_y, ff_fc_momentum_z, ff_fc_density_energy) =
+        rust_cfd::compute_flux_contribution(
+            ff_variable[VAR_DENSITY],
+            &ff_momentum,
+            ff_variable[VAR_DENSITY_ENERGY],
+            ff_pressure,
+            &ff_velocity,
+        );
+
+    FarFieldConditions {
+        ff_variable,
+        ff_fc_momentum_x,
+        ff_fc_momentum_y,
+        ff_fc_momentum_z,
+        ff_fc_density_energy,
+    }
+}
+
+pub struct GeometryData {
+    pub nelr: usize,
+    pub areas: AlignedSlice<f32>,
+    pub elements_surrounding_elements: AlignedSlice<i32>,
+    pub normals: AlignedSlice<f32>,
+}
+
+pub fn read_domain_geometry<P>(path: P, block_size: usize) -> GeometryData
+where
+    P: AsRef<Path>,
+{
+    let mut file = File::open(path).expect("Error opening input file");
+    let mut contents = String::new();
+    file.read_to_string(&mut contents)
+        .expect("Error reading input file");
+
+    let mut parser = nom::combinator::all_consuming(|s| cfd_parser(block_size, s));
+    let (_, result) = parser.parse(&contents).expect("Parser error");
+
+    result
+}
+
+fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, GeometryData> {
+    let text = nom::character::complete::multispace0(text)?.0;
+    let (text, nel) = nom::character::complete::digit1(text)?;
+
+    let nel = usize::from_str(nel).unwrap();
+    let nelr = block_size * ((nel / block_size) + usize::min(1, nel % block_size));
+
+    let mut areas: AlignedSlice<f32> = AlignedSlice::of_len(nelr);
+    let mut elements_surrounding_elements: AlignedSlice<i32> = AlignedSlice::of_len(nelr * NNB);
+    let mut normals: AlignedSlice<f32> = AlignedSlice::of_len(NDIM * NNB * nelr);
+
+    // read in data
+    let mut text = text;
+    for i in 0..nel {
+        let t = nom::character::complete::multispace0(text)?.0;
+        let (t, val) = nom::number::complete::float(t)?;
+        text = t;
+
+        areas[i] = val;
+
+        for j in 0..NNB {
+            let t = nom::character::complete::multispace0(text)?.0;
+            let (t, neg) = if let Ok((t, _)) =
+                nom::character::complete::char::<&str, nom::error::Error<&str>>('-')(t)
+            {
+                (t, true)
+            } else {
+                (t, false)
+            };
+            let (t, val) = nom::character::complete::digit1(t)?;
+            text = t;
+
+            let val = i32::from_str(val).unwrap();
+            let val = if neg { -val } else { val };
+            elements_surrounding_elements[i + j * nelr] = if val < 0 { -1 } else { val } - 1; // it's coming in with Fortran numbering
+
+            for k in 0..NDIM {
+                let t = nom::character::complete::multispace0(text)?.0;
+                let (t, val) = nom::number::complete::float(t)?;
+                text = t;
+
+                normals[i + (j + k * NNB) * nelr] = -val;
+            }
+        }
+    }
+
+    // fill in remaining data
+    let last = nel - 1;
+    for i in nel..nelr {
+        areas[i] = areas[last];
+        for j in 0..NNB {
+            elements_surrounding_elements[i + j * nelr] =
+                elements_surrounding_elements[last + j * nelr];
+            for k in 0..NDIM {
+                normals[i + (j + k * NNB) * nelr] = normals[last + (j + k * NNB) * nelr];
+            }
+        }
+    }
+
+    let text = nom::character::complete::multispace0(text)?.0;
+    Ok((
+        text,
+        GeometryData {
+            nelr,
+            areas,
+            elements_surrounding_elements,
+            normals,
+        },
+    ))
+}
+
+pub fn initialize_variables(nelr: usize, ff_variable: &[f32]) -> AlignedSlice<f32> {
+    let mut variables = AlignedSlice::of_len(nelr * NVAR);
+
+    for i in 0..nelr {
+        for j in 0..NVAR {
+            variables[i + j * nelr] = ff_variable[j];
+        }
+    }
+
+    variables
+}
--- a/juno_samples/rodinia/srad/Cargo.toml
+++ b/juno_samples/rodinia/srad/Cargo.toml
+[package]
+name = "juno_srad"
+version = "0.1.0"
+authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
+edition = "2021"
+
+[[bin]]
+name = "juno_srad"
+path = "src/main.rs"
+
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
+[build-dependencies]
+juno_build = { path = "../../../juno_build" }
+
+[dependencies]
+juno_build = { path = "../../../juno_build" }
+hercules_rt = { path = "../../../hercules_rt" }
+async-std = "*"
+clap = { version = "*", features = ["derive"] }
+with_builtin_macros = "0.1.0"
+nom = "*"
--- a/juno_samples/rodinia/srad/build.rs
+++ b/juno_samples/rodinia/srad/build.rs
+use juno_build::JunoCompiler;
+
+fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("srad.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    JunoCompiler::new()
+        .file_in_src("srad.jn")
+        .unwrap()
+        .build()
+        .unwrap();
+}
--- a/juno_samples/rodinia/srad/data/LICENSE
+++ b/juno_samples/rodinia/srad/data/LICENSE
+LICENSE TERMS
+
+Copyright (c)2008-2011 University of Virginia
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted without royalty fees or other restrictions, provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the name of the University of Virginia, the Dept. of Computer Science, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF VIRGINIA OR THE SOFTWARE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use this software or a modified version of it, please cite the most relevant among the following papers:
+
+- M. A. Goodrum, M. J. Trotter, A. Aksel, S. T. Acton, and K. Skadron. Parallelization of Particle Filter Algorithms. In Proceedings 
+of the 3rd Workshop on Emerging Applications and Many-core Architecture (EAMA), in conjunction with the IEEE/ACM International 
+Symposium on Computer Architecture (ISCA), June 2010.
+
+- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, Sang-Ha Lee and K. Skadron.
+"Rodinia: A Benchmark Suite for Heterogeneous Computing". IEEE International Symposium
+on Workload Characterization, Oct 2009.
+
+- J. Meng and K. Skadron. "Performance Modeling and Automatic Ghost Zone Optimization
+for Iterative Stencil Loops on GPUs." In Proceedings of the 23rd Annual ACM International
+Conference on Supercomputing (ICS), June 2009.
+
+- L.G. Szafaryn, K. Skadron and J. Saucerman. "Experiences Accelerating MATLAB Systems
+Biology Applications." in Workshop on Biomedicine in Computing (BiC) at the International
+Symposium on Computer Architecture (ISCA), June 2009.
+
+- M. Boyer, D. Tarjan, S. T. Acton, and K. Skadron. "Accelerating Leukocyte Tracking using CUDA:
+A Case Study in Leveraging Manycore Coprocessors." In Proceedings of the International Parallel
+and Distributed Processing Symposium (IPDPS), May 2009.
+
+- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, and K. Skadron. "A Performance
+Study of General Purpose Applications on Graphics Processors using CUDA" Journal of
+Parallel and Distributed Computing, Elsevier, June 2008.
--- a/juno_samples/rodinia/srad/data/README.md
+++ b/juno_samples/rodinia/srad/data/README.md
+SRAD Example from Rodinia Benchmark Suite 3.1
+
+The data provided herein are governed by the [LICENSE](./LICENSE).
--- a/juno_samples/rodinia/srad/data/image.pgm
+++ b/juno_samples/rodinia/srad/data/image.pgm
--- a/juno_samples/rodinia/srad/src/gpu.sch
+++ b/juno_samples/rodinia/srad/src/gpu.sch
+gvn(*);
+dce(*);
+phi-elim(*);
+dce(*);
+crc(*);
+dce(*);
+slf(*);
+dce(*);
+
+let auto = auto-outline(srad);
+gpu(auto.srad);
+
+inline(auto.srad);
+inline(auto.srad);
+delete-uncalled(*);
+
+sroa[false](auto.srad);
+dce(*);
+float-collections(*);
+dce(*);
+
+gcm(*);
+
--- a/juno_samples/rodinia/srad/src/graphics.rs
+++ b/juno_samples/rodinia/srad/src/graphics.rs
+use std::fs::File;
+use std::io::{Read, Write};
+use std::path::Path;
+use std::str::FromStr;
+
+use nom::Parser;
+
+pub struct Image {
+    pub image: Vec<f32>,
+    pub max: f32,
+    pub rows: usize,
+    pub cols: usize,
+}
+
+pub fn read_graphics<P>(path: P) -> Image
+where
+    P: AsRef<Path>,
+{
+    let mut file = File::open(path).expect("Error opening input file");
+    let mut contents = String::new();
+    file.read_to_string(&mut contents)
+        .expect("Error reading input file");
+
+    let mut parser = nom::combinator::all_consuming(image_parser);
+    let (_, result) = parser.parse(&contents).expect("Parser error");
+
+    result
+}
+
+pub fn write_graphics<P>(path: P, image: &[f32], rows: usize, cols: usize, max: f32)
+where
+    P: AsRef<Path>,
+{
+    let mut file = File::create(path).expect("Error opening output file");
+    let f = &mut file;
+
+    write!(f, "P2\n");
+    write!(f, "{} {}\n", cols, rows);
+    write!(f, "{}\n", max as i32);
+    for i in 0..rows {
+        for j in 0..cols {
+            write!(f, "{} ", image[j * rows + i] as i32);
+        }
+        write!(f, "\n");
+    }
+}
+
+pub fn resize(
+    image: &[f32],
+    rows_in: usize,
+    cols_in: usize,
+    rows_out: usize,
+    cols_out: usize,
+) -> Vec<f32> {
+    let mut output = vec![0.0; rows_out * cols_out];
+
+    let mut j2 = 0;
+    for j in 0..cols_out {
+        if j2 >= cols_in {
+            j2 = j2 - cols_in;
+        }
+        let mut i2 = 0;
+        for i in 0..rows_out {
+            if i2 >= rows_in {
+                i2 = i2 - rows_in;
+            }
+            output[j * rows_out + i] = image[j2 * rows_in + i2];
+            i2 += 1;
+        }
+        j2 += 1;
+    }
+
+    output
+}
+
+fn image_parser<'a>(text: &'a str) -> nom::IResult<&'a str, Image> {
+    // First is the magic number "P2"
+    let text = nom::character::complete::char('P')(text)?.0;
+    let text = nom::character::complete::char('2')(text)?.0;
+    let text = nom::character::complete::multispace1(text)?.0;
+
+    // Next, we find the number of columns and rows
+    let (text, num_cols) = nom::character::complete::digit1(text)?;
+    let text = nom::character::complete::multispace1(text)?.0;
+    let (text, num_rows) = nom::character::complete::digit1(text)?;
+    let text = nom::character::complete::multispace1(text)?.0;
+
+    let cols = usize::from_str(num_cols).unwrap();
+    let rows = usize::from_str(num_rows).unwrap();
+    let mut image = vec![0.0; cols * rows];
+
+    // Next, the maximum value
+    let (text, num_max) = nom::character::complete::digit1(text)?;
+    let text = nom::character::complete::multispace1(text)?.0;
+
+    let max = f32::from_str(num_max).unwrap();
+
+    // Finally, we read each pixel value (storing them in column major order)
+    let mut text = text;
+    for i in 0..rows {
+        for j in 0..cols {
+            let (txt, pixel_val) = nom::character::complete::digit1(text)?;
+            text = nom::character::complete::multispace0(txt)?.0;
+
+            let val = f32::from_str(pixel_val).unwrap();
+            image[j * rows + i] = val;
+        }
+    }
+
+    Ok((
+        text,
+        Image {
+            image,
+            max,
+            rows,
+            cols,
+        },
+    ))
+}
--- a/juno_samples/rodinia/srad/src/main.rs
+++ b/juno_samples/rodinia/srad/src/main.rs
+#![feature(concat_idents)]
+mod graphics;
+mod rust_srad;
+
+use graphics::*;
+
+use clap::Parser;
+
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox, HerculesMutBoxTo};
+
+juno_build::juno!("srad");
+
+#[derive(Parser)]
+#[clap(author, version, about, long_about = None)]
+struct SRADInputs {
+    niter: usize,
+    lambda: f32,
+    nrows: usize,
+    ncols: usize,
+    image: String,
+    #[clap(short, long)]
+    output: Option<String>,
+    #[clap(short, long)]
+    verify: bool,
+    #[clap(long = "output-verify", value_name = "PATH")]
+    output_verify: Option<String>,
+}
+
+fn srad_harness(args: SRADInputs) {
+    async_std::task::block_on(async {
+        let SRADInputs {
+            niter,
+            lambda,
+            nrows,
+            ncols,
+            image,
+            output,
+            verify,
+            output_verify,
+        } = args;
+
+        let Image {
+            image: image_ori,
+            max,
+            rows: image_ori_rows,
+            cols: image_ori_cols,
+        } = read_graphics(image);
+        let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
+        let mut image_h = HerculesMutBox::from(image.clone());
+
+        let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>();
+        let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>();
+        let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>();
+        let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>();
+
+        // Fix boundary conditions
+        iN[0] = 0;
+        iS[nrows - 1] = (nrows - 1) as i32;
+        jW[0] = 0;
+        jE[ncols - 1] = (ncols - 1) as i32;
+
+        let iN_h = HerculesImmBox::from(iN.as_slice());
+        let iS_h = HerculesImmBox::from(iS.as_slice());
+        let jW_h = HerculesImmBox::from(jW.as_slice());
+        let jE_h = HerculesImmBox::from(jE.as_slice());
+
+        let mut runner = runner!(srad);
+        let result: Vec<f32> = HerculesMutBox::from(
+            runner
+                .run(
+                    nrows as u64,
+                    ncols as u64,
+                    niter as u64,
+                    image_h.to(),
+                    iN_h.to(),
+                    iS_h.to(),
+                    jW_h.to(),
+                    jE_h.to(),
+                    max,
+                    lambda,
+                )
+                .await,
+        )
+        .as_slice()
+        .to_vec();
+
+        if let Some(output) = output {
+            write_graphics(output, &result, nrows, ncols, max);
+        }
+
+        if verify {
+            let mut rust_result = image;
+            rust_srad::srad(
+                nrows,
+                ncols,
+                niter,
+                &mut rust_result,
+                &iN,
+                &iS,
+                &jW,
+                &jE,
+                max,
+                lambda,
+            );
+
+            if let Some(output) = output_verify {
+                write_graphics(output, &rust_result, nrows, ncols, max);
+            }
+
+            let max_diff = result
+                .iter()
+                .zip(rust_result.iter())
+                .map(|(a, b)| (*a as i32 - *b as i32).abs())
+                .max()
+                .unwrap_or(0);
+            assert!(
+                max_diff <= 1,
+                "Verification failed: maximum pixel difference of {} exceeds threshold of 1",
+                max_diff
+            );
+        }
+    })
+}
+
+fn main() {
+    let args = SRADInputs::parse();
+    srad_harness(args);
+}
+
+#[test]
+fn srad_test() {
+    srad_harness(SRADInputs {
+        niter: 100,
+        lambda: 0.5,
+        nrows: 502,
+        ncols: 458,
+        image: "data/image.pgm".to_string(),
+        output: None,
+        verify: true,
+        output_verify: None,
+    });
+}
--- a/juno_samples/rodinia/srad/src/rust_srad.rs
+++ b/juno_samples/rodinia/srad/src/rust_srad.rs
+pub fn srad(
+    nrows: usize,
+    ncols: usize,
+    niter: usize,
+    image: &mut Vec<f32>,
+    iN: &[i32],
+    iS: &[i32],
+    jW: &[i32],
+    jE: &[i32],
+    max: f32,
+    lambda: f32,
+) {
+    let nelems = nrows * ncols;
+
+    // EXTRACT
+    for i in 0..nelems {
+        image[i] = (image[i] / max).exp();
+    }
+
+    let mut dN = vec![0.0; nelems];
+    let mut dS = vec![0.0; nelems];
+    let mut dW = vec![0.0; nelems];
+    let mut dE = vec![0.0; nelems];
+
+    let mut c = vec![0.0; nelems];
+
+    for _ in 0..niter {
+        let mut sum = 0.0;
+        let mut sum2 = 0.0;
+
+        for i in 0..nrows {
+            for j in 0..ncols {
+                let tmp = image[i + nrows * j];
+                sum += tmp;
+                sum2 += tmp * tmp;
+            }
+        }
+
+        let meanROI = sum / nelems as f32;
+        let varROI = (sum2 / nelems as f32) - meanROI * meanROI;
+        let q0sqr = varROI / (meanROI * meanROI);
+
+        for j in 0..ncols {
+            for i in 0..nrows {
+                let k = i + nrows * j;
+                let Jc = image[k];
+
+                dN[k] = image[iN[i] as usize + nrows * j] - Jc;
+                dS[k] = image[iS[i] as usize + nrows * j] - Jc;
+                dW[k] = image[i + nrows * jW[j] as usize] - Jc;
+                dE[k] = image[i + nrows * jE[j] as usize] - Jc;
+
+                let G2 =
+                    (dN[k] * dN[k] + dS[k] * dS[k] + dW[k] * dW[k] + dE[k] * dE[k]) / (Jc * Jc);
+                let L = (dN[k] + dS[k] + dW[k] + dE[k]) / Jc;
+
+                let num = (0.5 * G2) - ((1.0 / 16.0) * (L * L));
+                let den = 1.0 + (0.25 * L);
+                let qsqr = num / (den * den);
+
+                let den = (qsqr - q0sqr) / (q0sqr * (1.0 + q0sqr));
+                c[k] = 1.0 / (1.0 + den);
+
+                if c[k] < 0.0 {
+                    c[k] = 0.0;
+                } else if c[k] > 1.0 {
+                    c[k] = 1.0;
+                }
+            }
+        }
+
+        for j in 0..ncols {
+            for i in 0..nrows {
+                let k = i + nrows * j;
+
+                let cN = c[k];
+                let cS = c[iS[i] as usize + nrows * j];
+                let cW = c[k];
+                let cE = c[i + nrows * jE[j] as usize];
+
+                let D = cN * dN[k] + cS * dS[k] + cW * dW[k] + cE * dE[k];
+
+                image[k] = image[k] + 0.25 * lambda * D;
+            }
+        }
+    }
+
+    // COMPRESS
+    for i in 0..nelems {
+        image[i] = image[i].ln() * 255.0;
+    }
+}
--- a/juno_samples/rodinia/srad/src/srad.jn
+++ b/juno_samples/rodinia/srad/src/srad.jn
--- a/juno_scheduler/src/compile.rs
+++ b/juno_scheduler/src/compile.rs
@@ -141,6 +141,9 @@ impl FromStr for Appliable {
            "reduce-slf" => Ok(Appliable::Pass(ir::Pass::ReduceSLF)),
            "rename" => Ok(Appliable::Pass(ir::Pass::Rename)),
            "reuse-products" => Ok(Appliable::Pass(ir::Pass::ReuseProducts)),
+            "rewrite" | "rewrite-math" | "rewrite-math-expressions" => {
+                Ok(Appliable::Pass(ir::Pass::RewriteMathExpressions))
+            }
            "simplify-cfg" => Ok(Appliable::Pass(ir::Pass::SimplifyCFG)),
            "slf" | "store-load-forward" => Ok(Appliable::Pass(ir::Pass::SLF)),
            "sroa" => Ok(Appliable::Pass(ir::Pass::SROA)),

--- a/juno_scheduler/src/default.rs
+++ b/juno_scheduler/src/default.rs
@@ -49,7 +49,6 @@ pub fn default_schedule() -> ScheduleStmt {
        DCE,
        Inline,
        DeleteUncalled,
-        InterproceduralSROA,
        SROA,
        PhiElim,
        DCE,
@@ -96,7 +95,6 @@ pub fn default_schedule() -> ScheduleStmt {
        GVN,
        DCE,
        AutoOutline,
-        InterproceduralSROA,
        SROA,
        ReuseProducts,
        SimplifyCFG,

--- a/juno_scheduler/src/ir.rs
+++ b/juno_scheduler/src/ir.rs
@@ -34,6 +34,7 @@ pub enum Pass {
    ReduceSLF,
    Rename,
    ReuseProducts,
+    RewriteMathExpressions,
    SLF,
    SROA,
    Serialize,
@@ -54,6 +55,7 @@ impl Pass {
            Pass::ForkInterchange => num == 2,
            Pass::Print => num == 1,
            Pass::Rename => num == 1,
+            Pass::SROA => num == 0 || num == 1,
            Pass::Xdot => num == 0 || num == 1,
            _ => num == 0,
        }
@@ -67,6 +69,7 @@ impl Pass {
            Pass::ForkInterchange => "2",
            Pass::Print => "1",
            Pass::Rename => "1",
+            Pass::SROA => "0 or 1",
            Pass::Xdot => "0 or 1",
            _ => "0",
        }

--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
No results found