Compare revisions

902f4dfc · 902f4dfc · 902f4dfc · 902f4dfc · 902f4dfc · 902f4dfc
--- a/juno_samples/rodinia/bfs/data/.gitignore
+++ b/juno_samples/rodinia/bfs/data/.gitignore
+!*.txt
--- a/juno_samples/rodinia/bfs/data/LICENSE
+++ b/juno_samples/rodinia/bfs/data/LICENSE
+LICENSE TERMS
+Copyright (c)2008-2011 University of Virginia
+All rights reserved.
+Redistribution and use in source and binary forms, with or without modification, are permitted without royalty fees or other restrictions, provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the name of the University of Virginia, the Dept. of Computer Science, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF VIRGINIA OR THE SOFTWARE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+If you use this software or a modified version of it, please cite the most relevant among the following papers:
+- M. A. Goodrum, M. J. Trotter, A. Aksel, S. T. Acton, and K. Skadron. Parallelization of Particle Filter Algorithms. In Proceedings 
+of the 3rd Workshop on Emerging Applications and Many-core Architecture (EAMA), in conjunction with the IEEE/ACM International 
+Symposium on Computer Architecture (ISCA), June 2010.
+- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, Sang-Ha Lee and K. Skadron.
+"Rodinia: A Benchmark Suite for Heterogeneous Computing". IEEE International Symposium
+on Workload Characterization, Oct 2009.
+- J. Meng and K. Skadron. "Performance Modeling and Automatic Ghost Zone Optimization
+for Iterative Stencil Loops on GPUs." In Proceedings of the 23rd Annual ACM International
+Conference on Supercomputing (ICS), June 2009.
+- L.G. Szafaryn, K. Skadron and J. Saucerman. "Experiences Accelerating MATLAB Systems
+Biology Applications." in Workshop on Biomedicine in Computing (BiC) at the International
+Symposium on Computer Architecture (ISCA), June 2009.
+- M. Boyer, D. Tarjan, S. T. Acton, and K. Skadron. "Accelerating Leukocyte Tracking using CUDA:
+A Case Study in Leveraging Manycore Coprocessors." In Proceedings of the International Parallel
+and Distributed Processing Symposium (IPDPS), May 2009.
+- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, and K. Skadron. "A Performance
+Study of General Purpose Applications on Graphics Processors using CUDA" Journal of
+Parallel and Distributed Computing, Elsevier, June 2008.
--- a/juno_samples/rodinia/bfs/data/README.md
+++ b/juno_samples/rodinia/bfs/data/README.md
+BFS Examples from Rodinia Benchmark Suite 3.1
+The data provided herein are governed by the [LICENSE](./LICENSE).
--- a/juno_samples/rodinia/bfs/data/graph4096.txt
+++ b/juno_samples/rodinia/bfs/data/graph4096.txt
--- a/juno_samples/rodinia/bfs/data/graph65536.txt
+++ b/juno_samples/rodinia/bfs/data/graph65536.txt
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
+// A graph node in adjacency-list form: its outgoing edges are the `num_edges`
+// consecutive entries of the edge array that start at index `edge_start`.
+type Node = struct { edge_start: u32; num_edges: u32; };
+// Breadth-first search from `source`, processed one frontier at a time.
+//
+// `graph_nodes[i]` names the range of `edges` that holds node i's neighbours.
+// Returns one cost per node: 0 for `source`, the cost of the node it was
+// discovered from plus one for every other reached node, and -1 for nodes that
+// were never reached.
+#[entry]
+fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] {
+  let stop = false;
+  // The mask selects the set of nodes that we consider in each iteration
+  // It includes only the nodes that were visited for the first time in the
+  // prior iteration (and in the first iteration just the source node)
+  // NOTE(review): `mask`, `visited` and `updated` are declared without an
+  // initializer; the algorithm relies on them starting out all-false — confirm
+  // against Juno's array initialization semantics.
+  let mask: bool[n];
+  mask[source as u64] = true;
+  let visited: bool[n];
+  visited[source as u64] = true;
+  // -1 marks "not reached yet"; only the source starts with a real cost.
+  let cost: i32[n];
+  for i in 0..n {
+    cost[i] = -1;
+  }
+  cost[source as u64] = 0;
+  // Nodes that were updated in the current iteration
+  let updated: bool[n];
+  // Repeat until an iteration discovers no new node.
+  while !stop {
+    stop = true;
+    // Phase 1: expand every node in the current frontier.
+    for i in 0..n {
+      if mask[i] {
+        mask[i] = false;
+        let edge_start = graph_nodes[i].edge_start as u64;
+        let num_edges = graph_nodes[i].num_edges as u64;
+        for edge in edge_start..edge_start + num_edges {
+          let id = edges[edge] as u64;
+          // `visited` is only written in phase 2, so a node discovered during
+          // this phase may have its cost written by several frontier nodes.
+          if !visited[id] {
+            cost[id] = cost[i] + 1;
+            updated[id] = true;
+          }
+        }
+      }
+    }
+    // Phase 2: the newly discovered nodes become the next frontier.
+    for i in 0..n {
+      if updated[i] {
+        mask[i] = true;
+        visited[i] = true;
+        stop = false;
+        updated[i] = false;
+      }
+    }
+  }
+  return cost;
+}
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
+// Schedule applied to bfs.jn for the GPU configuration.
+// NOTE(review): pass names look like the usual compiler abbreviations
+// (gvn = global value numbering, dce = dead code elimination, sroa = scalar
+// replacement of aggregates, gcm = global code motion) — confirm against the
+// scheduler documentation.
+
+// Initial clean-up over every function.
+gvn(*);
+phi-elim(*);
+dce(*);
+// Outline `bfs` and place the outlined function on the GPU.
+let outline = auto-outline(bfs);
+gpu(outline.bfs);
+// Break up aggregates, then clean up again.
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+// forkify is currently disabled for this sample.
+//forkify(*);
+infer-schedules(*);
+gcm(*);
+// Repeat float-collections / dce / gcm until nothing changes.
+fixpoint {
+  float-collections(*);
+  dce(*);
+  gcm(*);
+}
--- a/juno_samples/rodinia/bfs/src/graph_parser.rs
+++ b/juno_samples/rodinia/bfs/src/graph_parser.rs
+use std::fs::File;
+use std::io::Read;
+use std::str::FromStr;
+use nom::Parser;
+/// One node of the input graph, as stored in the graph file.
+///
+/// NOTE(review): `#[repr(C)]` is presumably required so a `&[Node]` can be
+/// handed to the compiled Juno code unchanged — confirm against `hercules_rt`.
+#[repr(C)]
+#[derive(Clone, Default)]
+pub struct Node {
+    /// Index of this node's first edge in the edge list.
+    pub edge_start: u32,
+    /// Number of edges this node has.
+    pub num_edges: u32,
+}
+/// Reads the graph file at `file` and parses it into `(nodes, source, edges)`.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened or read, or if its contents are not a
+/// complete, well-formed graph description.
+pub fn parse_graph(file: String) -> (Vec<Node>, u32, Vec<u32>) {
+    let mut text = String::new();
+    File::open(file)
+        .expect("Error opening input file")
+        .read_to_string(&mut text)
+        .expect("Error reading input file");
+    // `all_consuming` rejects any input left over after the graph itself.
+    let (_rest, graph) = nom::combinator::all_consuming(graph_parser)
+        .parse(text.as_str())
+        .expect("Parser error");
+    graph
+}
+/// Skips any leading whitespace and then parses one unsigned decimal integer.
+///
+/// A digit run that does not fit in a `u32` is reported as a parser failure
+/// instead of panicking.
+fn whitespace_then_u32(text: &str) -> nom::IResult<&str, u32> {
+    let text = nom::character::complete::multispace0(text)?.0;
+    let (rest, digits) = nom::character::complete::digit1(text)?;
+    match u32::from_str(digits) {
+        Ok(value) => Ok((rest, value)),
+        Err(_) => Err(nom::Err::Failure(nom::error::Error::new(
+            text,
+            nom::error::ErrorKind::Digit,
+        ))),
+    }
+}
+
+/// Parses a whitespace-separated graph description.
+///
+/// The expected layout is: the node count, then `edge_start num_edges` for each
+/// node, then the source node, then the edge count, then `id weight` for each
+/// edge. Returns the remaining input (after trailing whitespace) together with
+/// `(nodes, source, edges)`, where `edges` holds only the destination ids.
+fn graph_parser<'a>(text: &'a str) -> nom::IResult<&'a str, (Vec<Node>, u32, Vec<u32>)> {
+    // First, we find the number of nodes
+    let (mut text, num_nodes) = whitespace_then_u32(text)?;
+
+    // Then, for each node there are two numbers: the index of that node's first edge and the
+    // number of edges that node has
+    let mut nodes = vec![];
+    for _ in 0..num_nodes {
+        let (rest, edge_start) = whitespace_then_u32(text)?;
+        let (rest, num_edges) = whitespace_then_u32(rest)?;
+        nodes.push(Node {
+            edge_start,
+            num_edges,
+        });
+        text = rest;
+    }
+
+    // Next, we find the source node
+    let (text, source) = whitespace_then_u32(text)?;
+
+    // Next, the number of edges
+    let (mut text, num_edges) = whitespace_then_u32(text)?;
+
+    // Finally, for each edge there are two numbers: the id (i.e. what the edge goes to) and the
+    // weight which is ignored (weighted BFS can't be parallelized in the same way, it would
+    // require synchronization)
+    let mut edges = vec![];
+    for _ in 0..num_edges {
+        let (rest, id) = whitespace_then_u32(text)?;
+        // The weight is only skipped, never converted, so its magnitude is not checked.
+        let rest = nom::character::complete::multispace0(rest)?.0;
+        let (rest, _weight) = nom::character::complete::digit1(rest)?;
+        edges.push(id);
+        text = rest;
+    }
+
+    // Consume any remaining whitespace
+    let text = nom::character::complete::multispace0(text)?.0;
+    Ok((text, (nodes, source, edges)))
+}
--- a/juno_samples/rodinia/bfs/src/main.rs
+++ b/juno_samples/rodinia/bfs/src/main.rs
+#![feature(concat_idents)]
+mod graph_parser;
+mod rust_bfs;
+use graph_parser::*;
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
+use clap::Parser;
+juno_build::juno!("bfs");
+// Command-line arguments of the BFS sample.
+// Plain `//` comments are used on purpose: clap turns `///` doc comments into
+// help text, which would change the program's output.
+#[derive(Parser)]
+#[clap(author, version, about, long_about = None)]
+struct BFSInputs {
+    // Path of the graph file that is passed to `parse_graph`.
+    input: String,
+}
+/// Runs the Juno-compiled `bfs` entry point on the given graph and copies the
+/// resulting per-node costs into a `Vec`.
+fn run_bfs(nodes: &[Node], source: u32, edges: &[u32]) -> Vec<i32> {
+    // The node and edge counts are passed ahead of the data arguments.
+    let (num_nodes, num_edges) = (nodes.len() as u64, edges.len() as u64);
+
+    let node_box = HerculesImmBox::from(nodes);
+    let edge_box = HerculesImmBox::from(edges);
+
+    let mut bfs_runner = runner!(bfs);
+    let raw_costs = async_std::task::block_on(async {
+        bfs_runner
+            .run(num_nodes, num_edges, node_box.to(), source, edge_box.to())
+            .await
+    });
+
+    HerculesMutBox::from(raw_costs).as_slice().to_vec()
+}
+/// Parses the input graph, runs both the Juno and the reference Rust BFS, and
+/// asserts that they produce the same costs.
+fn bfs_harness(args: BFSInputs) {
+    let (nodes, source, edges) = parse_graph(args.input);
+
+    let juno_costs = run_bfs(&nodes, source, &edges);
+    let reference_costs = rust_bfs::bfs(&nodes, source, &edges);
+
+    assert_eq!(juno_costs, reference_costs);
+}
+/// Entry point: parse the command line and run the comparison harness.
+fn main() {
+    bfs_harness(BFSInputs::parse());
+}
+/// Checks Juno against the reference on the 4096-node sample graph.
+#[test]
+fn bfs_test_4096() {
+    let input = String::from("data/graph4096.txt");
+    bfs_harness(BFSInputs { input });
+}
+/// Checks Juno against the reference on the 65536-node sample graph.
+#[test]
+fn bfs_test_65536() {
+    let input = String::from("data/graph65536.txt");
+    bfs_harness(BFSInputs { input });
+}
--- a/juno_samples/rodinia/bfs/src/rust_bfs.rs
+++ b/juno_samples/rodinia/bfs/src/rust_bfs.rs
+use crate::graph_parser::Node;
+use std::collections::VecDeque;
+/// Reference breadth-first search used to check the Juno implementation.
+///
+/// `graph_nodes[i]` names the range of `edges` holding node `i`'s neighbours.
+/// Returns one cost per node: `0` for `source`, the number of edges on a
+/// shortest path for every other reachable node, and `-1` for unreachable ones.
+///
+/// # Panics
+///
+/// Panics if `source`, an edge range, or an edge destination is out of bounds.
+pub fn bfs(graph_nodes: &[Node], source: u32, edges: &[u32]) -> Vec<i32> {
+    let mut explored = vec![false; graph_nodes.len()];
+    let mut costs = vec![-1; graph_nodes.len()];
+    let mut worklist = VecDeque::new();
+
+    let source = source as usize;
+    explored[source] = true;
+    costs[source] = 0;
+    worklist.push_back(source);
+
+    while let Some(node) = worklist.pop_front() {
+        // Compute the edge range in `usize` so that `edge_start + num_edges`
+        // cannot overflow `u32`.
+        let first = graph_nodes[node].edge_start as usize;
+        let count = graph_nodes[node].num_edges as usize;
+        let next_cost = costs[node] + 1;
+
+        for &dst in &edges[first..first + count] {
+            let dst = dst as usize;
+            if !explored[dst] {
+                explored[dst] = true;
+                costs[dst] = next_cost;
+                worklist.push_back(dst);
+            }
+        }
+    }
+
+    costs
+}
--- a/juno_samples/rodinia/cfd/Cargo.toml
+++ b/juno_samples/rodinia/cfd/Cargo.toml
+[package]
+name = "juno_cfd"
+version = "0.1.0"
+authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
+edition = "2021"
+[[bin]]
+name = "juno_cfd"
+path = "src/main.rs"
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+[build-dependencies]
+juno_build = { path = "../../../juno_build" }
+[dependencies]
+juno_build = { path = "../../../juno_build" }
+hercules_rt = { path = "../../../hercules_rt" }
+async-std = "*"
+clap = { version = "*", features = ["derive"] }
+with_builtin_macros = "0.1.0"
+nom = "*"
--- a/juno_samples/rodinia/cfd/build.rs
+++ b/juno_samples/rodinia/cfd/build.rs
+use juno_build::JunoCompiler;
+/// Compiles `euler.jn` and `pre_euler.jn`, pairing each with its GPU schedule
+/// when the `cuda` feature is enabled and its CPU schedule otherwise.
+fn main() {
+    let (euler_schedule, pre_euler_schedule) = if cfg!(feature = "cuda") {
+        ("gpu_euler.sch", "gpu_pre_euler.sch")
+    } else {
+        ("cpu_euler.sch", "cpu_pre_euler.sch")
+    };
+
+    let jobs = [
+        ("euler.jn", euler_schedule),
+        ("pre_euler.jn", pre_euler_schedule),
+    ];
+    for (source, schedule) in jobs {
+        JunoCompiler::new()
+            .file_in_src(source)
+            .unwrap()
+            .schedule_in_src(schedule)
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+}
--- a/juno_samples/rodinia/cfd/data/.gitignore
+++ b/juno_samples/rodinia/cfd/data/.gitignore
+!*.txt
--- a/juno_samples/rodinia/cfd/data/LICENSE
+++ b/juno_samples/rodinia/cfd/data/LICENSE
+LICENSE TERMS
+Copyright (c)2008-2011 University of Virginia
+All rights reserved.
+Redistribution and use in source and binary forms, with or without modification, are permitted without royalty fees or other restrictions, provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the name of the University of Virginia, the Dept. of Computer Science, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF VIRGINIA OR THE SOFTWARE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+If you use this software or a modified version of it, please cite the most relevant among the following papers:
+- M. A. Goodrum, M. J. Trotter, A. Aksel, S. T. Acton, and K. Skadron. Parallelization of Particle Filter Algorithms. In Proceedings 
+of the 3rd Workshop on Emerging Applications and Many-core Architecture (EAMA), in conjunction with the IEEE/ACM International 
+Symposium on Computer Architecture (ISCA), June 2010.
+- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, Sang-Ha Lee and K. Skadron.
+"Rodinia: A Benchmark Suite for Heterogeneous Computing". IEEE International Symposium
+on Workload Characterization, Oct 2009.
+- J. Meng and K. Skadron. "Performance Modeling and Automatic Ghost Zone Optimization
+for Iterative Stencil Loops on GPUs." In Proceedings of the 23rd Annual ACM International
+Conference on Supercomputing (ICS), June 2009.
+- L.G. Szafaryn, K. Skadron and J. Saucerman. "Experiences Accelerating MATLAB Systems
+Biology Applications." in Workshop on Biomedicine in Computing (BiC) at the International
+Symposium on Computer Architecture (ISCA), June 2009.
+- M. Boyer, D. Tarjan, S. T. Acton, and K. Skadron. "Accelerating Leukocyte Tracking using CUDA:
+A Case Study in Leveraging Manycore Coprocessors." In Proceedings of the International Parallel
+and Distributed Processing Symposium (IPDPS), May 2009.
+- S. Che, M. Boyer, J. Meng, D. Tarjan, J. W. Sheaffer, and K. Skadron. "A Performance
+Study of General Purpose Applications on Graphics Processors using CUDA" Journal of
+Parallel and Distributed Computing, Elsevier, June 2008.
--- a/juno_samples/rodinia/cfd/data/README.md
+++ b/juno_samples/rodinia/cfd/data/README.md
+CFD Examples from Rodinia Benchmark Suite 3.1
+The data provided herein are governed by the [LICENSE](./LICENSE).
--- a/juno_samples/rodinia/cfd/data/fvcorr.domn.097K
+++ b/juno_samples/rodinia/cfd/data/fvcorr.domn.097K
--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
+// CPU schedule for the `euler` entry point.
+// NOTE(review): pass names look like the usual compiler abbreviations
+// (gvn = global value numbering, dce = dead code elimination, sroa = scalar
+// replacement of aggregates, gcm = global code motion); the meaning of `crc`
+// and `slf` is not evident from this file — confirm against the scheduler docs.
+
+// Clean-up passes, each followed by dce.
+gvn(*);
+dce(*);
+phi-elim(*);
+dce(*);
+crc(*);
+dce(*);
+slf(*);
+dce(*);
+// Outline `euler` and place the outlined function on the CPU.
+let auto = auto-outline(euler);
+cpu(auto.euler);
+// inline is applied twice — NOTE(review): presumably one round does not
+// flatten the nested helper calls; confirm.
+inline(auto.euler);
+inline(auto.euler);
+delete-uncalled(*);
+sroa[false](auto.euler);
+dce(*);
+float-collections(*);
+dce(*);
+gcm(*);
--- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+// CPU schedule for the `pre_euler` entry point.
+// NOTE(review): pass names look like the usual compiler abbreviations
+// (gvn = global value numbering, dce = dead code elimination, sroa = scalar
+// replacement of aggregates, gcm = global code motion); the meaning of `crc`
+// and `slf` is not evident from this file — confirm against the scheduler docs.
+
+// Clean-up passes, each followed by dce.
+gvn(*);
+dce(*);
+phi-elim(*);
+dce(*);
+crc(*);
+dce(*);
+slf(*);
+dce(*);
+// Outline `pre_euler` and place the outlined function on the CPU.
+let auto = auto-outline(pre_euler);
+cpu(auto.pre_euler);
+// inline is applied twice — NOTE(review): presumably one round does not
+// flatten the nested helper calls; confirm.
+inline(auto.pre_euler);
+inline(auto.pre_euler);
+delete-uncalled(*);
+sroa[false](auto.pre_euler);
+dce(*);
+float-collections(*);
+dce(*);
+gcm(*);
--- a/juno_samples/rodinia/cfd/src/euler.jn
+++ b/juno_samples/rodinia/cfd/src/euler.jn
+// Number of neighbour slots stored per element (first dimension of the
+// per-element neighbour and normal arrays).
+const NNB : usize = 4;
+// Face normals for all elements, one array per component, indexed [neighbour, element].
+type Normals<nelr: usize> = struct {
+  x: f32[NNB, nelr],
+  y: f32[NNB, nelr],
+  z: f32[NNB, nelr],
+};
+// Momentum of all elements, stored as one array per component.
+type Momentum<nelr: usize> = struct {
+  x: f32[nelr],
+  y: f32[nelr],
+  z: f32[nelr],
+};
+// Flow state of all `nelr` elements, stored as one array per quantity.
+type Variables<nelr: usize> = struct {
+  density: f32[nelr],
+  momentum: Momentum::<nelr>,
+  energy: f32[nelr],
+};
+// A single 3-component vector.
+type float3 = struct { x: f32, y: f32, z: f32 };
+// Flow state of a single element (scalar counterpart of `Variables`).
+type Variable = struct {
+  density: f32,
+  momentum: float3,
+  energy: f32,
+};
+// Velocity vector: each momentum component divided by the density.
+fn compute_velocity(density: f32, momentum: float3) -> float3 {
+  return float3 { x: momentum.x / density,
+                  y: momentum.y / density,
+                  z: momentum.z / density };
+}
+// Squared magnitude of the velocity vector.
+fn compute_speed_sqd(velocity: float3) -> f32 {
+  return velocity.x * velocity.x + velocity.y * velocity.y + velocity.z * velocity.z;
+}
+// Constant used by the pressure and speed-of-sound formulas below.
+// NOTE(review): presumably the ratio of specific heats of the gas — confirm.
+const GAMMA : f32 = 1.4;
+// Pressure: (GAMMA - 1) * (density_energy - 0.5 * density * speed_sqd).
+fn compute_pressure(density: f32, density_energy: f32, speed_sqd: f32) -> f32 {
+  return (GAMMA - 1.0) * (density_energy - 0.5 * density * speed_sqd);
+}
+// Speed of sound: sqrt(GAMMA * pressure / density).
+fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 {
+  return sqrt!(GAMMA * pressure / density);
+}
+// Computes one step factor per element from its current state and area:
+//   0.5 / (sqrt(area) * (speed + speed_of_sound))
+// where speed is the magnitude of the element's velocity.
+fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] {
+  let step_factors : f32[nelr];
+  for i in 0..nelr {
+    // Gather element i's state from the per-quantity arrays.
+    let density = variables.density[i];
+    let momentum : float3;
+    momentum.x = variables.momentum.x[i];
+    momentum.y = variables.momentum.y[i];
+    momentum.z = variables.momentum.z[i];
+    let density_energy = variables.energy[i];
+    let velocity       = compute_velocity(density, momentum);
+    let speed_sqd      = compute_speed_sqd(velocity);
+    let pressure       = compute_pressure(density, density_energy, speed_sqd);
+    let speed_of_sound = compute_speed_of_sound(density, pressure);
+    step_factors[i] = 0.5 / (sqrt!(areas[i]) * (sqrt!(speed_sqd) + speed_of_sound));
+  }
+  return step_factors;
+}
+// Computes the flux contributions of a single element state.
+//
+// Returns, in this order, the contributions for momentum x, momentum y,
+// momentum z and density energy, each as a 3-component vector.
+// `density` is accepted but not used by the body.
+fn compute_flux_contribution(
+  density: f32,
+  momentum: float3,
+  density_energy: f32,
+  pressure: f32,
+  velocity: float3,
+) -> (float3, float3, float3, float3) {
+  let fc_momentum_x = float3 { x: velocity.x * momentum.x + pressure,
+                               y: velocity.x * momentum.y,
+                               z: velocity.x * momentum.z };
+  // The off-diagonal entries are reused from the vectors already computed
+  // (y.x = x.y, z.x = x.z, z.y = y.z) instead of being recomputed.
+  let fc_momentum_y = float3 { x: fc_momentum_x.y,
+                               y: velocity.y * momentum.y + pressure,
+                               z: velocity.y * momentum.z };
+  let fc_momentum_z = float3 { x: fc_momentum_x.z,
+                               y: fc_momentum_y.z,
+                               z: velocity.z * momentum.z + pressure };
+  let de_p = density_energy + pressure;
+  let fc_density_energy = float3 { x: velocity.x * de_p,
+                                   y: velocity.y * de_p,
+                                   z: velocity.z * de_p };
+  return (fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy);
+}
+// Computes the flux of every element by accumulating contributions from its
+// NNB neighbour slots.
+//
+// `elements_surrounding_elements[j, i]` selects how slot j of element i is
+// treated: a value >= 0 is the index of a neighbouring element, -1 is a wing
+// boundary and -2 is a far field boundary (which uses the `ff_*` arguments).
+// Any other value contributes nothing. Returns the accumulated fluxes in the
+// same layout as `variables`.
+fn compute_flux<nelr: usize>(
+  variables: Variables::<nelr>,
+  elements_surrounding_elements: i32[NNB, nelr],
+  normals: Normals::<nelr>,
+  ff_variable: Variable,
+  ff_flux_contribution_density_energy: float3,
+  ff_flux_contribution_momentum_x: float3,
+  ff_flux_contribution_momentum_y: float3,
+  ff_flux_contribution_momentum_z: float3,
+) -> Variables::<nelr> {
+  const smoothing_coefficient : f32 = 0.2;
+  let fluxes: Variables::<nelr>;
+  for i in 0..nelr {
+    // State and derived quantities of element i itself.
+    let density_i = variables.density[i];
+    let momentum_i = float3 { x: variables.momentum.x[i],
+                              y: variables.momentum.y[i],
+                              z: variables.momentum.z[i] };
+    let density_energy_i = variables.energy[i];
+    let velocity_i       = compute_velocity(density_i, momentum_i);
+    let speed_sqd_i      = compute_speed_sqd(velocity_i);
+    let speed_i          = sqrt!(speed_sqd_i);
+    let pressure_i       = compute_pressure(density_i, density_energy_i, speed_sqd_i);
+    let speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
+    let (flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
+         flux_contribution_i_momentum_z, flux_contribution_i_density_energy)
+      = compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i);
+    // Accumulators for element i, all starting at zero.
+    let flux_i_density : f32 = 0.0;
+    let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 };
+    let flux_i_density_energy : f32 = 0.0;
+    for j in 0..NNB {
+      let nb = elements_surrounding_elements[j, i];
+      let normal = float3 {
+        x: normals.x[j, i],
+        y: normals.y[j, i],
+        z: normals.z[j, i],
+      };
+      let normal_len = sqrt!(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);
+      if nb >= 0 { // a legitimate neighbor
+        let nb = nb as usize;
+        let density_nb = variables.density[nb];
+        let momentum_nb = float3 {
+          x: variables.momentum.x[nb],
+          y: variables.momentum.y[nb],
+          z: variables.momentum.z[nb],
+        };
+        let density_energy_nb = variables.energy[nb];
+        let velocity_nb       = compute_velocity(density_nb, momentum_nb);
+        let speed_sqd_nb      = compute_speed_sqd(velocity_nb);
+        let pressure_nb       = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
+        let speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
+        let (flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
+             flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy)
+          = compute_flux_contribution(density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb);
+        // artificial viscosity
+        let factor = -normal_len * smoothing_coefficient * 0.5
+                   * (speed_i + sqrt!(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb);
+        flux_i_density += factor * (density_i - density_nb);
+        flux_i_density_energy += factor * (density_energy_i - density_energy_nb);
+        flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x);
+        flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y);
+        flux_i_momentum.z += factor * (momentum_i.z - momentum_nb.z);
+        // accumulate cell-centered fluxes
+        let factor = 0.5 * normal.x;
+        flux_i_density += factor * (momentum_nb.x + momentum_i.x);
+        flux_i_density_energy += factor * (flux_contribution_nb_density_energy.x + flux_contribution_i_density_energy.x);
+        flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.x + flux_contribution_i_momentum_x.x);
+        flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.x + flux_contribution_i_momentum_y.x);
+        flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.x + flux_contribution_i_momentum_z.x);
+        let factor = 0.5 * normal.y;
+        flux_i_density += factor * (momentum_nb.y + momentum_i.y);
+        flux_i_density_energy += factor * (flux_contribution_nb_density_energy.y + flux_contribution_i_density_energy.y);
+        flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.y + flux_contribution_i_momentum_x.y);
+        flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.y + flux_contribution_i_momentum_y.y);
+        flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.y + flux_contribution_i_momentum_z.y);
+        let factor = 0.5 * normal.z;
+        flux_i_density += factor * (momentum_nb.z + momentum_i.z);
+        flux_i_density_energy += factor * (flux_contribution_nb_density_energy.z + flux_contribution_i_density_energy.z);
+        flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.z + flux_contribution_i_momentum_x.z);
+        flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.z + flux_contribution_i_momentum_y.z);
+        flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.z + flux_contribution_i_momentum_z.z);
+      } else if nb == -1 { // a wing boundary
+        flux_i_momentum.x += normal.x * pressure_i;
+        flux_i_momentum.y += normal.y * pressure_i;
+        flux_i_momentum.z += normal.z * pressure_i;
+      } else if nb == -2 { // a far field boundary
+        let factor = 0.5 * normal.x;
+        flux_i_density += factor * (ff_variable.momentum.x + momentum_i.x);
+        flux_i_density_energy += factor * (ff_flux_contribution_density_energy.x + flux_contribution_i_density_energy.x);
+        flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x.x + flux_contribution_i_momentum_x.x);
+        flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y.x + flux_contribution_i_momentum_y.x);
+        flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z.x + flux_contribution_i_momentum_z.x);
+        let factor = 0.5 * normal.y;
+        flux_i_density += factor * (ff_variable.momentum.y + momentum_i.y);
+        flux_i_density_energy += factor * (ff_flux_contribution_density_energy.y + flux_contribution_i_density_energy.y);
+        flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x.y + flux_contribution_i_momentum_x.y);
+        flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y.y + flux_contribution_i_momentum_y.y);
+        flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z.y + flux_contribution_i_momentum_z.y);
+        let factor = 0.5 * normal.z;
+        // The z-normal term pairs the far field z momentum with the element's
+        // z momentum, mirroring the x and y terms above.
+        flux_i_density += factor * (ff_variable.momentum.z + momentum_i.z);
+        flux_i_density_energy += factor * (ff_flux_contribution_density_energy.z + flux_contribution_i_density_energy.z);
+        flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x.z + flux_contribution_i_momentum_x.z);
+        flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y.z + flux_contribution_i_momentum_y.z);
+        flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z.z + flux_contribution_i_momentum_z.z);
+      }
+    }
+    // Store the accumulated fluxes of element i.
+    fluxes.density[i] = flux_i_density;
+    fluxes.momentum.x[i] = flux_i_momentum.x;
+    fluxes.momentum.y[i] = flux_i_momentum.y;
+    fluxes.momentum.z[i] = flux_i_momentum.z;
+    fluxes.energy[i] = flux_i_density_energy;
+  }
+  return fluxes;
+}
+// Number of sub-steps performed per iteration; also used in the sub-step
+// scaling factor in `time_step`.
+const RK : usize = 3;
+// Performs sub-step `j`: every quantity of every element becomes
+//   old_value + (step_factors[i] / (RK + 1 - j)) * flux
+// The result is written into `variables`, which is then returned.
+fn time_step<nelr: usize>(
+  j: usize,
+  old_variables: Variables::<nelr>,
+  variables: Variables::<nelr>,
+  step_factors: f32[nelr],
+  fluxes: Variables::<nelr>,
+) -> Variables::<nelr> {
+  for i in 0..nelr {
+    let factor = step_factors[i] / (RK + 1 - j) as f32;
+    variables.density[i]    = old_variables.density[i]    + factor * fluxes.density[i];
+    variables.momentum.x[i] = old_variables.momentum.x[i] + factor * fluxes.momentum.x[i];
+    variables.momentum.y[i] = old_variables.momentum.y[i] + factor * fluxes.momentum.y[i];
+    variables.momentum.z[i] = old_variables.momentum.z[i] + factor * fluxes.momentum.z[i];
+    variables.energy[i]     = old_variables.energy[i]     + factor * fluxes.energy[i];
+  }
+  return variables;
+}
+// Returns an element-by-element copy of `variables` in a fresh `Variables` value.
+fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> {
+  let result : Variables::<nelr>;
+  for i in 0..nelr {
+    result.density[i] = variables.density[i];
+    result.momentum.x[i] = variables.momentum.x[i];
+    result.momentum.y[i] = variables.momentum.y[i];
+    result.momentum.z[i] = variables.momentum.z[i];
+    result.energy[i] = variables.energy[i];
+  }
+  return result;
+}
+// Entry point: advances `variables` by `iterations` iterations and returns the
+// final state.
+//
+// Each iteration snapshots the current state, computes the per-element step
+// factors once, and then performs RK sub-steps; every sub-step recomputes the
+// fluxes from the current state and applies them relative to the snapshot.
+#[entry]
+fn euler<nelr: usize>(
+  iterations: usize,
+  variables: Variables::<nelr>,
+  areas: f32[nelr],
+  elements_surrounding_elements: i32[NNB, nelr],
+  normals: Normals::<nelr>,
+  ff_variable: Variable,
+  ff_flux_contribution_density_energy: float3,
+  ff_flux_contribution_momentum_x: float3,
+  ff_flux_contribution_momentum_y: float3,
+  ff_flux_contribution_momentum_z: float3,
+) -> Variables::<nelr> {
+  for i in 0..iterations {
+    // Snapshot of the state at the start of this iteration.
+    let old_variables = copy_vars::<nelr>(variables);
+    let step_factors = compute_step_factor::<nelr>(variables, areas);
+    for j in 0..RK {
+      let fluxes = compute_flux::<nelr>(variables, elements_surrounding_elements,
+                                        normals, ff_variable, ff_flux_contribution_density_energy,
+                                        ff_flux_contribution_momentum_x,
+                                        ff_flux_contribution_momentum_y,
+                                        ff_flux_contribution_momentum_z);
+      variables = time_step::<nelr>(j, old_variables, variables, step_factors, fluxes);
+    }
+  }
+  return variables;
+}
--- a/juno_samples/rodinia/cfd/src/gpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/gpu_euler.sch
+// GPU schedule for the `euler` entry point.
+// NOTE(review): pass names look like the usual compiler abbreviations
+// (gvn = global value numbering, dce = dead code elimination, sroa = scalar
+// replacement of aggregates, gcm = global code motion); the meaning of `crc`
+// and `slf` is not evident from this file — confirm against the scheduler docs.
+
+// Clean-up passes, each followed by dce.
+gvn(*);
+dce(*);
+phi-elim(*);
+dce(*);
+crc(*);
+dce(*);
+slf(*);
+dce(*);
+// Outline `euler` and place the outlined function on the GPU.
+let auto = auto-outline(euler);
+gpu(auto.euler);
+// inline is applied twice — NOTE(review): presumably one round does not
+// flatten the nested helper calls; confirm.
+inline(auto.euler);
+inline(auto.euler);
+delete-uncalled(*);
+sroa[false](auto.euler);
+dce(*);
+float-collections(*);
+dce(*);
+gcm(*);
No results found