diff --git a/Cargo.lock b/Cargo.lock
index 61cde7f161b7c4177cb781addeb2f484af3b7477..acc9783042434d0569ed2fe7f1f577a43fc597d2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -23,12 +23,6 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1"
 
-[[package]]
-name = "allocator-api2"
-version = "0.2.21"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
-
 [[package]]
 name = "anes"
 version = "0.1.6"
@@ -684,27 +678,6 @@ version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
 
-[[package]]
-name = "egg"
-version = "0.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abb749745461743bb477fba3ef87c663d5965876155c676c9489cfe0963de5ab"
-dependencies = [
- "env_logger",
- "hashbrown",
- "indexmap",
- "log",
- "num-bigint",
- "num-traits",
- "quanta",
- "rustc-hash",
- "saturating",
- "smallvec",
- "symbol_table",
- "symbolic_expressions",
- "thiserror",
-]
-
 [[package]]
 name = "either"
 version = "1.13.0"
@@ -723,15 +696,6 @@ version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d"
 
-[[package]]
-name = "env_logger"
-version = "0.9.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7"
-dependencies = [
- "log",
-]
-
 [[package]]
 name = "equivalent"
 version = "1.0.2"
@@ -845,12 +809,6 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
-[[package]]
-name = "foldhash"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
-
 [[package]]
 name = "funty"
 version = "2.0.0"
@@ -975,11 +933,6 @@ name = "hashbrown"
 version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
-dependencies = [
- "allocator-api2",
- "equivalent",
- "foldhash",
-]
 
 [[package]]
 name = "heapless"
@@ -1047,7 +1000,6 @@ version = "0.1.0"
 dependencies = [
  "bimap",
  "bitvec",
- "egg",
  "either",
  "hercules_cg",
  "hercules_ir",
@@ -2117,21 +2069,6 @@ dependencies = [
  "bytemuck",
 ]
 
-[[package]]
-name = "quanta"
-version = "0.12.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
-dependencies = [
- "crossbeam-utils",
- "libc",
- "once_cell",
- "raw-cpuid",
- "wasi 0.11.0+wasi-snapshot-preview1",
- "web-sys",
- "winapi",
-]
-
 [[package]]
 name = "quick-error"
 version = "2.0.1"
@@ -2266,15 +2203,6 @@ dependencies = [
  "rgb",
 ]
 
-[[package]]
-name = "raw-cpuid"
-version = "11.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "529468c1335c1c03919960dfefdb1b3648858c20d7ec2d0663e728e4a717efbc"
-dependencies = [
- "bitflags 2.8.0",
-]
-
 [[package]]
 name = "rayon"
 version = "1.10.0"
@@ -2339,12 +2267,6 @@ version = "0.8.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a"
 
-[[package]]
-name = "rustc-hash"
-version = "2.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
-
 [[package]]
 name = "rustc_version"
 version = "0.4.1"
@@ -2388,12 +2310,6 @@ dependencies = [
  "winapi-util",
 ]
 
-[[package]]
-name = "saturating"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ece8e78b2f38ec51c51f5d475df0a7187ba5111b2a28bdc761ee05b075d40a71"
-
 [[package]]
 name = "scopeguard"
 version = "1.2.0"
@@ -2537,23 +2453,6 @@ version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
-[[package]]
-name = "symbol_table"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f19bffd69fb182e684d14e3c71d04c0ef33d1641ac0b9e81c712c734e83703bc"
-dependencies = [
- "crossbeam-utils",
- "foldhash",
- "hashbrown",
-]
-
-[[package]]
-name = "symbolic_expressions"
-version = "5.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c68d531d83ec6c531150584c42a4290911964d5f0d79132b193b67252a23b71"
-
 [[package]]
 name = "syn"
 version = "1.0.109"
@@ -2938,22 +2837,6 @@ version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082"
 
-[[package]]
-name = "winapi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
-dependencies = [
- "winapi-i686-pc-windows-gnu",
- "winapi-x86_64-pc-windows-gnu",
-]
-
-[[package]]
-name = "winapi-i686-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-
 [[package]]
 name = "winapi-util"
 version = "0.1.9"
@@ -2963,12 +2846,6 @@ dependencies = [
  "windows-sys",
 ]
 
-[[package]]
-name = "winapi-x86_64-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
-
 [[package]]
 name = "windows"
 version = "0.59.0"
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 1e6067a360c6a57674efd43444abd300d74075b4..4069cb02bcf1ed0a93f5c40df432253e842ead9a 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -354,7 +354,6 @@ impl GPUContext<'_> {
         write!(
             w,
             "
-#define _CG_ABI_EXPERIMENTAL
 #include <assert.h>
 #include <stdio.h>
 #include <stddef.h>
@@ -362,8 +361,23 @@ impl GPUContext<'_> {
 #include <cuda_runtime.h>
 #include <math_constants.h>
 #include <mma.h>
+
+#if (CUDA_VERSION >= 12000)
+#else
+#define _CG_ABI_EXPERIMENTAL
+#endif
+
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+
+#if (CUDA_VERSION >= 12000)
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups;
+#else
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups::experimental;
+#endif
+
 #include <cuda_bf16.h>
 namespace cg = cooperative_groups;
 
@@ -562,9 +576,15 @@ namespace cg = cooperative_groups;
      * and writes.
      */
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
-        write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?;
+        write!(
+            w,
+            "\t__shared__ cge::block_tile_memory<1024> block_sync_shared;\n"
+        )?;
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
-        write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?;
+        write!(
+            w,
+            "\tcg::thread_block block = cge::this_thread_block(block_sync_shared);\n"
+        )?;
         Ok(())
     }
 
@@ -1709,24 +1729,25 @@ namespace cg = cooperative_groups;
                     };
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         use_thread_per_id, cg_tile, use_thread_per_id
                     )?;
                     let cg_tile_use = self.get_cg_tile(id, CGType::Use);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         use_thread_quota, cg_tile_use, use_thread_quota
                     )?;
                     let available_thread_quota = available_thread_quota.unwrap();
                     let cg_tile_available = self.get_cg_tile(id, CGType::Available);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
                     if parallel_factor.is_none() {
-                        write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?;
+                        write!(thread_block_tiles, "\t{};\n", self.get_fork_iter(id, true))?;
+                        write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, false))?;
                         write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?;
                     }
                 }
diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml
index 92e0533938eab0cc76fe4b69a205f4dceae2f244..892e4c1539f34a91087472b052f015f542a351a6 100644
--- a/hercules_opt/Cargo.toml
+++ b/hercules_opt/Cargo.toml
@@ -22,4 +22,4 @@ hercules_cg = { path = "../hercules_cg" }
 hercules_ir = { path = "../hercules_ir" }
 nestify = "*"
 bimap = "*"
-egg = "*"
+#egg = "*"
diff --git a/hercules_opt/src/rewrite_math_expressions.rs b/hercules_opt/src/rewrite_math_expressions.rs
index 6f52dc58ee32751b96190b42a0add01d42ce3b1e..55e695ba75704395c5fde371449fad940310ad43 100644
--- a/hercules_opt/src/rewrite_math_expressions.rs
+++ b/hercules_opt/src/rewrite_math_expressions.rs
@@ -3,10 +3,22 @@ use std::fmt::{Error, Write};
 
 use hercules_ir::*;
 
-use egg::*;
+//use egg::*;
 
 use crate::*;
 
+pub fn rewrite_math_expressions(
+    editor: &mut FunctionEditor,
+    device: Device,
+    typing: &Vec<TypeID>,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>,
+    reduce_einsums: &(MathEnv, HashMap<NodeID, MathID>),
+) {
+    panic!("PANIC: The rewrite math expressions pass is currently disabled, as including egg increases compile times and we're not using it currently.");
+}
+
+/*
 define_language! {
     enum MathLanguage {
         "zero" = Zero,
@@ -164,3 +176,4 @@ fn egg_print_math_expr<W: Write>(id: MathID, env: &MathEnv, w: &mut W) -> Result
         _ => Err(Error::default()),
     }
 }
+*/
diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch
index 306997f58eb217f9ce301dc18e418c412e6df621..6867576e4a3a81f327d23eca6dccbcd6a6183eb4 100644
--- a/juno_samples/matmul/src/matmul.sch
+++ b/juno_samples/matmul/src/matmul.sch
@@ -51,7 +51,9 @@ if feature("cuda") {
   fork-coalesce(*);
   infer-schedules(*);
   dce(*);
-  rewrite(*);
+  //rewrite(*);
+  let out = outline(matmul@outer);
+  gpu(out);
   fixpoint {
     simplify-cfg(*);
     dce(*);
diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch
index f8cc84a3206c97889e0151c86cc2549bd0b65c9b..1773f713ee3cdd223f5e2fa5c88ef2d4a837f2dc 100644
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
@@ -33,10 +33,11 @@ fixpoint {
 reduce-slf(*);
 simpl!(*);
 
-fork-extend[32](layer_forward@inner_loop);
+fork-extend[32768](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 simpl!(layer_forward);
-fork-tile[32, 0, false, true](layer_forward@inner_loop);
+fork-tile[32768, 0, false, true](layer_forward@inner_loop);
+fork-tile[1024, 1, false, true](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 let out = fork-split(layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
index f87b4e09bc2c3707b1436ad21b2f8f8407033e52..ea8ba0ed3d8ac9c80b1a2b34529d70ac745868e2 100644
--- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs
+++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
@@ -13,41 +13,22 @@ fn bfs_bench(c: &mut Criterion) {
 
     let mut r = runner!(bfs);
 
-    group.bench_function("bfs bench 4096", |b| {
-        let input = "data/graph4096.txt";
-        let (nodes, source, edges) = parse_graph(input.into()).unwrap();
-        let n = nodes.len() as u64;
-        let m = edges.len() as u64;
-        let nodes = HerculesImmBox::from(&nodes as &[Node]);
-        let edges = HerculesImmBox::from(&edges as &[u32]);
-        b.iter(|| {
-            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
-        })
-    });
-
-    group.bench_function("bfs bench 65536", |b| {
-        let input = "data/graph65536.txt";
-        let (nodes, source, edges) = parse_graph(input.into()).unwrap();
-        let n = nodes.len() as u64;
-        let m = edges.len() as u64;
-        let nodes = HerculesImmBox::from(&nodes as &[Node]);
-        let edges = HerculesImmBox::from(&edges as &[u32]);
-        b.iter(|| {
-            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
-        })
-    });
-
-    group.bench_function("bfs bench 64M", |b| {
-        let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt";
+    let mut bench = |name, input: &'_ str| {
         let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines.");
         let n = nodes.len() as u64;
         let m = edges.len() as u64;
         let nodes = HerculesImmBox::from(&nodes as &[Node]);
         let edges = HerculesImmBox::from(&edges as &[u32]);
-        b.iter(|| {
-            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
-        })
-    });
+        group.bench_function(name, |b| {
+            b.iter(|| {
+                async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
+            })
+        });
+    };
+
+    bench("bfs bench 4096", "data/graph4096.txt");
+    bench("bfs bench 65536", "data/graph65536.txt");
+    bench("bfs bench 64M", "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt");
 }
 
 criterion_group!(benches, bfs_bench);
diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn
index d6ec25f26b997205d4dcdf116bce462fa228fc88..75190300e4d8be463176c4c346d89715cece0ced 100644
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
@@ -24,7 +24,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
   @cost_init for i in 0..n {
     cost[i] = -1;
   }
-  cost[source as u64] = 0;
+  @cost_init cost[source as u64] = 0;
 
   // Nodes that were updated in the current iteration
   let updated: bool[n];
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index e51005613f9d14a3e16938b1a77d55aec96714bd..339782d6a84b6a43f31d81f5c9b23240dd10a292 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -46,16 +46,14 @@ let traverse_body = outline(inner);
 let (outer, inner) = fork-reshape[[1], [0]](collect);
 let collect_body = outline(inner);
 
-let init_body = init;
-// Following code seems to generate breaking RT code
-//fork-tile[32, 0, false, true](init);
-//let (outer, inner) = fork-reshape[[1], [0]](init);
-//let init_body = outline(inner);
-//inline(bfs@cost_init);
+fork-tile[32, 0, false, true](init);
+let (outer, inner) = fork-reshape[[1], [0]](init);
+let init_body = outline(inner);
 
 inline(bfs@loop1, bfs@loop2);
 delete-uncalled(*);
 const-inline(*);
+simpl!(*);
 
 unforkify(init_body, traverse_body, collect_body);
 simpl!(*);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 4e5c1f74e34cedacd48ddf6c33b631a08079b995..0253a0210f6cc2451d38399601ad39ff3ab9465a 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -38,8 +38,12 @@ fixpoint {
 }
 simpl!(collect);
 
+fork-tile[32, 0, false, true](init);
+let (outer, inner) = fork-reshape[[1], [0]](init);
+let init_body = outline(inner);
+
 fork-tile[1024, 0, false, true](traverse, collect);
 fork-split(traverse, collect);
 
-unforkify(init);
+unforkify(init_body);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
index aa6d772724238d13b6464398b07b67e287716085..5b4db044c3f66fc630cc434794e2a6c3de684682 100644
--- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs
+++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
@@ -13,38 +13,38 @@ fn cfd_bench(c: &mut Criterion) {
     group.sample_size(10);
 
     let mut euler_bench = |name, data_file, iterations| {
-        group.bench_function(name, |b| {
-            let mut r = runner!(euler);
-            let block_size = 16;
-            let FarFieldConditions {
-                ff_variable,
-                ff_fc_momentum_x,
-                ff_fc_momentum_y,
-                ff_fc_momentum_z,
-                ff_fc_density_energy,
-            } = set_far_field_conditions();
-            let GeometryData {
-                nelr,
-                areas,
-                elements_surrounding_elements,
-                normals,
-            } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
-            let mut variables = initialize_variables(nelr, &ff_variable);
+        let mut r = runner!(euler);
+        let block_size = 16;
+        let FarFieldConditions {
+            ff_variable,
+            ff_fc_momentum_x,
+            ff_fc_momentum_y,
+            ff_fc_momentum_z,
+            ff_fc_density_energy,
+        } = set_far_field_conditions();
+        let GeometryData {
+            nelr,
+            areas,
+            elements_surrounding_elements,
+            normals,
+        } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
+        let mut variables = initialize_variables(nelr, &ff_variable);
 
-            let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
-            let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
-            let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
-            let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
-            let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+        let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+        let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+        let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+        let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+        let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
 
-            let areas = HerculesImmBox::from(areas.as_slice());
-            let elements_surrounding_elements =
-                HerculesImmBox::from(elements_surrounding_elements.as_slice());
+        let areas = HerculesImmBox::from(areas.as_slice());
+        let elements_surrounding_elements =
+            HerculesImmBox::from(elements_surrounding_elements.as_slice());
 
-            let normals_x = HerculesImmBox::from(normals.x.as_slice());
-            let normals_y = HerculesImmBox::from(normals.y.as_slice());
-            let normals_z = HerculesImmBox::from(normals.z.as_slice());
+        let normals_x = HerculesImmBox::from(normals.x.as_slice());
+        let normals_y = HerculesImmBox::from(normals.y.as_slice());
+        let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
+        group.bench_function(name, |b| {
             b.iter(|| {
                 async_std::task::block_on(async {
                     r.run(
@@ -91,38 +91,38 @@ fn cfd_bench(c: &mut Criterion) {
     );
 
     let mut pre_euler_bench = |name, data_file, iterations| {
-        group.bench_function(name, |b| {
-            let mut r = runner!(pre_euler);
-            let block_size = 16;
-            let FarFieldConditions {
-                ff_variable,
-                ff_fc_momentum_x,
-                ff_fc_momentum_y,
-                ff_fc_momentum_z,
-                ff_fc_density_energy,
-            } = set_far_field_conditions();
-            let GeometryData {
-                nelr,
-                areas,
-                elements_surrounding_elements,
-                normals,
-            } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
-            let mut variables = initialize_variables(nelr, &ff_variable);
+        let mut r = runner!(pre_euler);
+        let block_size = 16;
+        let FarFieldConditions {
+            ff_variable,
+            ff_fc_momentum_x,
+            ff_fc_momentum_y,
+            ff_fc_momentum_z,
+            ff_fc_density_energy,
+        } = set_far_field_conditions();
+        let GeometryData {
+            nelr,
+            areas,
+            elements_surrounding_elements,
+            normals,
+        } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
+        let mut variables = initialize_variables(nelr, &ff_variable);
 
-            let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
-            let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
-            let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
-            let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
-            let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+        let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+        let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+        let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+        let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+        let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
 
-            let areas = HerculesImmBox::from(areas.as_slice());
-            let elements_surrounding_elements =
-                HerculesImmBox::from(elements_surrounding_elements.as_slice());
+        let areas = HerculesImmBox::from(areas.as_slice());
+        let elements_surrounding_elements =
+            HerculesImmBox::from(elements_surrounding_elements.as_slice());
 
-            let normals_x = HerculesImmBox::from(normals.x.as_slice());
-            let normals_y = HerculesImmBox::from(normals.y.as_slice());
-            let normals_z = HerculesImmBox::from(normals.z.as_slice());
+        let normals_x = HerculesImmBox::from(normals.x.as_slice());
+        let normals_y = HerculesImmBox::from(normals.y.as_slice());
+        let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
+        group.bench_function(name, |b| {
             b.iter(|| {
                 async_std::task::block_on(async {
                     r.run(