Skip to content
Snippets Groups Projects
Commit 1eebcaa3 authored by rarbore2
Browse files

More optimization

parent 0717de36
No related branches found
No related tags found
1 merge request: !212 "More optimization"
Showing with 120 additions and 99 deletions
...@@ -435,10 +435,9 @@ pub fn debug_print_math_expr(id: MathID, env: &MathEnv) { ...@@ -435,10 +435,9 @@ pub fn debug_print_math_expr(id: MathID, env: &MathEnv) {
} }
MathExpr::IntrinsicFunc(intrinsic, ref args) => { MathExpr::IntrinsicFunc(intrinsic, ref args) => {
print!("{}(", intrinsic.lower_case_name()); print!("{}(", intrinsic.lower_case_name());
debug_print_math_expr(id, env);
for arg in args { for arg in args {
print!(", ");
debug_print_math_expr(*arg, env); debug_print_math_expr(*arg, env);
print!(", ");
} }
print!(")"); print!(")");
} }
......
...@@ -822,7 +822,7 @@ fn typeflow( ...@@ -822,7 +822,7 @@ fn typeflow(
// We also return the return type from here // We also return the return type from here
match intrinsic { match intrinsic {
// Intrinsics that take any numeric type and return the same // Intrinsics that take any numeric type and return the same
Intrinsic::Abs => { Intrinsic::Abs | Intrinsic::Max | Intrinsic::Min => {
if let Concrete(id) = inputs[0] { if let Concrete(id) = inputs[0] {
if types[id.idx()].is_arithmetic() { if types[id.idx()].is_arithmetic() {
Concrete(*id) Concrete(*id)
...@@ -856,8 +856,6 @@ fn typeflow( ...@@ -856,8 +856,6 @@ fn typeflow(
| Intrinsic::Ln1P | Intrinsic::Ln1P
| Intrinsic::Log10 | Intrinsic::Log10
| Intrinsic::Log2 | Intrinsic::Log2
| Intrinsic::Max
| Intrinsic::Min
| Intrinsic::Round | Intrinsic::Round
| Intrinsic::Sin | Intrinsic::Sin
| Intrinsic::Sinh | Intrinsic::Sinh
......
...@@ -3,7 +3,7 @@ gvn(*); ...@@ -3,7 +3,7 @@ gvn(*);
phi-elim(*); phi-elim(*);
dce(*); dce(*);
let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9); let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10);
cpu(auto.test1); cpu(auto.test1);
cpu(auto.test2); cpu(auto.test2);
cpu(auto.test3); cpu(auto.test3);
...@@ -12,6 +12,7 @@ cpu(auto.test5); ...@@ -12,6 +12,7 @@ cpu(auto.test5);
cpu(auto.test7); cpu(auto.test7);
cpu(auto.test8); cpu(auto.test8);
cpu(auto.test9); cpu(auto.test9);
cpu(auto.test10);
let test1_cpu = auto.test1; let test1_cpu = auto.test1;
rename["test1_cpu"](test1_cpu); rename["test1_cpu"](test1_cpu);
...@@ -94,6 +95,11 @@ dce(auto.test8); ...@@ -94,6 +95,11 @@ dce(auto.test8);
simplify-cfg(auto.test8); simplify-cfg(auto.test8);
dce(auto.test8); dce(auto.test8);
no-memset(test9@const); array-slf(auto.test10);
ccp(auto.test10);
dce(auto.test10);
simplify-cfg(auto.test10);
dce(auto.test10);
unforkify(auto.test10);
gcm(*); gcm(*);
...@@ -147,3 +147,16 @@ fn test9<r, c : usize>(input : i32[r, c]) -> i32[r, c] { ...@@ -147,3 +147,16 @@ fn test9<r, c : usize>(input : i32[r, c]) -> i32[r, c] {
return out; return out;
} }
// Entry point for the tenth test: a two-stage indexed gather followed by a sum.
// Stage one builds a temporary `s` with s[i] = v[k1[i]]; stage two adds up
// s[k2[i]], so the returned value is the sum over i in 0..8 of v[k1[k2[i]]].
// NOTE(review): entries of k1 and k2 are cast to u64 and used as indices, so
// they are presumably expected to lie in 0..8 — confirm against callers.
#[entry]
fn test10(k1 : i32[8], k2 : i32[8], v : i32[8]) -> i32 {
// Temporary array; the `const` label is what the schedules refer to as
// `test10@const`.
@const let s : i32[8];
// Gather: pick elements of `v` at the positions listed in `k1`.
for i = 0 to 8 {
s[i] = v[k1[i] as u64];
}
let sum = 0;
// Reduce: accumulate elements of `s` at the positions listed in `k2`.
for i = 0 to 8 {
sum += s[k2[i] as u64];
}
return sum;
}
\ No newline at end of file
...@@ -8,12 +8,13 @@ no-memset(test6@const); ...@@ -8,12 +8,13 @@ no-memset(test6@const);
no-memset(test8@const1); no-memset(test8@const1);
no-memset(test8@const2); no-memset(test8@const2);
no-memset(test9@const); no-memset(test9@const);
no-memset(test10@const);
gvn(*); gvn(*);
phi-elim(*); phi-elim(*);
dce(*); dce(*);
let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9); let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10);
gpu(auto.test1); gpu(auto.test1);
gpu(auto.test2); gpu(auto.test2);
gpu(auto.test3); gpu(auto.test3);
...@@ -22,6 +23,7 @@ gpu(auto.test5); ...@@ -22,6 +23,7 @@ gpu(auto.test5);
gpu(auto.test7); gpu(auto.test7);
gpu(auto.test8); gpu(auto.test8);
gpu(auto.test9); gpu(auto.test9);
gpu(auto.test10);
ip-sroa(*); ip-sroa(*);
sroa(*); sroa(*);
......
...@@ -74,6 +74,20 @@ fn main() { ...@@ -74,6 +74,20 @@ fn main() {
5 + 6 + 8 + 9, 5 + 6 + 8 + 9,
]; ];
assert(&correct, output); assert(&correct, output);
let mut r = runner!(test10);
let k1 = vec![0, 4, 3, 7, 3, 4, 2, 1];
let k2 = vec![6, 4, 3, 2, 4, 1, 0, 5];
let v = vec![3, -499, 4, 32, -2, 55, -74, 10];
let mut correct = 0;
for i in 0..8 {
correct += v[k1[k2[i] as usize] as usize];
}
let k1 = HerculesImmBox::from(&k1 as &[i32]);
let k2 = HerculesImmBox::from(&k2 as &[i32]);
let v = HerculesImmBox::from(&v as &[i32]);
let output = r.run(k1.to(), k2.to(), v.to()).await;
assert_eq!(output, correct);
}); });
} }
......
...@@ -24,19 +24,6 @@ fn srad_bench(c: &mut Criterion) { ...@@ -24,19 +24,6 @@ fn srad_bench(c: &mut Criterion) {
} = read_graphics(image); } = read_graphics(image);
let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
let mut image_h = HerculesMutBox::from(image.clone()); let mut image_h = HerculesMutBox::from(image.clone());
let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>();
let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>();
let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>();
let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>();
// Fix boundary conditions
iN[0] = 0;
iS[nrows - 1] = (nrows - 1) as i32;
jW[0] = 0;
jE[ncols - 1] = (ncols - 1) as i32;
let iN_h = HerculesImmBox::from(iN.as_slice());
let iS_h = HerculesImmBox::from(iS.as_slice());
let jW_h = HerculesImmBox::from(jW.as_slice());
let jE_h = HerculesImmBox::from(jE.as_slice());
group.bench_function("srad bench", |b| { group.bench_function("srad bench", |b| {
b.iter(|| { b.iter(|| {
async_std::task::block_on(async { async_std::task::block_on(async {
...@@ -45,10 +32,6 @@ fn srad_bench(c: &mut Criterion) { ...@@ -45,10 +32,6 @@ fn srad_bench(c: &mut Criterion) {
ncols as u64, ncols as u64,
niter as u64, niter as u64,
image_h.to(), image_h.to(),
iN_h.to(),
iS_h.to(),
jW_h.to(),
jE_h.to(),
max, max,
lambda, lambda,
) )
......
...@@ -8,6 +8,7 @@ macro simpl!(X) { ...@@ -8,6 +8,7 @@ macro simpl!(X) {
infer-schedules(X); infer-schedules(X);
} }
no-memset(srad@scratch);
phi-elim(*); phi-elim(*);
let loop1 = outline(srad@loop1); let loop1 = outline(srad@loop1);
let loop2 = outline(srad@loop2); let loop2 = outline(srad@loop2);
...@@ -31,8 +32,18 @@ simpl!(*); ...@@ -31,8 +32,18 @@ simpl!(*);
fork-interchange[0, 1](loop1); fork-interchange[0, 1](loop1);
reduce-slf(*); reduce-slf(*);
simpl!(*); simpl!(*);
slf(*);
simpl!(*);
fork-tile[32, 0, false, false](loop2);
let split = fork-split(loop2);
let loop2_body = outline(split.srad_1.fj1);
simpl!(loop2, loop2_body);
inline(srad@loop2);
delete-uncalled(*);
fork-split(*); fork-split(extract, compress, loop1, loop2_body, loop3);
unforkify(*); unforkify(extract, compress, loop1, loop2_body, loop3);
gcm(*); gcm(*);
...@@ -8,6 +8,7 @@ macro simpl!(X) { ...@@ -8,6 +8,7 @@ macro simpl!(X) {
infer-schedules(X); infer-schedules(X);
} }
no-memset(srad@scratch);
phi-elim(*); phi-elim(*);
let sum_loop = outline(srad@loop1); let sum_loop = outline(srad@loop1);
let main_loops = outline(srad@loop2 | srad@loop3); let main_loops = outline(srad@loop2 | srad@loop3);
...@@ -41,15 +42,26 @@ fork-tile[32, 0, false, true](sum_loop); ...@@ -41,15 +42,26 @@ fork-tile[32, 0, false, true](sum_loop);
let out = fork-split(sum_loop); let out = fork-split(sum_loop);
clean-monoid-reduces(sum_loop); clean-monoid-reduces(sum_loop);
simpl!(sum_loop); simpl!(sum_loop);
let fission = fork-fission[out.srad_0.fj0](sum_loop);
let fission1 = fork-fission[out.srad_0.fj0](sum_loop);
simpl!(sum_loop);
fork-tile[32, 0, false, true](fission1.srad_0.fj_bottom);
let out = fork-split(fission1.srad_0.fj_bottom);
clean-monoid-reduces(sum_loop);
simpl!(sum_loop);
let fission2 = fork-fission[out.srad_0.fj0](sum_loop);
simpl!(sum_loop); simpl!(sum_loop);
fork-tile[32, 0, false, true](fission.srad_0.fj_bottom); fork-tile[32, 0, false, true](fission2.srad_0.fj_bottom);
let out = fork-split(fission.srad_0.fj_bottom); let out = fork-split(fission2.srad_0.fj_bottom);
clean-monoid-reduces(sum_loop); clean-monoid-reduces(sum_loop);
simpl!(sum_loop); simpl!(sum_loop);
let top = outline(fission.srad_0.fj_top);
let bottom = outline(out.srad_0.fj0); let first = outline(fission1.srad_0.fj_top);
gpu(top, bottom); let second = outline(fission2.srad_0.fj_top);
let third = outline(out.srad_0.fj0);
gpu(first, second, third);
const-inline[false](*);
ip-sroa(*); ip-sroa(*);
sroa(*); sroa(*);
simpl!(*); simpl!(*);
...@@ -60,4 +72,16 @@ dce(main_loops); ...@@ -60,4 +72,16 @@ dce(main_loops);
fork-split(main_loops); fork-split(main_loops);
simpl!(main_loops); simpl!(main_loops);
fork-dim-merge(extract);
fork-tile[32, 0, false, true](extract);
dce(extract);
fork-split(extract);
simpl!(extract);
fork-dim-merge(compress);
fork-tile[32, 0, false, true](compress);
dce(compress);
fork-split(compress);
simpl!(compress);
gcm(*); gcm(*);
...@@ -48,22 +48,6 @@ pub fn srad_harness(args: SRADInputs) { ...@@ -48,22 +48,6 @@ pub fn srad_harness(args: SRADInputs) {
let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
let mut image_h = HerculesMutBox::from(image.clone()); let mut image_h = HerculesMutBox::from(image.clone());
let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>();
let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>();
let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>();
let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>();
// Fix boundary conditions
iN[0] = 0;
iS[nrows - 1] = (nrows - 1) as i32;
jW[0] = 0;
jE[ncols - 1] = (ncols - 1) as i32;
let iN_h = HerculesImmBox::from(iN.as_slice());
let iS_h = HerculesImmBox::from(iS.as_slice());
let jW_h = HerculesImmBox::from(jW.as_slice());
let jE_h = HerculesImmBox::from(jE.as_slice());
let mut runner = runner!(srad); let mut runner = runner!(srad);
let result: Vec<f32> = HerculesMutBox::from( let result: Vec<f32> = HerculesMutBox::from(
runner runner
...@@ -72,10 +56,6 @@ pub fn srad_harness(args: SRADInputs) { ...@@ -72,10 +56,6 @@ pub fn srad_harness(args: SRADInputs) {
ncols as u64, ncols as u64,
niter as u64, niter as u64,
image_h.to(), image_h.to(),
iN_h.to(),
iS_h.to(),
jW_h.to(),
jE_h.to(),
max, max,
lambda, lambda,
) )
...@@ -90,18 +70,7 @@ pub fn srad_harness(args: SRADInputs) { ...@@ -90,18 +70,7 @@ pub fn srad_harness(args: SRADInputs) {
if verify { if verify {
let mut rust_result = image; let mut rust_result = image;
rust_srad::srad( rust_srad::srad(nrows, ncols, niter, &mut rust_result, max, lambda);
nrows,
ncols,
niter,
&mut rust_result,
&iN,
&iS,
&jW,
&jE,
max,
lambda,
);
if let Some(output) = output_verify { if let Some(output) = output_verify {
write_graphics(output, &rust_result, nrows, ncols, max); write_graphics(output, &rust_result, nrows, ncols, max);
......
pub fn srad( pub fn srad(nrows: usize, ncols: usize, niter: usize, image: &mut Vec<f32>, max: f32, lambda: f32) {
nrows: usize,
ncols: usize,
niter: usize,
image: &mut Vec<f32>,
iN: &[i32],
iS: &[i32],
jW: &[i32],
jE: &[i32],
max: f32,
lambda: f32,
) {
let nelems = nrows * ncols; let nelems = nrows * ncols;
// EXTRACT // EXTRACT
...@@ -44,11 +33,15 @@ pub fn srad( ...@@ -44,11 +33,15 @@ pub fn srad(
for i in 0..nrows { for i in 0..nrows {
let k = i + nrows * j; let k = i + nrows * j;
let Jc = image[k]; let Jc = image[k];
let iN = std::cmp::max(i, 1) - 1;
let iS = std::cmp::min(i, nrows - 2) + 1;
let jW = std::cmp::max(j, 1) - 1;
let jE = std::cmp::min(j, ncols - 2) + 1;
dN[k] = image[iN[i] as usize + nrows * j] - Jc; dN[k] = image[iN as usize + nrows * j] - Jc;
dS[k] = image[iS[i] as usize + nrows * j] - Jc; dS[k] = image[iS as usize + nrows * j] - Jc;
dW[k] = image[i + nrows * jW[j] as usize] - Jc; dW[k] = image[i + nrows * jW as usize] - Jc;
dE[k] = image[i + nrows * jE[j] as usize] - Jc; dE[k] = image[i + nrows * jE as usize] - Jc;
let G2 = let G2 =
(dN[k] * dN[k] + dS[k] * dS[k] + dW[k] * dW[k] + dE[k] * dE[k]) / (Jc * Jc); (dN[k] * dN[k] + dS[k] * dS[k] + dW[k] * dW[k] + dE[k] * dE[k]) / (Jc * Jc);
...@@ -72,11 +65,13 @@ pub fn srad( ...@@ -72,11 +65,13 @@ pub fn srad(
for j in 0..ncols { for j in 0..ncols {
for i in 0..nrows { for i in 0..nrows {
let k = i + nrows * j; let k = i + nrows * j;
let iS = std::cmp::min(i, nrows - 2) + 1;
let jE = std::cmp::min(j, ncols - 2) + 1;
let cN = c[k]; let cN = c[k];
let cS = c[iS[i] as usize + nrows * j]; let cS = c[iS as usize + nrows * j];
let cW = c[k]; let cW = c[k];
let cE = c[i + nrows * jE[j] as usize]; let cE = c[i + nrows * jE as usize];
let D = cN * dN[k] + cS * dS[k] + cW * dW[k] + cE * dE[k]; let D = cN * dN[k] + cS * dS[k] + cW * dW[k] + cE * dE[k];
......
...@@ -21,10 +21,6 @@ fn compress<nrows, ncols: usize>(inout image: f32[ncols, nrows], max: f32) { ...@@ -21,10 +21,6 @@ fn compress<nrows, ncols: usize>(inout image: f32[ncols, nrows], max: f32) {
fn srad<nrows, ncols: usize>( fn srad<nrows, ncols: usize>(
niter: usize, niter: usize,
inout image: f32[ncols, nrows], inout image: f32[ncols, nrows],
iN: i32[nrows],
iS: i32[nrows],
jW: i32[ncols],
jE: i32[ncols],
max: f32, max: f32,
lambda: f32, lambda: f32,
) { ) {
...@@ -50,20 +46,25 @@ fn srad<nrows, ncols: usize>( ...@@ -50,20 +46,25 @@ fn srad<nrows, ncols: usize>(
let varROI = (sum2 / nelems as f32) - meanROI * meanROI; let varROI = (sum2 / nelems as f32) - meanROI * meanROI;
let q0sqr = varROI / (meanROI * meanROI); let q0sqr = varROI / (meanROI * meanROI);
@dirs let dN : f32[ncols, nrows]; @scratch let dN : f32[ncols, nrows];
@dirs let dS : f32[ncols, nrows]; @scratch let dS : f32[ncols, nrows];
@dirs let dE : f32[ncols, nrows]; @scratch let dE : f32[ncols, nrows];
@dirs let dW : f32[ncols, nrows]; @scratch let dW : f32[ncols, nrows];
let c : f32[ncols, nrows]; @scratch let c : f32[ncols, nrows];
@loop2 for j in 0..ncols { @loop2 for j in 0..ncols {
for i in 0..nrows { for i in 0..nrows {
let Jc = image[j, i]; let Jc = image[j, i];
dN[j, i] = image[j, iN[i] as u64] - Jc; let iN = max!(i, 1) - 1;
dS[j, i] = image[j, iS[i] as u64] - Jc; let iS = min!(i, nrows - 2) + 1;
dW[j, i] = image[jW[j] as u64, i] - Jc; let jW = max!(j, 1) - 1;
dE[j, i] = image[jE[j] as u64, i] - Jc; let jE = min!(j, ncols - 2) + 1;
dN[j, i] = image[j, iN as u64] - Jc;
dS[j, i] = image[j, iS as u64] - Jc;
dW[j, i] = image[jW as u64, i] - Jc;
dE[j, i] = image[jE as u64, i] - Jc;
let G2 = (dN[j, i] * dN[j, i] + dS[j, i] * dS[j, i] let G2 = (dN[j, i] * dN[j, i] + dS[j, i] * dS[j, i]
+ dW[j, i] * dW[j, i] + dE[j, i] * dE[j, i]) / (Jc * Jc); + dW[j, i] * dW[j, i] + dE[j, i] * dE[j, i]) / (Jc * Jc);
...@@ -85,10 +86,13 @@ fn srad<nrows, ncols: usize>( ...@@ -85,10 +86,13 @@ fn srad<nrows, ncols: usize>(
@loop3 for j in 0..ncols { @loop3 for j in 0..ncols {
for i in 0..nrows { for i in 0..nrows {
let iS = min!(i, nrows - 2) + 1;
let jE = min!(j, ncols - 2) + 1;
let cN = c[j, i]; let cN = c[j, i];
let cS = c[j, iS[i] as u64]; let cS = c[j, iS as u64];
let cW = c[j, i]; let cW = c[j, i];
let cE = c[jE[j] as u64, i]; let cE = c[jE as u64, i];
let D = cN * dN[j, i] + cS * dS[j, i] + cW * dW[j, i] + cE * dE[j, i]; let D = cN * dN[j, i] + cS * dS[j, i] + cW * dW[j, i] + cE * dE[j, i];
image[j, i] = image[j, i] + 0.25 * lambda * D; image[j, i] = image[j, i] + 0.25 * lambda * D;
......
...@@ -1357,7 +1357,10 @@ fn interp_expr( ...@@ -1357,7 +1357,10 @@ fn interp_expr(
} }
} }
Value::Record { fields } => match fields.get(field) { Value::Record { fields } => match fields.get(field) {
None => Err(SchedulerError::UndefinedField(field.clone())), None => Err(SchedulerError::UndefinedField(format!(
"{} not in {:?}",
field, fields
))),
Some(v) => Ok((v.clone(), changed)), Some(v) => Ok((v.clone(), changed)),
}, },
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment