// simpl!(X): the standard cleanup pipeline, applied to the selection X.
// Used repeatedly below between the larger structural transformations to
// re-normalize the IR before the next pass group runs.
// NOTE(review): pass names suggest conventional optimizations — ccp
// (conditional constant propagation), CFG simplification, global value
// numbering, phi elimination, dead-code elimination — confirm each against
// the scheduler's pass documentation before relying on these readings.
macro simpl!(X) {
  ccp(X);
  simplify-cfg(X);
  lift-dc-math(X);
  gvn(X);
  phi-elim(X);
  dce(X);
  // Recompute schedule annotations after the IR has been simplified.
  infer-schedules(X);
}

// --- Phase 1: global cleanup, inlining, and fork discovery ---------------

// Initial cleanup over everything, then inline layer_forward so later
// sections can outline and transform its individual call sites separately.
simpl!(*);
inline(layer_forward);
delete-uncalled(*);

// Suppress memset generation for these result selections.
// NOTE(review): presumably these outputs are fully written by their loops,
// making zero-initialization redundant — confirm before changing.
no-memset(layer_forward@res, output_error@res, hidden_error@res);
lift-dc-math(*);
loop-bound-canon(*);
simpl!(*);
lift-dc-math(*);
slf(*);
// Iterate fork discovery to a fixed point: convert loops to forks, drop
// redundant guards, and merge adjacent forks until nothing changes.
fixpoint {
  forkify(*);
  fork-guard-elim(*);
  fork-coalesce(*);
}
reduce-slf(*);
simpl!(*);
// Swap fork dimensions 0 and 1 in adjust_weights.
// NOTE(review): likely chosen so the parallel dimension is outermost for
// the fork-tile/reshape steps below — confirm against the pass docs.
fork-interchange[0, 1](adjust_weights);
simpl!(*);

infer-schedules(*);

// --- Phase 2: parallelize the forward pass -------------------------------

// The first call to layer_forward can be parallelized by 16 (the size of the
// hidden layer) and the second can't be parallelized at all (the size of the
// output layer is 1)
// Inline both call sites, then outline each into its own named function so
// they can be transformed independently of one another.
inline(backprop@forward_input, backprop@forward_hidden);
let forward_input = outline(backprop@forward_input);
let forward_hidden = outline(backprop@forward_hidden);

// Only parallelize when the "seq" (sequential) feature is disabled.
if !feature("seq") {
  // Tile the parallelizable loop nest by 16, then reshape so dimension 1 is
  // outer and dimension 0 is inner; only the inner portion is kept as the
  // new forward_input body, which is then re-inlined at its call site.
  // NOTE(review): the selection "outer_loop \ inner_loop" appears twice —
  // if `let` bindings can hold selections, factoring it out may be possible;
  // left as-is since selection-binding semantics are unconfirmed.
  fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop);
  let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop);
  forward_input = outline(inner);
  inline(backprop@forward_input);
}

// --- Phase 3: parallelize the weight-adjustment pass ---------------------
// Mirrors Phase 2's inline/outline/tile/reshape structure, applied to the
// two adjust_weights call sites instead of layer_forward's.

// The first call to adjust_weights has total loop dimensions of 1 * 17, so not
// worth parallelizing (given that the body is trivial)
// The second call to adjust_weights has a total dimension of 16 * (input + 1)
// which is worth parallelizing, we'll do it by 16
inline(backprop@adjust_hidden, backprop@adjust_input);
let adjust_hidden = outline(backprop@adjust_hidden);
let adjust_input = outline(backprop@adjust_input);

// Only parallelize when the "seq" (sequential) feature is disabled.
if !feature("seq") {
  // Tile adjust_input by 16, reshape with dimension 1 outer and dimensions
  // [0, 2] inner, keep the inner portion as the new adjust_input body, and
  // re-inline it at its call site.
  fork-tile[16, 0, false, true](adjust_input);
  let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input);
  adjust_input = outline(inner);
  inline(backprop@adjust_input);
}

// --- Phase 4: final cleanup and lowering ---------------------------------

// Drop functions left unreferenced by the outline/inline shuffling above,
// then propagate constants across the remaining call graph.
delete-uncalled(*);
const-inline(*);

simpl!(*);
fork-split(*);
// Lower the listed functions' forks back to sequential loops — these are
// the pieces deemed not worth parallelizing in the sections above (the
// tiled inner bodies of forward_input/adjust_input were outlined away and
// are not in this list).
unforkify(output_error, hidden_error, adjust_hidden, adjust_input, forward_hidden, forward_input);
simpl!(*);

// Global code motion as the final placement step.
gcm(*);