// Schedule for the backprop benchmark (Juno/Hercules scheduler DSL).
// NOTE(review): pass ordering below is semantically significant — do not reorder.

// Standard cleanup pipeline applied between structural transformations:
// constant propagation, CFG simplification, dataflow-math lifting, value
// numbering, phi elimination, dead-code elimination, schedule inference.
macro simpl!(X) {
  ccp(X);
  simplify-cfg(X);
  lift-dc-math(X);
  gvn(X);
  phi-elim(X);
  dce(X);
  infer-schedules(X);
}

simpl!(*);

// Inline layer_forward into its callers, then remove functions left uncalled.
inline(layer_forward);
delete-uncalled(*);
// presumably suppresses memset lowering for these result arrays — TODO confirm
no-memset(layer_forward@res, output_error@res, hidden_error@res);
lift-dc-math(*);
loop-bound-canon(*);
simpl!(*);
lift-dc-math(*);
slf(*);

// Convert loops into forks and clean them up, repeating to a fixed point.
fixpoint {
  forkify(*);
  fork-guard-elim(*);
  fork-coalesce(*);
}
reduce-slf(*);
simpl!(*);
// Interchange fork dimensions 0 and 1 of adjust_weights.
fork-interchange[0, 1](adjust_weights);
simpl!(*);
infer-schedules(*);

// The first call to layer_forward can be parallelized by 16 (the size of the
// hidden layer) and the second can't be parallelized at all (the size of the
// output layer is 1)
inline(backprop@forward_input, backprop@forward_hidden);
let forward_input = outline(backprop@forward_input);
let forward_hidden = outline(backprop@forward_hidden);
if !feature("seq") {
  // Tile the parallelizable call by 16, then split it into an outer fork
  // (kept inlined in backprop) and an outlined inner body.
  fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop);
  let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop);
  forward_input = outline(inner);
  inline(backprop@forward_input);
}

// The first call to adjust_weights has total loop dimensions of 1 * 17, so not
// worth parallelizing (given that the body is trivial)
// The second call to adjust_weights has a total dimension of 16 * (input + 1)
// which is worth parallelizing, we'll do it by 16
inline(backprop@adjust_hidden, backprop@adjust_input);
let adjust_hidden = outline(backprop@adjust_hidden);
let adjust_input = outline(backprop@adjust_input);
if !feature("seq") {
  // Same tile-by-16 / reshape / re-outline treatment as forward_input above.
  fork-tile[16, 0, false, true](adjust_input);
  let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input);
  adjust_input = outline(inner);
  inline(backprop@adjust_input);
}

// Final cleanup: drop dead functions, inline constants, simplify, split forks,
// then serialize the remaining unprofitable forks back into sequential loops.
delete-uncalled(*);
const-inline(*);
simpl!(*);
fork-split(*);
unforkify(output_error, hidden_error, adjust_hidden, adjust_input, forward_hidden, forward_input);
simpl!(*);
gcm(*);