Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
// General cleanup pipeline applied to the selection X.
// Runs gvn, phi-elim, dce; then ip-sroa, sroa, dce; then gvn, phi-elim, dce
// again. The order of passes is significant, so do not reorder them.
// NOTE(review): the second gvn/phi-elim/dce round presumably cleans up what
// the sroa passes expose - confirm against the pass documentation.
macro optimize!(X) {
gvn(X);
phi-elim(X);
dce(X);
ip-sroa(X);
sroa(X);
dce(X);
gvn(X);
phi-elim(X);
dce(X);
}
// Last pipeline run on the selection X in both branches of the schedule
// below (code generation preparation, per the macro name).
// Runs optimize!, then gcm, float-collections, dce, and gcm a second time.
macro codegen-prep!(X) {
optimize!(X);
gcm(X);
float-collections(X);
dce(X);
// gcm is intentionally invoked again after float-collections and dce.
gcm(X);
}
// Applies forkify followed by fork-guard-elim to the selection X inside a
// fixpoint block.
// NOTE(review): presumably the block repeats until neither pass makes
// further changes - confirm the exact fixpoint semantics.
macro forkify!(X) {
fixpoint {
forkify(X);
fork-guard-elim(X);
}
}
// Tiles the forks in selection X with tile size n by invoking the built-in
// fork-tile pass. The remaining pass arguments (0, false, true) are fixed.
// NOTE(review): they look like dimension index, guard flag and tile order -
// confirm against the fork-tile pass documentation.
// The macro takes a bracketed parameter list before the selection, mirroring
// how built-in passes take theirs: fork-tile![n](X).
macro fork-tile![n](X) {
  fork-tile[n, 0, false, true](X);
}

// Marks the selection X as parallel by running parallel-fork and then
// parallel-reduce on it.
macro parallelize!(X) {
  parallel-fork(X);
  parallel-reduce(X);
}

// Runs fork-split and then unforkify on the selection X.
macro unforkify!(X) {
  fork-split(X);
  unforkify(X);
}

// ---- Schedule ----
// Both targets start from the same cleanup and forkification of everything.
optimize!(*);
forkify!(*);

if feature("cuda") {
  // CUDA build: iterate reduce-slf, slf and infer-schedules in a fixpoint
  // block, coalesce forks, then simplify the CFG before the final
  // optimize/codegen-prep.
  fixpoint {
    reduce-slf(*);
    slf(*);
    infer-schedules(*);
  }
  fork-coalesce(*);
  infer-schedules(*);
  dce(*);
  fixpoint {
    simplify-cfg(*);
    dce(*);
  }
  optimize!(*);
  codegen-prep!(*);
} else {
  // CPU build.
  associative(matmul@outer);

  // Parallelize by computing output array as 16 chunks
  // NOTE(review): tile size 4 is reconstructed from the "16 chunks" comment
  // (4 x 4 tiles over the two outer forks) - confirm.
  let par = matmul@outer \ matmul@inner;
  fork-tile![4](par);
  let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
  parallelize!(outer \ inner);

  // Outline the per-chunk work into its own function and schedule it on the CPU.
  let body = outline(inner);
  cpu(body);

  // Tile for cache, assuming 64B cache lines
  // NOTE(review): tile size 16 assumes 4-byte elements (16 x 4 B = 64 B) -
  // confirm against the element type of the matmul kernel.
  fork-tile![16](body);
  let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);

  reduce-slf(inner);
  unforkify!(body);
  codegen-prep!(*);
}