Skip to content
Snippets Groups Projects
matmul.sch 1.28 KiB
Newer Older
  • Learn to ignore specific revisions
  • Aaron Councilman's avatar
    Aaron Councilman committed
    // optimize!(X): general cleanup pipeline applied to the selection X.
    // Runs gvn -> phi-elim -> dce, then ip-sroa and sroa followed by dce,
    // and finally gvn -> phi-elim -> dce a second time.
    // NOTE(review): the second gvn/phi-elim/dce round presumably cleans up
    // whatever the sroa passes expose — confirm against the pass docs.
    macro optimize!(X) {
      gvn(X);
      phi-elim(X);
      dce(X);
      ip-sroa(X);
      sroa(X);
      dce(X);
      gvn(X);
      phi-elim(X);
      dce(X);
    }
    
    // codegen-prep!(X): final pipeline run on X before code generation.
    // Expands optimize!(X) first, then runs gcm, float-collections and dce,
    // and runs gcm once more at the end.
    // NOTE(review): gcm is presumably repeated because float-collections/dce
    // change the IR after the first placement — confirm.
    macro codegen-prep!(X) {
      optimize!(X);
      gcm(X);
      float-collections(X);
      dce(X);
      gcm(X);
    }
    
    // forkify!(X): runs forkify followed by fork-guard-elim on X inside a
    // fixpoint block, so the pair is repeated rather than applied once.
    // NOTE(review): assumes `fixpoint` iterates until the passes stop
    // changing the IR — confirm against the scheduler semantics.
    macro forkify!(X) {
      fixpoint {
        forkify(X);
        fork-guard-elim(X);
      }
    }
    
    // fork-tile![n](X): convenience wrapper around the fork-tile pass that
    // forwards the tile size n and fixes the remaining three arguments to
    // 0, false, true.
    // NOTE(review): the meaning of the fixed 0 / false / true arguments is
    // not visible in this file — check the fork-tile pass definition.
    macro fork-tile![n](X) {
      fork-tile[n, 0, false, true](X);
    }
    
    // parallelize!(X): applies parallel-fork and then parallel-reduce to X.
    macro parallelize!(X) {
      parallel-fork(X);
      parallel-reduce(X);
    }
    
    // unforkify!(X): runs fork-split on X and then unforkify on the result.
    // NOTE(review): fork-split presumably has to run first so unforkify sees
    // single-dimension forks — confirm.
    macro unforkify!(X) {
      fork-split(X);
      unforkify(X);
    }
    
    // Common prelude for both targets: expand the optimize! and forkify!
    // macros over `*` (presumably everything in the module — confirm).
    optimize!(*);
    forkify!(*);

    if feature("cuda") {
      // GPU path, taken when the "cuda" feature is enabled.
      // Repeat reduce-slf / slf / infer-schedules inside a fixpoint block.
      fixpoint {
        reduce-slf(*);
        slf(*);
        infer-schedules(*);
      }
      // Coalesce forks, then re-run schedule inference and dead code
      // elimination on the result.
      fork-coalesce(*);
      infer-schedules(*);
      dce(*);

      //rewrite(*);

      // Outline the region labelled matmul@outer and assign it to the GPU.
      let out = outline(matmul@outer);
      gpu(out);

      // Repeat simplify-cfg / dce inside a fixpoint block.
      // NOTE(review): presumably tidies control flow left over after
      // outlining — confirm.
      fixpoint {
        simplify-cfg(*);
        dce(*);
      }

      optimize!(*);
      codegen-prep!(*);
    } else {
      // CPU path, taken when the "cuda" feature is not enabled.
      associative(matmul@outer);

      // Parallelize by computing output array as 16 chunks
      let par = matmul@outer \ matmul@inner;
      fork-tile![4](par);
      let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
      parallelize!(outer \ inner);

      // Outline the `inner` selection and assign the outlined body to the CPU.
      let body = outline(inner);
      cpu(body);

      // Tile for cache, assuming 64B cache lines
      fork-tile![16](body);
      let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);

      // Apply reduce-slf to the new `inner` selection, lower the forks in
      // `body` with unforkify!, then prepare everything for code generation.
      reduce-slf(inner);
      unforkify!(body);
      codegen-prep!(*);
    }