diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet2_canny_hpvm/layers.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet2_canny_hpvm/layers.txt
new file mode 100644
index 0000000000000000000000000000000000000000..01f40077d4f8342479d1965551af2d7e30a4c3f2
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet2_canny_hpvm/layers.txt
@@ -0,0 +1,13 @@
+conv  add  tanh
+conv  add  tanh  pool
+conv  add  tanh
+conv  add  tanh  pool
+conv  add  tanh
+conv  add  tanh  pool
+dense  add
+reduce
+conv
+conv
+conv
+reduce
+reduce
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet2_canny_hpvm/op_cost.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet2_canny_hpvm/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..80ff2706a43e33b81af6d47e96f702efdfcb21b3
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet2_canny_hpvm/op_cost.txt
@@ -0,0 +1,13 @@
+468.076
+947.434
+255.422
+348.769
+256.658
+1.05427
+1.05427
+107.5062
+666.888
+432.622
+252.458
+11.51922
+2.01168
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet_imagenet/layer_composition.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet_imagenet/layer_composition.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b2bf962cd60722978b3205adca9c5822e59fc603
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet_imagenet/layer_composition.txt
@@ -0,0 +1,8 @@
+conv  add  activation  pool  
+conv  add  activation  pool  
+conv  add  activation  
+conv  add  activation  
+conv  add  activation  pool  
+dense  add  activation  
+dense  add  activation  
+dense  add  
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet_imagenet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet_imagenet/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ec3b8b5f375673e659594dca7ad8fd8ef6ace435
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/alexnet_imagenet/op_cost.txt
@@ -0,0 +1,8 @@
+1457111.000000
+4478976.000000
+2242805.750000
+2990407.750000
+1993605.125000
+754974.750000
+335544.312500
+81920.000000
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt
index 0a5aeb2cb5e58bc3a7e6c37205d841c17889dfb9..ee2cd80cb6e33da5e97ffe2e842644d7a705cdff 100644
--- a/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt
@@ -18,6 +18,24 @@ perf,135	4,1,0	1.33	tensorConvolution	tensorConvApprox	dev	conv
 perf,136	4,1,1	1.33	tensorConvolution	tensorConvApprox	dev	conv
 perf,137	4,1,2	1.33	tensorConvolution	tensorConvApprox	dev	conv
 perf,138	4,1,3	1.33	tensorConvolution	tensorConvApprox	dev	conv
+perf_fp16,151	1,2,0	3.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,152	1,2,1	3.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,153	2,1,0	3.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,154	2,1,1	3.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,155	1,3,0	2.25	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,156	1,3,1	2.25	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,157	1,3,2	2.25	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,158	3,1,0	2.25	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,159	3,1,1	2.25	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,160	3,1,2	2.25	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,161	1,4,0	2.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,162	1,4,1	2.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,163	1,4,2	2.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,164	1,4,3	2.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,165	4,1,0	2.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,166	4,1,1	2.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,167	4,1,2	2.0	tensorConvolution	tensorConvApprox	install	conv
+perf_fp16,168	4,1,3	2.0	tensorConvolution	tensorConvApprox	install	conv
 samp,231	2,0,1	2.0	tensorConvolution	tensorConvApprox	dev	conv
 samp,232	2,1,1	2.0	tensorConvolution	tensorConvApprox	dev	conv
 samp,233	3,0,1	1.5	tensorConvolution	tensorConvApprox	dev	conv
@@ -27,6 +45,15 @@ samp,236	4,0,1	1.33	tensorConvolution	tensorConvApprox	dev	conv
 samp,237	4,1,1	1.33	tensorConvolution	tensorConvApprox	dev	conv
 samp,238	4,2,1	1.33	tensorConvolution	tensorConvApprox	dev	conv
 samp,239	4,3,1	1.33	tensorConvolution	tensorConvApprox	dev	conv
+samp_fp16,261	2,0,1	3.0	tensorConvolution	tensorConvApprox	install	conv
+samp_fp16,262	2,1,1	3.0	tensorConvolution	tensorConvApprox	install	conv
+samp_fp16,263	3,0,1	2.25	tensorConvolution	tensorConvApprox	install	conv
+samp_fp16,264	3,1,1	2.25	tensorConvolution	tensorConvApprox	install	conv
+samp_fp16,265	3,2,1	2.25	tensorConvolution	tensorConvApprox	install	conv
+samp_fp16,266	4,0,1	2.0	tensorConvolution	tensorConvApprox	install	conv
+samp_fp16,267	4,1,1	2.0	tensorConvolution	tensorConvApprox	install	conv
+samp_fp16,268	4,2,1	2.0	tensorConvolution	tensorConvApprox	install	conv
+samp_fp16,269	4,3,1	2.0	tensorConvolution	tensorConvApprox	install	conv
 red_samp,41	1	1.5	tensorReduction		tensorReduction		dev	red
 red_samp,42	1	2.25	tensorReduction		tensorReduction		dev	red
 red_samp,43	1	1.4	tensorReduction		tensorReduction		dev	red
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/mobilenet_torch/layers.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/mobilenet_torch/layers.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a93fac1daed00254fca84258bc92e7788390fd93
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/mobilenet_torch/layers.txt
@@ -0,0 +1,81 @@
+conv
+batchnorm
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+depthwise_conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+dense  add
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/mobilenet_torch/op_cost.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/mobilenet_torch/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..44d50dbe00baba66bd76bb7a0d2a9f37b8580fd4
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/mobilenet_torch/op_cost.txt
@@ -0,0 +1,15 @@
+44236.80078
+104857.6019
+104857.6019
+209715.2037
+104857.6019
+209715.2037
+104857.6019
+209715.2037
+209715.2037
+209715.2037
+209715.2037
+209715.2037
+104857.6019
+209715.2037
+256.000000
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/resnet18_torch/op_cost.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/resnet18_torch/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fb1aef66aaa4a02c5eb6f9282753a43c629f203
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/resnet18_torch/op_cost.txt
@@ -0,0 +1,21 @@
+88473.60156
+1887436.833
+1887436.833
+1887436.833
+1887436.833
+3774873.667
+1887436.833
+26214.40046
+1887436.833
+1887436.833
+3774873.667
+1887436.833
+13107.20023
+1887436.833
+1887436.833
+3774873.667
+1887436.833
+6553.600116
+1887436.833
+1887436.833
+64.0000000
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/resnet18_torch/resnet_layers.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/resnet18_torch/resnet_layers.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2e51c67842656762091f2465b2824235a9959723
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/resnet18_torch/resnet_layers.txt
@@ -0,0 +1,59 @@
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+conv
+batchnorm
+conv
+batchnorm
+conv
+batchnorm
+activation
+conv
+batchnorm
+activation
+activation
+pool_mean
+dense  add
\ No newline at end of file
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/vgg16_cifar10_torch/layers.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/vgg16_cifar10_torch/layers.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ef3d0ebcf7c50b8a67a7c42cc71d4b69fe21fde2
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/vgg16_cifar10_torch/layers.txt
@@ -0,0 +1,46 @@
+conv  add
+batchnorm
+activation
+conv  add
+batchnorm
+activation
+pool
+conv  add
+batchnorm
+activation
+conv  add
+batchnorm
+activation
+pool
+conv  add
+batchnorm
+activation
+conv  add
+batchnorm
+activation
+conv  add
+batchnorm
+activation
+pool
+conv  add
+batchnorm
+activation
+conv  add
+batchnorm
+activation
+conv  add
+batchnorm
+activation
+pool
+conv  add
+batchnorm
+activation
+conv  add
+batchnorm
+activation
+conv  add
+batchnorm
+activation
+pool
+pool_mean
+dense  add
diff --git a/llvm/projects/hpvm-tensor-rt/autotuner/data/vgg16_cifar10_torch/op_cost.txt b/llvm/projects/hpvm-tensor-rt/autotuner/data/vgg16_cifar10_torch/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..10dc83f865f3cc4ec02e86d4ae9f689eaa143610
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/autotuner/data/vgg16_cifar10_torch/op_cost.txt
@@ -0,0 +1,15 @@
+88473.60156
+1887436.833
+943718.4167
+1887436.833
+943718.4167
+1887436.833
+1887436.833
+943718.4167
+1887436.833
+1887436.833
+471859.2083
+471859.2083
+471859.2083
+13107.200195
+256.000000
diff --git a/llvm/projects/hpvm-tensor-rt/model_params/alexnet2_cifar10/quant_ranges.txt b/llvm/projects/hpvm-tensor-rt/model_params/alexnet2_cifar10/quant_ranges.txt
new file mode 100644
index 0000000000000000000000000000000000000000..46165302552e71fd678600da85140d33101d298e
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/model_params/alexnet2_cifar10/quant_ranges.txt
@@ -0,0 +1,7 @@
+-1.8816435 2.0934134 -0.5421946 0.3710851 -0.06697306 0.040868897 -0.775027394891 0.779944300652 
+-0.775027394891 0.779944300652 -0.42474225 0.31460348 -0.3557253 -0.17281663 -0.808667064309 0.983953297734 
+-0.808667064309 0.983953297734 -0.44134507 0.79587924 -0.80424446 0.75330096 -0.995678424835 0.998566448689 
+-0.995678424835 0.998566448689 -0.2883836 0.31025785 -0.6353164 0.29015934 -0.993219196796 0.992379009724 
+-0.993219196796 0.992379009724 -0.2792431 0.37689754 -1.1379756 1.2391574 -0.999901354313 0.999910891056 
+-0.999901354313 0.999910891056 -0.27078503 0.27942517 -0.503003 0.12762362 -0.991036117375 0.971404970288 
+-0.991036117375 0.971404970288 -0.24273404 0.5845544 -0.53745 0.558251 -119.27973732 -25.2262819576
diff --git a/llvm/projects/hpvm-tensor-rt/model_params/alexnet_cifar10/quant_ranges.txt b/llvm/projects/hpvm-tensor-rt/model_params/alexnet_cifar10/quant_ranges.txt
new file mode 100644
index 0000000000000000000000000000000000000000..789a4114a5a468b3634506c4016b16b8b80c9131
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/model_params/alexnet_cifar10/quant_ranges.txt
@@ -0,0 +1,6 @@
+-1.88164262419 2.09340954985 -0.33087718 0.3323643 -0.7782218 0.6020472 -0.978641152382 0.998945295811 
+-0.978641152382 0.998945295811 -0.2095158 0.33543423 -0.45020863 0.30596754 -0.999703943729 0.999930202961 
+-0.999703943729 0.999930202961 -0.1715614 0.17037082 -0.6519161 0.5939945 -0.999933600426 0.999940037727 
+-0.999933600426 0.999940037727 -0.15575546 0.14456555 -0.55873865 0.4704539 -0.99999910593 0.999999344349 
+-0.99999910593 0.999999344349 -0.16108225 0.16864482 -0.22135437 0.10401678 -0.999434411526 0.999634206295 
+-0.999434411526 0.999634206295 -0.18183032 0.19018902 -0.07189204 0.106005594 -15.0765653801 19.4225852203 
diff --git a/llvm/projects/hpvm-tensor-rt/model_params/lenet_mnist/quant_ranges.txt b/llvm/projects/hpvm-tensor-rt/model_params/lenet_mnist/quant_ranges.txt
new file mode 100644
index 0000000000000000000000000000000000000000..af4d13d6f8e6b5902ff743b07ef6875d644df91a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/model_params/lenet_mnist/quant_ranges.txt
@@ -0,0 +1,4 @@
+0 1 -1 1 -1 1 -1 1
+-1 1 -1 1 -1 1 -1 1
+-1 1 -1 1 -1 1 -1 1
+-1 1 -1 1 -1 1 -1 1
diff --git a/llvm/projects/hpvm-tensor-rt/model_params/mobilenet/quant_ranges.txt b/llvm/projects/hpvm-tensor-rt/model_params/mobilenet/quant_ranges.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9ea66b8485dc19a8f2f9abfc5981e023f22ce521
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/model_params/mobilenet/quant_ranges.txt
@@ -0,0 +1,15 @@
+-1.9892114 2.126797 -2.19630692005 1.34758170414  0.0  0.0  -60.892750473 51.9925691605 
+0.0 5.71354155397 -0.931772116065 1.07742589378   0.0  0.0 -6.51858950329 6.81084251881 
+0.0 4.93213940287 -0.531654466152 0.57537904036   0.0  0.0  -4.48263123512 3.96730119753 
+0.0 4.10326339769 -0.362340988219 0.407691390038   0.0  0.0  -4.04261828327 3.8867793293 
+0.0 5.38322130251 -0.313120054901 0.293576799393   0.0  0.0  -5.92146921539 4.33867932415 
+0.0 4.31673815441 -0.232992478013 0.258029025793   0.0  0.0  -4.20778994751 3.93243697071 
+0.0 5.8304081068 -0.202337772191 0.189983081758   0.0  0.0  -6.29828691578 4.84813511753 
+0.0 4.44641780996 -0.174427356511 0.176958308667  0.0  0.0   -4.34791088581 3.61443646955 
+0.0 4.5180956049 -0.145467961878 0.15256431669   0.0  0.0   -3.02877027559 2.94873657799 
+0.0 6.34857563496 -0.130258745223 0.135582433432   0.0  0.0  -4.22931008053 3.53150463724 
+0.0 5.22100311041 -0.119001727596 0.125363747835   0.0  0.0  -4.03820378017 4.00400940704 
+0.0 5.73249834776 -0.108397216856 0.116256686077    0.0  0.0  -3.31110151148 4.46293323326 
+0.0 7.24049821186 -0.0862374496162 0.0885944995135   0.0  0.0  -4.17543139458 6.2043294754 
+0.0 7.81395883465 -0.0681302513927 0.0700202777982    0.0  0.0  -10.9205664234 2.64429125786 
+0.0 2.86920666504 -0.223010196954 0.14426593782 -0.1654396 0.23336112 -12.2459499588 23.8053251343
diff --git a/llvm/projects/hpvm-tensor-rt/model_params/resnet18_cifar10/quant_ranges.txt b/llvm/projects/hpvm-tensor-rt/model_params/resnet18_cifar10/quant_ranges.txt
new file mode 100644
index 0000000000000000000000000000000000000000..af0279b1d2980d8c8d71f20f3ef8c3f3da585699
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/model_params/resnet18_cifar10/quant_ranges.txt
@@ -0,0 +1,22 @@
+-0.5500815 0.60786617 -1.0248864 1.2929907 -0.36291853 0.2533059 0.0 0.753551840782 
+0.0 0.753551840782 -0.69884616 0.71849966 -0.2781147 0.45571187 0.0 1.01057458043 
+0.0 1.01057458043 -0.59568167 0.7714691 -0.8602873 0.19743633 -1.84771883726 1.87930787086 
+0.0 2.33981014252 -0.41976976 0.43748936 -0.7021962 0.3033103 0.0 1.04317724705 
+0.0 1.04317724705 -0.46757826 0.4635873 -0.20662616 0.1778044 -0.829483509064 0.786805033684 
+0.0 2.49733686686 -0.64404047 0.45383143 -0.819547 0.38550296 0.0 0.897360802293 
+0.0 0.897360802293 -0.41986948 0.33654243 -0.3563013 0.22371122 -0.957150224447 0.54919362247 
+0.0 2.37362146616 -0.4805263 0.50655717 -0.296758 0.7742441 0.0 3.01592136621 
+0.0 3.01592136621 -0.52083415 0.45517674 -0.20242067 0.8236838 -5.2759475708 5.79733039856 
+0.0 2.37362146616 -0.5338656 1.3395424 -0.20242067 0.8236838 -0.738995380998 2.33600783587 
+0.0 7.07933432579 -0.34429058 0.43629733 -1.0744808 0.056708273 0.0 1.58645607233 
+0.0 1.58645607233 -0.30342352 0.39493486 -0.44630566 0.6492069 -1.49672914267 1.29970229745 
+0.0 7.11914063454 -0.38351893 0.45775774 -1.4733055 -0.014426912 0.0 1.52876508832 
+0.0 1.52876508832 -0.25695276 0.45372736 -0.5259744 0.26591402 -1.59576894164 1.08074297309 
+0.0 6.94405080318 -0.55299705 0.5443531 -0.71790683 1.2730768 0.0 10.3651468277 
+0.0 10.3651468277 -0.4203967 0.48641303 -0.90653443 1.3546854 -22.372925148 17.2033731079 
+0.0 6.94405080318 -0.4365755 0.84913826 -0.90653443 1.3546851 -3.66810325861 4.87814051151 
+0.0 18.8401451111 -0.38657624 0.5228989 -1.2083547 0.76361173 0.0 19.1229192352 
+0.0 19.1229192352 -0.40857902 0.575035 -1.8731614 1.0960501 -31.3229312897 14.8234729958 
+0.0 23.7382488823 -0.33079496 0.5893278 -1.0234511 1.0016295 0.0 19.5892774963 
+0.0 19.5892774963 -0.27897888 0.38280907 -2.2086356 1.0066502 -34.4416886902 20.9890329933 
+0.0 10.8541981602 -1.5092047 1.0279838 -0.49379802 0.61032647 -40.9121678543 25.7082381058
diff --git a/llvm/projects/hpvm-tensor-rt/model_params/vgg16_cifar10/quant_ranges.txt b/llvm/projects/hpvm-tensor-rt/model_params/vgg16_cifar10/quant_ranges.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b742502f145c535db5432c0f6a0de27ba3ed3979
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/model_params/vgg16_cifar10/quant_ranges.txt
@@ -0,0 +1,15 @@
+-1.8816367 2.0934217 -0.53275156 0.49437004 -0.6403629 0.2490165 0.0 1.35908746719 
+0.0 1.35908746719 -0.2688396 0.20639156 -0.7745511 0.82006615 0.0 2.52123117924 
+0.0 2.52123117924 -0.16776876 0.14878987 -0.35283303 0.5154362 0.0 1.20119857848 
+0.0 1.20119857848 -0.088948585 0.114222586 -0.30250227 0.36856708 0.0 1.03598809302 
+0.0 1.03598809302 -0.07739562 0.10973293 -0.15568458 0.17634983 0.0 0.300495595038 
+0.0 0.300495595038 -0.051649556 0.05435231 -0.07395447 0.07996062 0.0 0.11490475405 
+0.0 0.11490475405 -0.043513633 0.07577866 -0.06921874 0.02660573 0.0 0.16232508488 
+0.0 0.16232508488 -0.033842053 0.045218028 -0.022827804 0.023845317 0.0 0.124249965735 
+0.0 0.124249965735 -0.02211613 0.032084666 -0.02699063 0.03773564 0.0 0.174634486511 
+0.0 0.174634486511 -0.01979376 0.034854397 -0.036107242 0.07056531 0.0 0.575175762177 
+0.0 0.575175762177 -0.03452098 0.046055835 -0.051925894 0.07039055 0.0 0.771875114441 
+0.0 0.771875114441 -0.025946895 0.040090334 -0.06049362 0.12658806 0.0 1.17285169065 
+0.0 1.17285169065 -0.021766115 0.03315237 -0.20705001 0.117947325 0.0 2.00157693863 
+0.0 2.00157693863 -0.042597745 0.046707444 -0.21937433 0.2545502 0.0 2.00236111879 
+0.0 2.00236111879 -0.32550547 0.30829763 -1.1787822 1.2378151 -18.2514705467 24.1736344528
diff --git a/llvm/projects/hpvm-tensor-rt/model_params/vgg16_cifar100/quant_ranges.txt b/llvm/projects/hpvm-tensor-rt/model_params/vgg16_cifar100/quant_ranges.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e614e1664822d2ecf6fa426a7eb2fd7c362a2e7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/model_params/vgg16_cifar100/quant_ranges.txt
@@ -0,0 +1,15 @@
+-1.7829767 1.9456929 -0.7450515 0.71249133 -1.5885142 0.275554 0.0 8.190712 
+0.0 8.190712 -0.30790088 0.43504623 -1.4242363 1.2602744 0.0 19.023172 
+0.0 19.023172 -0.29189092 0.26958522 -1.0527138 0.9075671 0.0 14.428051 
+0.0 14.428051 -0.15521508 0.1829038 -0.845419 1.9358484 0.0 23.065294 
+0.0 23.065294 -0.13149762 0.14811686 -0.7162557 1.0370971 0.0 15.165984 
+0.0 15.165984 -0.06236292 0.08321518 -0.9067523 0.9922458 0.0 13.664733 
+0.0 13.664733 -0.06471479 0.1024472 -0.15943134 0.7988499 0.0 19.025272 
+0.0 19.025272 -0.06320205 0.08291938 -0.32540628 0.5203079 0.0 6.727217 
+0.0 6.727217 -0.037707984 0.051601283 -0.25622904 0.11251946 0.0 3.2003012 
+0.0 3.2003012 -0.056007143 0.09549151 -0.11591503 0.06267536 0.0 4.321189 
+0.0 4.321189 -0.060094673 0.10868926 -0.105962686 0.09584572 0.0 2.936297 
+0.0 2.936297 -0.034618977 0.05792674 -0.4237576 0.11035452 0.0 4.87262 
+0.0 4.87262 -0.035480656 0.058295887 -0.21477045 0.14263579 0.0 10.32133 
+0.0 10.32133 -0.08929961 0.11301676 -0.20798548 0.47405547 0.0 13.91 
+0.0 13.91 -0.6627122 0.35539475 -1.0631907 0.9830786 -70.45701 87.34367 
diff --git a/llvm/projects/pred_tuner/.gitignore b/llvm/projects/pred_tuner/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..23e6d258015162d516c02fecb0a4f87acf4fb73d
--- /dev/null
+++ b/llvm/projects/pred_tuner/.gitignore
@@ -0,0 +1,28 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Opentuner
+opentuner.db/
+opentuner.log
+
+# Custom
+.idea/
+.vscode/
+/data/
+results/
+tuner_results
+tuner_results/
+*.sh
+*.ipynb
+logistics/
+autotuner/
diff --git a/llvm/projects/pred_tuner/LICENSE b/llvm/projects/pred_tuner/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..2e229faa39851c4ddf71b0284c7e56a02dfd577a
--- /dev/null
+++ b/llvm/projects/pred_tuner/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 liukuang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/llvm/projects/pred_tuner/README.md b/llvm/projects/pred_tuner/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8d7a6db2bdc622e6cac73c56e443e8d3e797133c
--- /dev/null
+++ b/llvm/projects/pred_tuner/README.md
@@ -0,0 +1,93 @@
+# Autotuning with Error-predictive Proxy
+
+Performs autotuning on program approximation knobs, using an error-predictive proxy in place of the
+original program to greatly speed up autotuning while obtaining results of comparable quality.
+
+Work in progress.
+
+## Getting Started
+
+After finishing this README, go to [./proxy_tuner.py](./proxy_tuner.py) to try tuning one
+model. Use this set of arguments to start:
+
+```bash
+python proxy_tuner.py --test-limit 1000 --accuracy-drop 1.5 --accuracy-slack 2.1 \
+-o tuner_output alexnet2 autotuner/data/alexnet2
+```
+
+## Supported Programs & Approximations
+
+### Programs
+
+Currently DNN-only. Support for several image processing benchmarks is in progress.
+
+Supported DNNs:
+
+- `LeNet @ MNIST`
+
+- `AlexNet @ CIFAR-10`
+
+- `AlexNet2 @ CIFAR-10`
+
+- `VGG16 @ CIFAR-10`
+
+- `ResNet18 @ CIFAR-10`
+
+- `MobileNet @ CIFAR-10`
+
+- `VGG16 @ CIFAR-100`
+
+- `VGG16 @ ImageNet`
+
+- `ResNet50 @ ImageNet`
+
+### Approximations
+
+Currently _hardware-independent_ approximations only. Support for hardware-dependent approximations is in progress.
+
+Approximations: output perforation for convolutions, and kernel sampling for convolutions.
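+
+As a minimal sketch of what output perforation does (illustrative only: `perforated_conv2d`
+and `row_stride` are made-up names, not the runtime's API, and this models the accuracy
+effect of perforation rather than its speedup):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def perforated_conv2d(x: torch.Tensor, weight: torch.Tensor, row_stride: int = 2):
+    """Keep every `row_stride`-th output row; fill skipped rows by repetition."""
+    full = F.conv2d(x, weight)              # exact output, used as shape reference
+    kept = full[:, :, ::row_stride, :]      # the rows a perforated kernel computes
+    approx = kept.repeat_interleave(row_stride, dim=2)
+    return approx[:, :, :full.shape[2], :]  # trim back to the exact output shape
+```
+
+Kernel sampling is analogous, but skips a subset of the filter (kernel) elements
+instead of the output rows.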
+
+## Proxy Model
+
+TODO: add working principle of proxy modeling.
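+
+As a placeholder for that writeup: the estimator names in `toolkit`
+(`LinearCombEstimator`, `WeightedLinearCombEstimator`, etc.) suggest a first-order
+model that profiles each (layer, knob) approximation in isolation and sums the
+measured effects. A minimal sketch under that assumption (all names below are
+illustrative):
+
+```python
+from typing import Dict, Tuple
+
+def predict_accuracy(
+    baseline_acc: float,
+    delta: Dict[Tuple[int, int], float],  # (layer, knob) -> accuracy drop, profiled alone
+    config: Dict[int, int],               # layer -> knob to apply
+) -> float:
+    # First-order assumption: per-layer accuracy drops combine additively.
+    return baseline_acc - sum(delta[layer, knob] for layer, knob in config.items())
+```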
+
+## Autotuner
+
+We use [opentuner](http://opentuner.org/) for autotuning tasks.
+
+## Project Structure
+
+### Library
+
+- `models`: PyTorch definitions for DNN models
+
+  - `models/dataset`: Dataset loaders for both HPVM and PyTorch-standard DNN models
+
+  - `models/hpvm`: Definitions of HPVM-ported models, with customized convolution layers
+
+- `toolkit`: core code of the project, including DNN indexing / transformations / approximations. See
+  the code for details.
+
+### Entry Point
+
+- `./proxy_tuner.py`: perform autotuning for a given model, accuracy threshold, and number of iterations,
+  using a proxy model that predicts the accuracy of the approximated DNN (instead of running inference,
+  which can be slow).
+
+- `./run_proxy_tuner.py`: run autotuning for all models defined in `utils/tuner_postprocess/benchmarks.py` on
+  a set of 3 accuracy thresholds, and perform postprocessing such as computing the Pareto curve.
+  
+  This is the right end-to-end script to use for obtaining a comprehensive set of autotuner results.
+
+### Other Code
+
+- `tests`: runnable scripts that can be used as tests (and that provide some other functionality)
+
+- `utils`: helper functions for the library and autotuner that are generally standalone, except that
+
+  - `utils/utils.py` contains some convenient wrappers for model training, etc. that depend on the library.
+
+### Data
+
+- `autotuner/data`: descriptions of each DNN model, such as the list of layers, tunable
+  knobs, etc.
diff --git a/llvm/projects/pred_tuner/bin/benchmark.py b/llvm/projects/pred_tuner/bin/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..92c8b2de5262469d9b752b5a2acd28db55e464a5
--- /dev/null
+++ b/llvm/projects/pred_tuner/bin/benchmark.py
@@ -0,0 +1,111 @@
+import gc
+from time import time
+from typing import Dict, Iterator, List
+
+import numpy
+from tqdm import tqdm
+
+from exp import Benchmark, bench_tuner_data
+from toolkit import ConfigT, LinearCombEstimator, LinearEstimator, LinearQoSEstimator, ModuleIndexer, \
+    NetApproxSelector
+from utils import gpu_mem_mb, init_by_name, nn_to_output, tensor_to_accuracy
+
+
+def generate_random_configs(layer_approxes: Dict[int, List[int]], n_configs: int) -> Iterator[ConfigT]:
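+    # Build each random config by choosing a random subset of approximable
+    # layers and assigning each chosen layer one knob from its candidate list.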
+    from numpy.random import choice
+    from random import randrange
+    all_layers = [k for k, ns in layer_approxes.items() if ns]
+    for _ in range(n_configs):
+        config = {}
+        n_approx_layers_ = randrange(len(all_layers) + 1)
+        approx_layers = choice(all_layers, n_approx_layers_, replace=False)
+        for layer_idx in approx_layers:
+            config[layer_idx] = choice(layer_approxes[layer_idx], 1)[0]
+        yield config
+
+
+def time_action(action):
+    tt0 = time()
+    action()
+    tt1 = time()
+    return tt1 - tt0
+
+
+def mean_std_str(np_array):
+    return f"{np_array.mean():.7f} +- {np_array.std():.7f}"
+
+
+def main_loop(bench, baseline_dag, testloader):
+    _t_baseline_inf = time()
+    baseline_output = nn_to_output(baseline_dag.module, testloader)
+    baseline_acc = tensor_to_accuracy(baseline_output, testloader)
+    print(f"Model accuracy: {baseline_acc}; test set size: {baseline_output.size(0)}")
+    t_baseline_inf = time() - _t_baseline_inf
+    nas = NetApproxSelector(baseline_dag)
+
+    def acc_crit(inputs_):
+        return tensor_to_accuracy(inputs_, testloader)
+
+    def threshold_eval(inputs_):
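+        # A configuration is acceptable if its mean accuracy over repeated
+        # runs drops less than 3% below the baseline accuracy.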
+        import numpy as np
+        accs = np.array([acc_crit(x) for x in inputs_])
+        return baseline_acc - accs.mean() < 3.0
+
+    def run_model(net):
+        return nn_to_output(net, testloader)
+
+    _t_profile = time()
+    pickle_path = bench.result_dir / 'proxy.pkl'
+    f1 = LinearCombEstimator(
+        nas, run_model, acc_crit, threshold_eval, 0.95, independent_init=False
+    )
+    f2 = LinearQoSEstimator(
+        nas, run_model, acc_crit, threshold_eval, 0.95, independent_init=False
+    )
+    LinearEstimator.coinit_estimators(nas, run_model, threshold_eval, f1, f2, storage=pickle_path)
+    t_profile = time() - _t_profile
+    print(
+        f"Baseline inference time: {t_baseline_inf:.3f} sec, predictor init time: {t_profile:.3f} sec; "
+        f"Predictor init time is {t_profile / t_baseline_inf:.3f} times of inference time"
+    )
+    configs = generate_random_configs(nas.net_approxes, 30)
+    pbar = tqdm(configs)
+    times = []
+    for config in pbar:
+        pbar.set_postfix(mem=gpu_mem_mb())
+        approx = nas.apply_approx_by_config(config).module
+        t_inf = time_action(lambda: nn_to_output(approx, testloader))
+        t_f1 = time_action(lambda: f1.estimate(config))
+        t_f2 = time_action(lambda: f2.estimate(config))
+        pbar.write(
+            f"Inference time: {t_inf:.3f} sec, predictors time: {t_f1:.3f} | {t_f2:.3f} sec"
+        )
+        times.append([t_inf, t_f1, t_f2])
+        gc.collect()
+    times = numpy.array(times)
+    s_inf, s0, s1 = numpy.apply_along_axis(mean_std_str, 0, times)
+    print(f"Result: inference time {s_inf}, predictor time: {s0} | {s1}")
+    print("Timing raw data:", times)
+
+
+def main():
+    for network in (
+            'alexnet_hpvm', 'alexnet2_hpvm',
+            'vgg16_cifar10_hpvm', 'vgg16_cifar100_hpvm',
+            'mobilenet_hpvm',
+            'resnet18_hpvm',
+            'lenet_hpvm',
+            'vgg16_imagenet_hpvm',
+            'alexnet_imagenet_hpvm',
+            # 'resnet50_imagenet_hpvm',
+    ):
+        bench: Benchmark = bench_tuner_data[network]
+        print(f"{network}: ")
+        baseline, testloader, _, shapes = init_by_name(network)
+        baseline_dag = ModuleIndexer(baseline)
+        main_loop(bench, baseline_dag, testloader)
+        gc.collect()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/bin/discrepancy.py b/llvm/projects/pred_tuner/bin/discrepancy.py
new file mode 100644
index 0000000000000000000000000000000000000000..8be92df66ae3a2bcb2d33088bb20064404d37913
--- /dev/null
+++ b/llvm/projects/pred_tuner/bin/discrepancy.py
@@ -0,0 +1,53 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+import matplotlib.pyplot as plt
+import seaborn
+import torch
+from tqdm import tqdm
+
+from toolkit import ModuleIndexer, NetApproxSelector, StateCapturer
+from utils import device, init_by_name
+
+
+def run_concat_output_at(net_index: ModuleIndexer, testloader, layer: int) -> Optional[torch.Tensor]:
+    snet = StateCapturer(net_index, lambda i, x: x.clone().detach() if i == layer else None)
+    for inputs, targets in testloader:
+        inputs, targets = inputs.to(device), targets.to(device)
+        snet(inputs)
+    outputs = snet.net_state[layer]
+    return torch.cat(outputs) if outputs else None
+
+
+def get_discrepancy_for(baseline, approxed, testloader, changed_layer):
+    baseline_output = run_concat_output_at(baseline, testloader, changed_layer)
+    approxed_output = run_concat_output_at(approxed, testloader, changed_layer)
+    assert baseline_output.shape == approxed_output.shape
+    tqdm.write(f"{baseline_output.size()}")
+    diff = baseline_output - approxed_output
+    diff_rel = torch.abs(diff / baseline_output).cpu()
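+    # Where the baseline output is 0 the ratio is NaN (0/0) or inf; zero out
+    # the NaNs and clamp values above 10 (including inf) to keep plots readable.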
+    diff_rel[torch.isnan(diff_rel)] = 0
+    diff_rel[diff_rel > 10] = 10
+    return diff_rel
+
+
+def main():
+    prefix = Path('results/discrepancy/resnet50_imagenet_hpvm')
+    os.makedirs(prefix, exist_ok=True)
+    baseline, testloader, _, shapes = init_by_name('resnet50_imagenet_hpvm')
+    net_index = ModuleIndexer(baseline)
+    nas = NetApproxSelector(net_index)
+    total = sum(len(ns) for ns in nas.net_approxes.values())
+    for layer, approx, approxed_net_dag in tqdm(nas.apply_indep_approx(), total=total):
+        if approx == 11:
+            continue
+        diff_rel = get_discrepancy_for(net_index, approxed_net_dag, testloader, layer)
+        fig, ax = plt.subplots()
+        seaborn.heatmap(diff_rel.mean(0).mean(0).numpy(), ax=ax)
+        fig.savefig((prefix / f'{layer}_{approx}.png').open('wb'), dpi=200)
+        plt.close(fig)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/bin/filter_configs.py b/llvm/projects/pred_tuner/bin/filter_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf23668b81ff0bdf071d27d9e010932ab07e6eea
--- /dev/null
+++ b/llvm/projects/pred_tuner/bin/filter_configs.py
@@ -0,0 +1,54 @@
+from typing import List, Tuple
+
+from exp import Benchmark, ExpState, bench_tuner_data
+from utils.config import Config
+
+
+def filter_configs(
+        validation: List[Config], test: List[Config],
+        vali_threshold: float, test_threshold: float = 3.0
+) -> Tuple[List[Config], List[Config]]:
+    # Filter validation and test set by their respective thresholds
+    filtered_validation = [
+        c for c in validation if c.avg_loss <= vali_threshold
+    ]
+    filtered_test = [
+        c for c in test if c.avg_loss <= test_threshold
+    ]
+    # Test configs also need to be a subset of validation configs.
+    name_to_filtered = {x.fname: x for x in filtered_test}
+    intersect_names = set(name_to_filtered).intersection(
+        x.fname for x in filtered_validation
+    )
+    filtered_test_ = [name_to_filtered[fname] for fname in intersect_names]
+    assert set([id(x) for x in filtered_test_]).issubset(set([id(x) for x in filtered_test]))
+    return filtered_validation, filtered_test_
+
+
+def process_configs(bench: Benchmark, calib_slack: float, states: ExpState):
+    validated_configs = states.validated_configs.configs
+    tested_configs = states.tested_configs.configs
+    old_len = len(validated_configs)
+    valid_configs, test_configs = filter_configs(
+        validated_configs, tested_configs, calib_slack
+    )
+    states.valid_configs.finalize_dump(valid_configs)
+    states.test_configs.finalize_dump(test_configs)
+    print(f"{bench.model_name}: {old_len} -> {len(validated_configs)}, {len(tested_configs)}")
+    # Finalize data input and plot everything.
+    states.finalize_plot()
+
+
+def main():
+    for bench in bench_tuner_data.values():
+        bench: Benchmark
+        try:
+            states = ExpState(bench)
+        except ValueError:
+            print(f"Model {bench.model_name} has incomplete experiment data; skipping")
+            continue
+        process_configs(bench, 2.1, states)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/bin/inferences.py b/llvm/projects/pred_tuner/bin/inferences.py
new file mode 100644
index 0000000000000000000000000000000000000000..065abfd223f0a5c234dd36cc8aca7324415ac96f
--- /dev/null
+++ b/llvm/projects/pred_tuner/bin/inferences.py
@@ -0,0 +1,9 @@
+from tqdm import tqdm
+
+from models import BaselineInfo, networks
+from utils import device
+
+if __name__ == '__main__':
+    for net_name in networks:
+        baseline_info = BaselineInfo.init_by_name(net_name, device)
+        tqdm.write(f"{net_name}: {baseline_info.val_qos} (validation) {baseline_info.test_qos} (test")
diff --git a/llvm/projects/pred_tuner/bin/mock_autotuner.py b/llvm/projects/pred_tuner/bin/mock_autotuner.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec12e1643ab319e0120f2e95c7801825f04484bb
--- /dev/null
+++ b/llvm/projects/pred_tuner/bin/mock_autotuner.py
@@ -0,0 +1,230 @@
+import gc
+import json
+import os
+from pathlib import Path
+from sys import argv
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple
+
+import matplotlib.pyplot as plt
+import numpy as np
+from tqdm import tqdm, trange
+
+from exp import Benchmark, bench_tuner_data
+from toolkit import ConfigT, LinearCombEstimator, LinearEstimator, \
+    LinearQoSEstimator, ModuleIndexer, NetApproxSelector, WeightedLinearCombEstimator
+from toolkit.estimators import WeightedLinearQoSEstimator
+from utils import config_pylogger, gpu_mem_mb, init_by_name, nn_to_accuracy, nn_to_output, qos_stats, tensor_to_accuracy
+
+msg_logger = config_pylogger(output_dir=Path('tuner_results/logs'), verbose=True)
+
+
+class Evaluator:
+    def __init__(
+            self, nas: NetApproxSelector, n_approx_layers: Optional[int],
+            n_configs: int, testloader, threshold: Optional[float]
+    ):
+        self.nas = nas
+        self.layer_approxes = nas.net_approxes
+        self.n_approx_layers = n_approx_layers
+        self.n_configs = n_configs
+        self.testloader = testloader
+        self.threshold = threshold
+        self.config_accs = None
+
+    def generate_random_configs(self) -> Iterator[ConfigT]:
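+        # Pick `n_approx_layers` layers (or a uniformly random count if None)
+        # and assign each a random knob from its candidate list.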
+        from numpy.random import choice
+        from random import randrange
+        all_layers = [k for k, ns in self.layer_approxes.items() if ns]
+        for _ in range(self.n_configs):
+            config = {}
+            if self.n_approx_layers is None:
+                n_approx_layers_ = randrange(len(all_layers) + 1)
+            else:
+                n_approx_layers_ = min(self.n_approx_layers, len(all_layers))
+            approx_layers = choice(all_layers, n_approx_layers_, replace=False)
+            for layer_idx in approx_layers:
+                config[layer_idx] = choice(self.layer_approxes[layer_idx], 1)[0]
+            yield config
+
+    def evaluate_config(self, config: ConfigT) -> Tuple[float, float]:
+        deterministic = self.nas.is_deterministic(config)
+        n_runs = 1 if deterministic else 30
+        approxed = self.nas.apply_approx_by_config(config).module
+        accs = []
+        for _ in trange(n_runs, leave=None):
+            acc = nn_to_accuracy(approxed, self.testloader)
+            accs.append(acc)
+        mean, confident_acc, _ = qos_stats(accs, 0.95)
+        return mean, confident_acc
+
+    def sort_configs_by_mean_acc(self):
+        sorted_ = sorted(self.config_accs, key=lambda p: p[1], reverse=True)
+        from itertools import takewhile
+        if self.threshold is not None:
+            sorted_ = list(takewhile(lambda p: p[1] > self.threshold, sorted_))
+        self.config_accs = np.array(sorted_)
+
+    @staticmethod
+    def calculate_perm_dist(pred_order):
+        n = len(pred_order)
+        actual_order = np.arange(n)
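+        # Normalize the total displacement by (n^2 - 1) / 3, the expected L1
+        # distance between a uniformly random permutation and the identity, so
+        # a value near 1.0 means the predicted order is no better than random.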
+        return np.linalg.norm(actual_order - pred_order, ord=1) / ((n ** 2 - 1) / 3)
+
+    def use_predictors(self, predictors: Iterable[LinearEstimator]) -> \
+            Optional[List[Tuple[np.ndarray, np.ndarray]]]:
+        self.sort_configs_by_mean_acc()
+        if len(self.config_accs) == 0:
+            return None
+        configs = self.config_accs[:, 0]
+        raw_prediction = []
+        for predictor in predictors:
+            # N * 2 array: avg acc, 95% confidence acc
+            pred_accs = np.array([
+                predictor.estimate(config) for config in configs
+            ])
+            pred_order = (-pred_accs[:, 0]).argsort(kind='stable')
+            raw_prediction.append((pred_accs, pred_order))
+        return raw_prediction
+
+    def run_configs(self):
+        configs = self.generate_random_configs()
+        pbar = tqdm(configs)
+        config_accs = []
+        for config in pbar:
+            pbar.set_postfix(mem=gpu_mem_mb())
+            mean_acc, confident_acc = self.evaluate_config(config)
+            config_accs.append([config, mean_acc, confident_acc])
+            gc.collect()
+        self.config_accs = np.array(config_accs)
+
+
+class NumpyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return json.JSONEncoder.default(self, obj)
+
+
+class DataPlotStorage:
+    def __init__(self, save_to_prefix: Path):
+        self.save_to = save_to_prefix
+        os.makedirs(self.save_to.parent, exist_ok=True)
+        self.args = []
+        self.fig, self.axes = plt.subplots()
+
+    def plot(self, *args, **kwargs):
+        self.args.append({'args': args, 'kwargs': kwargs})
+        self.axes.plot(*args, **kwargs)
+
+    def errorbar(self, *args, **kwargs):
+        self.args.append({'args': args, 'kwargs': kwargs})
+        self.axes.errorbar(*args, **kwargs)
+
+    def save_and_close(self):
+        self.fig.savefig(self.save_to.with_suffix('.png'), dpi=200)
+        with self.save_to.with_suffix('.json').open('w') as f:
+            json.dump(self.args, f, cls=NumpyEncoder)
+        plt.close(self.fig)
+
+
+def compare_estimators(
+        eva: Evaluator, predictors: Dict[str, LinearEstimator], n_runs: int, st: DataPlotStorage
+):
+    all_dists = []
+    for _ in trange(n_runs):
+        eva.run_configs()
+        raw_predictions = eva.use_predictors(predictors.values())
+        dists = [eva.calculate_perm_dist(order) for _, order in raw_predictions]
+        all_dists.append(dists)
+    dists_t = zip(*all_dists)
+    for vs, label in zip(dists_t, predictors.keys()):
+        st.plot(sorted(vs), label=label)
+    st.axes.set_ylim(bottom=0)
+    st.fig.legend()
+    st.save_and_close()
+
+
+def plot_acc_estm_discrepancy(
+        eva: Evaluator, predictors: Dict[str, LinearEstimator], st: DataPlotStorage
+):
+    eva.run_configs()
+    raw_predictions = eva.use_predictors(predictors.values())
+    if not raw_predictions:
+        return
+    measured_mean_accs = eva.config_accs[:, 1]
+    yerr = measured_mean_accs - eva.config_accs[:, 2]
+    st.errorbar(
+        measured_mean_accs, measured_mean_accs, fmt='.', yerr=yerr, uplims=True, label='baseline'
+    )
+    for (pred_accs, _), label in zip(raw_predictions, predictors.keys()):
+        yerr = pred_accs[:, 0] - pred_accs[:, 1]
+        st.errorbar(
+            measured_mean_accs, pred_accs[:, 0],
+            fmt='.', yerr=yerr, uplims=True, label=label
+        )
+    min_x, max_x = np.min(measured_mean_accs), np.max(measured_mean_accs)
+    diag_x = np.linspace(min_x, max_x, 500)
+    st.errorbar(diag_x, diag_x, linewidth=1)
+    st.axes.set_xlabel('Measured accuracy (%)')
+    st.axes.set_ylabel('Predicted accuracy (%)')
+    st.fig.legend()
+    st.save_and_close()
+
+
+def train_predictors(eva: Evaluator, *predictors: LinearEstimator):
+    for conf in eva.generate_random_configs():
+        for p in predictors:
+            p.estimate(conf)
+
+
+def main():
+    base_path = Path(argv[1]) if len(argv) > 1 else Path('results/mock_autotuner')
+
+    for network in (
+            'alexnet2_hpvm', 'vgg16_cifar10_hpvm', 'vgg16_cifar100_hpvm',
+            'mobilenet_hpvm',
+            'resnet18_hpvm',
+            'vgg16_imagenet_hpvm', 'resnet50_imagenet_hpvm'
+    ):
+        bench: Benchmark = bench_tuner_data[network]
+        print(f"{bench.model_name}: ")
+        baseline, testloader, _, shapes = init_by_name(bench.model_name)
+        baseline_dag = ModuleIndexer(baseline)
+        baseline_acc = nn_to_accuracy(baseline_dag.module, testloader)
+        nas = NetApproxSelector(baseline_dag)
+
+        def acc_crit(inputs_):
+            return tensor_to_accuracy(inputs_, testloader)
+
+        def threshold_eval(inputs_):
+            accs = np.array([acc_crit(x) for x in inputs_])
+            return baseline_acc - accs.mean() < 3.0
+
+        def run_model(net):
+            return nn_to_output(net, testloader)
+
+        f1 = LinearCombEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False)
+        f2 = LinearQoSEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False)
+        f3 = WeightedLinearCombEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False)
+        f4 = WeightedLinearQoSEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False)
+        LinearEstimator.coinit_estimators(
+            nas, run_model, threshold_eval, f1, f2, f3, f4,
+            storage=Path('model_params/pickles') / Path(bench.base_dir).name / 'proxy_dev.pkl'
+        )
+        train_predictors(Evaluator(nas, None, 700, testloader, baseline_acc), f3, f4)
+        st = DataPlotStorage(base_path / "cmp_acc_diff" / f"{bench.model_name}")
+        plot_acc_estm_discrepancy(
+            Evaluator(nas, None, 200, testloader, baseline_acc - 10),
+            {'f1': f1, 'f2': f2, 'f3': f3, 'f4': f4}, st
+        )
+        st = DataPlotStorage(base_path / 'cmp_ordering' / f"{bench.model_name}" / "n_none")
+        compare_estimators(
+            Evaluator(nas, None, 20, testloader, None),
+            {'f1': f1, 'f2': f2, 'f3': f3, 'f4': f4}, 10, st
+        )
+        gc.collect()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/bin/print_approxes.py b/llvm/projects/pred_tuner/bin/print_approxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..c95d080326ad2e806d772454c15bed68c573ca17
--- /dev/null
+++ b/llvm/projects/pred_tuner/bin/print_approxes.py
@@ -0,0 +1,35 @@
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn
+from tqdm import tqdm
+
+from models.domains import Accuracy
+from models import BaselineInfo
+from toolkit import NetApproxSelector
+from utils import device
+
+
+def main():
+    baseline_info = BaselineInfo.init_by_name('mobilenet_hpvm', device)
+    nas = NetApproxSelector(baseline_info.baseline_net, dev_time_only=True, ignore_fp32=False)
+    table = defaultdict(dict)
+    pbar = tqdm(nas.list_single_approxes())
+    for layer, approx, _ in pbar:
+        pbar.set_postfix(k=layer, i=approx)
+        approxed_net = nas.apply_approx_by_config({layer: approx}).module
+        acc: Accuracy = baseline_info.get_qos(approxed_net, baseline_info.val_loader)
+        table[layer][approx] = acc.to_scalar()
+    df = pd.DataFrame(
+        [pd.Series(list(d.values()), index=d.keys()) for d in table.values()],
+        index=list(table.keys())
+    )
+    with open('accuracy.json', 'w') as f:
+        df.to_json(f)
+    seaborn.heatmap(df.to_numpy())
+    plt.savefig('accuracy.png', dpi=200)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/bin/progress_graph.py b/llvm/projects/pred_tuner/bin/progress_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d7d0d5526f708e8049e3f185ebceebe68f4b778
--- /dev/null
+++ b/llvm/projects/pred_tuner/bin/progress_graph.py
@@ -0,0 +1,61 @@
+from itertools import groupby
+from operator import itemgetter
+from pathlib import Path
+from typing import Tuple
+
+import matplotlib.pyplot as plt
+
+from exp import Benchmark, ExpState, batch_id, bench_tuner_data
+from utils import Config
+
+
+def finalize_figs(filename, ax, fig):
+    ax.legend()
+    ax.set_ylim(bottom=1.0)
+    fig.savefig(filename, dpi=200)
+    plt.close(fig)
+
+
+def process_configs(bench: Benchmark, states: ExpState, shared_ax):
+    def get_features(c: Config) -> Tuple[int, int, float]:
+        *_, run_s, iter_s = c.fname.split('_')
+        return int(run_s), int(iter_s), c.speedup
+
+    def get_max_speedup(group):
+        group = sorted(list(group), key=itemgetter(1))
+        iter_max_speedup = []
+        max_speedup = 0
+        for _, i, speedup in group:
+            max_speedup = max(max_speedup, speedup)
+            iter_max_speedup.append((i, max_speedup))
+        return iter_max_speedup
+
+    run_iter_speedup = sorted(
+        [get_features(c) for c in states.all_configs.configs], key=itemgetter(0)
+    )
+    run_groups = groupby(run_iter_speedup, key=itemgetter(0))
+    fig, ax = plt.subplots()
+    for run, run_group in run_groups:
+        iter_max_speedup = get_max_speedup(run_group)
+        iters, max_speedups = zip(*iter_max_speedup)
+        ax.plot(iters, max_speedups, label=f"loss={run + 1}%")
+        if run + 1 == 3:
+            shared_ax.plot(iters, max_speedups, label=f"{bench.model_name.replace('_hpvm', '')}")
+    finalize_figs(bench.result_dir / "tuner_progress.png", ax, fig)
+
+
+def main():
+    fig, ax = plt.subplots()
+    for bench in bench_tuner_data.values():
+        bench: Benchmark
+        try:
+            states = ExpState(bench)
+        except ValueError:
+            print(f"Model {bench.model_name} has incomplete experiment data; skipping")
+            continue
+        process_configs(bench, states, ax)
+    finalize_figs(Path("results") / f"{batch_id}_tuner_progress.png", ax, fig)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/bin/train_model.py b/llvm/projects/pred_tuner/bin/train_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3d0d80725f5784c42ec8f6a26b65ff183df1649
--- /dev/null
+++ b/llvm/projects/pred_tuner/bin/train_model.py
@@ -0,0 +1,186 @@
+"""Train CIFAR10 with PyTorch."""
+import argparse
+import os
+from typing import List
+
+import numpy as np
+import torch
+from torch import optim
+from torch.nn import CrossEntropyLoss, Module
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+from tqdm import tqdm
+
+from models.torch import ResNet18
+from models.datasets import get_cifar10_train_dataloader, get_cifar10_test_dataloader
+from utils import device
+
+
+class RunningStats:
+    def __init__(self, criterion):
+        self.criterion = criterion
+        self.all_outputs = None
+        self.all_targets = np.zeros([0])
+        self.avg_loss, self.correct, self.total = 0, 0, 0
+        self.conf_mat = None
+        self.n_batches = 0
+
+    @property
+    def n_classes(self):
+        if self.all_outputs is None:
+            raise RuntimeError("Num of classes is unknown before seeing first input")
+        return self.all_outputs.shape[1]
+
+    def setup_for_first_output(self, outputs):
+        n_classes = outputs.shape[1]
+        self.all_outputs = np.zeros([0, n_classes])
+        self.conf_mat = np.zeros([n_classes, n_classes])
+
+    def add_output(self, outputs, targets):
+        if self.all_outputs is None:
+            self.setup_for_first_output(outputs)
+        loss = self.criterion(outputs, targets)
+        _, predicted = outputs.max(1)
+        self.avg_loss = (self.avg_loss * self.n_batches + loss.item()) / (self.n_batches + 1)
+        self.total += targets.size(0)
+        self.correct += predicted.eq(targets).sum().item()
+        for t, p in zip(targets, predicted):
+            self.conf_mat[int(t), p] += 1
+        self.n_batches += 1
+        outputs = outputs.clone().cpu().detach()
+        targets = targets.clone().cpu().detach()
+        self.all_outputs = np.vstack([self.all_outputs, outputs])
+        self.all_targets = np.hstack([self.all_targets, targets])
+        return loss
+
+    def classwise_outputs(self) -> List[np.ndarray]:
+        class_outputs = [np.zeros([0, self.n_classes]) for _ in range(self.n_classes)]
+        for output, label_class in zip(self.all_outputs, self.all_targets):
+            co = class_outputs[int(label_class)]
+            class_outputs[int(label_class)] = np.vstack([co, output])
+        return class_outputs
+
+    @property
+    def acc(self):
+        return 100. * self.correct / self.total
+
+    @property
+    def classwise_acc(self) -> List[float]:
+        return [self.conf_mat[i, i] / self.conf_mat[i].sum() for i in range(self.n_classes)]
+
+
+def test(net, testloader, criterion):
+    net.eval()
+    rs = RunningStats(criterion)
+    with torch.no_grad():
+        pbar = tqdm(enumerate(testloader), total=len(testloader))
+        for batch_idx, (inputs, targets) in pbar:
+            inputs, targets = inputs.to(device), targets.to(device)
+            outputs = net(inputs)
+            rs.add_output(outputs, targets)
+            pbar.set_postfix_str(
+                f"Loss: {rs.avg_loss:.3f} | Acc: {rs.acc:.3f}% ({rs.correct}/{rs.total})"
+            )
+    return rs
+
+
+def load_torch_checkpoint(net: Module, chpt_path: str):
+    print('==> Loading checkpoint..')
+    checkpoint = torch.load(chpt_path)
+    net.load_state_dict(checkpoint['net'])
+    start_epoch = checkpoint['epoch']
+    return start_epoch
+
+
+def get_optimizer(net, lr):
+    return optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
+
+
+class EarlyStopping:
+    """Early stops the training if validation loss doesn't improve after a given patience."""
+
+    def __init__(self, path, patience=7, delta=0):
+        """
+        Args:
+            patience (int): How long to wait after last time validation loss improved.
+                            Default: 7
+            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
+                            Default: 0
+            path (str): Path for the checkpoint to be saved to.
+                            Default: 'checkpoint.pt'
+        """
+        self.patience = patience
+        self.counter = 0
+        self.min_loss = None
+        self.delta = delta
+        self.path = path
+
+    def __call__(self, val_loss, model, epoch):
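+        # Returns True when training should stop: validation loss has not
+        # improved by more than `delta` for `patience` consecutive epochs.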
+        if self.min_loss is None or val_loss < self.min_loss - self.delta:
+            # Improved
+            self.min_loss = val_loss
+            self.save_checkpoint(model, epoch)
+            self.counter = 0
+        else:
+            self.counter += 1
+            if self.counter >= self.patience:
+                return True
+        return False
+
+    def save_checkpoint(self, model, epoch):
+        tqdm.write('Saving..')
+        state = {
+            'net': model.state_dict(),
+            'epoch': epoch,
+        }
+        if not os.path.isdir(os.path.dirname(self.path)):
+            os.makedirs(os.path.dirname(self.path))
+        torch.save(state, self.path)
+
+
+def train_one_epoch(net, trainloader, optimizer, criterion):
+    net.train()
+    rs = RunningStats(criterion)
+    pbar = tqdm(trainloader)
+    for inputs, targets in pbar:
+        optimizer.zero_grad()
+        inputs, targets = inputs.to(device), targets.to(device)
+        outputs = net(inputs)
+        loss = rs.add_output(outputs, targets)
+        loss.backward()
+        optimizer.step()
+        pbar.set_postfix_str(
+            f"Loss: {rs.avg_loss:.3f} | Acc: {rs.acc:.3f}% ({rs.correct}/{rs.total})"
+        )
+
+
+def train(net, checkpoint, output, lr):
+    start_epoch = load_torch_checkpoint(net, checkpoint) if checkpoint else 0
+    trainloader = get_cifar10_train_dataloader('./data', 128)
+    testloader = get_cifar10_test_dataloader('./data', 100)
+    criterion = CrossEntropyLoss()
+    optimizer = get_optimizer(net, lr)
+    es = EarlyStopping(output, patience=5)
+    reduce_lr = ReduceLROnPlateau(optimizer, factor=0.2, patience=3, verbose=True)
+    for epoch in range(start_epoch + 1, start_epoch + 200):
+        print('\nEpoch: %d' % epoch)
+        train_one_epoch(net, trainloader, optimizer, criterion)
+        rs = test(net, testloader, criterion)
+        if es(rs.avg_loss, net, epoch):
+            print(f"Early stopped at {epoch}")
+            break
+        reduce_lr.step(rs.avg_loss)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
+    parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
+    parser.add_argument('--resume', '-r', type=str, help='resume from checkpoint')
+    parser.add_argument(
+        '--output', '-o', type=str, required=True, help='path to save checkpoint to'
+    )
+    args = parser.parse_args()
+    train(ResNet18().to(device), args.resume, args.output, args.lr)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/exp.py b/llvm/projects/pred_tuner/exp.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7457d5b475d53f7a6c05fcea28f8b1cc4507c93
--- /dev/null
+++ b/llvm/projects/pred_tuner/exp.py
@@ -0,0 +1,438 @@
+import abc
+import json
+import os
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Tuple, Type
+
+from torch.nn import Linear, Module
+from torch.utils.data import DataLoader
+
+from models.domains import QoS, qos_stats
+from models.hpvm import HPVMConvBundle
+from models import BaselineInfo
+from toolkit import LinearEstimator, NetApproxSelector
+from utils import config_pylogger, get_knob_config_file, get_tensorrt_dir, device
+from utils.config import Config, dump_rt_format_to, load_configs_from_dir, plot_configs
+
+batch_id = "batch405"
+is_dev_time = False
+ConfigT = Dict[int, int]
+msg_logger = config_pylogger(output_dir=Path('tuner_results/logs'), verbose=True)
+
+
+def get_layer_desc(path: Path) -> List[List[str]]:
+    with path.open() as f:
+        return [x.split() for x in f]
+
+
+def get_layer_desc_in_pytorch(layer_desc: List[List[str]]) -> \
+        Tuple[List[Optional[Module]], Dict[int, int]]:
+    desc = []
+    remapping = {}
+    for ext_i, vals in enumerate(layer_desc):
+        if vals and 'conv' == vals[0]:
+            remapping[ext_i] = len(remapping)
+            desc.append(HPVMConvBundle)
+        elif vals and 'dense' == vals[0]:
+            remapping[ext_i] = len(remapping)
+            desc.append(Linear)
+        else:
+            desc.append(None)
+    return desc, remapping
+
+
+def read_cost_file(layer_desc: List[List[str]], path: Path) -> List[float]:
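+    # The cost file has one line per conv/dense layer (cf. the op_cost.txt
+    # files above); every other layer type is assigned zero cost.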
+    with path.open() as f:
+        raw_costs = [float(x.strip()) for x in f]
+    costs = []
+    raw_cost_it = 0
+    for layer in layer_desc:
+        if 'conv' in layer or 'dense' in layer:
+            costs.append(raw_costs[raw_cost_it])
+            raw_cost_it += 1
+        else:
+            costs.append(0)
+    assert len(layer_desc) == len(costs)
+    return costs
+
+
+def read_global_knobs_speedup(path: Path):
+    knobs_speedup = {}
+    with path.open() as f:
+        for x in f:
+            toks = x.split("\t")
+            ID = int(toks[0].split(",")[1])
+            speedup = float(toks[2])
+            knobs_speedup[ID] = speedup
+    return knobs_speedup
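+
+# A sketch of the knob-file line format this parser assumes (tab-separated;
+# only fields 0 and 2 are used, the line below is illustrative):
+#     "perf,135\t4,1,0\t1.33\t..."  ->  knobs_speedup[135] == 1.33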
+
+
+class Benchmark:
+    def __init__(self, json_data: dict):
+        self.json_data = json_data
+        self.model_name: str = self.model_name  # RHS comes from json_data via __getattr__ below
+        # Use baseline configuration as seed to aid the autotuner
+        # TODO: put this as a field in benchmarks.json
+        self.use_seed = self.model_name == 'resnet50_imagenet_hpvm'
+        tensorrt = get_tensorrt_dir()
+        self.cost_file = tensorrt / self.cost_file
+        self.layer_file = tensorrt / self.layer_file
+        self.knobs_config_file = tensorrt / "autotuner/data/global_knobs.txt"
+        self.batch_dir = tensorrt / self.base_dir / "loss_123" / batch_id
+        self.result_dir = self.batch_dir / ("dev_tuner" if is_dev_time else "inst_tuner")
+
+        self.layer_desc = get_layer_desc(self.layer_file)
+        self.pytorch_layer_desc, self.layer_remap = get_layer_desc_in_pytorch(self.layer_desc)
+        msg_logger.debug(f"HPVM order to neutral order remapping, model {self.model_name}: {self.layer_remap}")
+        self.layer_costs = read_cost_file(self.layer_desc, self.cost_file)
+        self.knobs_speedup = read_global_knobs_speedup(get_knob_config_file())
+
+    def set_batch_id(self, batch_id_: str = batch_id, is_dev_time_: bool = is_dev_time):
+        tensorrt = get_tensorrt_dir()
+        self.batch_dir = tensorrt / self.base_dir / "loss_123" / batch_id_
+        self.result_dir = self.batch_dir / ("dev_tuner" if is_dev_time_ else "inst_tuner")
+
+    def __getattr__(self, item: str):
+        return self.json_data[item]
+
+    def translate_config(self, autotuner: ConfigT) -> ConfigT:
+        ret = {}
+        for x, v in autotuner.items():
+            if x not in self.layer_remap:
+                assert v == 11
+                continue
+            ret[self.layer_remap[x]] = v
+        return ret
+
+    def get_baseline_config(self, is_fp16: bool) -> ConfigT:
+        conf = {}
+        for layer_id, layer in enumerate(self.pytorch_layer_desc):
+            knob = 12 if layer is not None and is_fp16 else 11
+            conf[layer_id] = knob
+        return conf
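+
+    # Illustration (hypothetical layer list): if pytorch_layer_desc were
+    # [HPVMConvBundle, None, Linear], get_baseline_config(is_fp16=True) would
+    # return {0: 12, 1: 11, 2: 12}; knob 12 appears to denote the fp16
+    # baseline for approximable layers and knob 11 the fp32 baseline.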
+
+    def pattern_match_layer_knobs(self, module_to_knobs: Dict[Module, List[int]]) -> Dict[int, List[int]]:
+        conv_knobs = [knobs for m, knobs in module_to_knobs.items() if isinstance(m, HPVMConvBundle)]
+        linear_knobs = [knobs for m, knobs in module_to_knobs.items() if isinstance(m, Linear)]
+        assert len(conv_knobs) + len(linear_knobs) == len(module_to_knobs)
+        conv_knobs_idx, linear_knobs_idx = 0, 0
+        ret = {}
+        for layer_id, module_ty in enumerate(self.pytorch_layer_desc):
+            if module_ty is HPVMConvBundle:
+                # PROMISE does not apply to first layer of LeNet.
+                if self.model_name == "lenet_hpvm" and layer_id == 0:
+                    this_conv_knobs = [x for x in conv_knobs[conv_knobs_idx] if x >= 11]
+                else:
+                    this_conv_knobs = conv_knobs[conv_knobs_idx]
+                ret[layer_id] = this_conv_knobs + [11]
+                conv_knobs_idx += 1
+            elif module_ty is Linear:
+                ret[layer_id] = linear_knobs[linear_knobs_idx] + [11]
+                linear_knobs_idx += 1
+            else:
+                ret[layer_id] = [11]
+        assert conv_knobs_idx == len(conv_knobs)
+        return ret
+
+    def compute_config_cost(self, cfg: ConfigT) -> Tuple[float, float]:
+        orig_cost = 0.0
+        total_cost = 0.0
+        for layer, knob in cfg.items():
+            op_cost = self.layer_costs[layer]
+            speedup = self.knobs_speedup[knob]
+            total_cost += (op_cost * 1.0 / speedup * 1.0)
+            orig_cost += op_cost
+        speedup = (orig_cost * 1.0) / (total_cost * 1.0)
+        return total_cost, speedup
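+
+    # Worked example with made-up numbers: for layer_costs = [100.0, 50.0]
+    # and knobs_speedup = {11: 1.0, 151: 3.0}, cfg = {0: 151, 1: 11} gives
+    # total_cost = 100/3 + 50/1 = 83.33 and speedup = 150/83.33 ~= 1.80.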
+
+    def get_n_layers(self) -> int:
+        return len(self.layer_desc)
+
+
+class ConfigMeasurer(BaselineInfo):
+    def __init__(
+            self, net: Module, val_loader: DataLoader, test_loader: DataLoader,
+            non_tensor_output: bool, qos_class: Type[QoS],
+            nas: NetApproxSelector, bench: Benchmark
+    ):
+        super().__init__(net, val_loader, test_loader, non_tensor_output, qos_class)
+        self.nas = nas
+        self.bench_translate_config = bench.translate_config
+        self.layer_remap = {k: v for k, v in enumerate(list(self.nas.net_approxes.keys()))}
+        msg_logger.debug(f"Neutral order to module scanning order remapping: {self.layer_remap}")
+        self.bench = bench
+        msg_logger.info(
+            f"Model {bench.model_name} baseline accuracy = "
+            f"{self.val_qos} ({self.test_qos} test)"
+        )
+
+    def translate_config(self, autotuner_cfg: ConfigT):
+        autotuner_cfg = self.bench_translate_config(autotuner_cfg)
+        # Translate layer index from autotuner format (0, 1, 2...)
+        # to proxy format (actual layer index)
+        cfg = {self.layer_remap[k]: v for k, v in autotuner_cfg.items() if v != 11}
+        return cfg
+
+    @classmethod
+    def init_from_bench(cls, bench: Benchmark) -> 'ConfigMeasurer':
+        bi = BaselineInfo.init_by_name(bench.model_name, device)
+        nas = NetApproxSelector(bi.baseline_net, dev_time_only=is_dev_time, ignore_fp32=not is_dev_time)
+        return cls(
+            bi.baseline_net, bi.val_loader, bi.test_loader,
+            bi.non_tensor_output, bi.qos_class, nas, bench
+        )
+
+    def proxy_estimate(self, cfg: ConfigT, proxy: LinearEstimator) -> Tuple[QoS, QoS]:
+        cfg = self.translate_config(cfg)
+        mean_acc, confident_acc = proxy.estimate(cfg)
+        return mean_acc, confident_acc
+
+    def actual_measure(
+            self, cfg: ConfigT, n_runs: int, is_test_set: bool, threshold: QoS = None
+    ) -> Tuple[QoS, Optional[float]]:
+        cfg = self.translate_config(cfg)
+        approx = self.nas.apply_approx_by_config(cfg).module
+        dataloader = self.test_loader if is_test_set else self.val_loader
+        from tqdm import trange
+        qoses = []
+        for _ in trange(n_runs, leave=None):
+            qoses.append(self.get_qos(approx, dataloader))
+        mean, _, confidence = qos_stats(qoses, threshold=threshold)
+        return mean, confidence
+
+    def get_knobs(self):
+        # Delaying computing knobs because nas can be modified externally (knobs filtered)
+        ext_layer_to_knobs = self.bench.pattern_match_layer_knobs(self.nas.get_layer_approxes())
+        msg_logger.debug(f"Getting knobs:")
+        for layer, knobs in ext_layer_to_knobs.items():
+            msg_logger.debug(f"  {layer}: {knobs}")
+        return ext_layer_to_knobs
+
+
+class PersistentState(abc.ABC):
+    def __init__(self):
+        self._substates: Dict[str, PersistentState] = {}
+
+    def __setattr__(self, name, value):
+        if isinstance(value, PersistentState):
+            self._substates[name] = value
+        super().__setattr__(name, value)
+
+    def dump(self):
+        self._dump_self()
+        for v in self._substates.values():
+            v.dump()
+
+    def load(self):
+        if self.filled():
+            return
+        try:
+            self._load_self()
+        except (ValueError, RuntimeError, FileNotFoundError) as e:
+            msg_logger.info(f"Exception {e} when loading state")
+        for k, v in self._substates.items():
+            v.load()
+
+    def filled(self):
+        return self._self_is_initialized() and all((v.filled() for v in self._substates.values()))
+
+    @abc.abstractmethod
+    def _dump_self(self):
+        pass
+
+    @abc.abstractmethod
+    def _load_self(self):
+        pass
+
+    @abc.abstractmethod
+    def _self_is_initialized(self) -> bool:
+        pass
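+
+# A minimal sketch of the subclass contract (hypothetical example): persist a
+# single flag and report whether it has been set.
+#
+#     class PersistentFlag(PersistentState):
+#         def __init__(self, path: Path):
+#             super().__init__()
+#             self.path, self.flag = path, None
+#
+#         def _dump_self(self):
+#             self.path.write_text(str(self.flag))
+#
+#         def _load_self(self):
+#             self.flag = self.path.read_text() == 'True'
+#
+#         def _self_is_initialized(self) -> bool:
+#             return self.flag is not None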
+
+
+class PersistentConfigs(PersistentState):
+    def __init__(self, bench: Benchmark, prefix: str, baseline_acc: QoS, rt_cpu: bool, rt_gpu: bool):
+        super().__init__()
+        self._data = []
+        self._filled = False
+        self.bench = bench
+        self.prefix = prefix
+        self.baseline_qos = baseline_acc
+        self.rt_cpu_path = self.bench.result_dir / f"{prefix}_cpu.txt" if rt_cpu else None
+        self.rt_gpu_path = self.bench.result_dir / f"{prefix}_fp16.txt" if rt_gpu else None
+
+    @property
+    def config_folder(self) -> Path:
+        return self.bench.result_dir / self.prefix
+
+    @property
+    def configs(self) -> List[Config]:
+        return self._data
+
+    def _load_self(self):
+        # Try reading autotuner configs and hpvm-rt configs
+        self._data = load_configs_from_dir(self.config_folder, self.baseline_qos)
+        # If hpvm-rt is not present, dump it.
+        # TODO: check rt format integrity
+        if (
+                (self.rt_cpu_path and not self.rt_cpu_path.is_file()) or
+                (self.rt_gpu_path and not self.rt_gpu_path.is_file())
+        ):
+            self.finalize_dump()
+        self._filled = True
+
+    def _dump_self(self):
+        for conf in self._data:
+            self._dump_one(conf)
+        self.finalize_dump()
+
+    def _self_is_initialized(self) -> bool:
+        return self._filled
+
+    def _dump_one(self, config: Config):
+        if not self.config_folder.is_dir():
+            os.mkdir(self.config_folder.as_posix())
+        config_path = self.config_folder / config.fname
+        with config_path.open('w') as f:
+            f.write(config.to_tuner_format())
+
+    def append(self, config: Config):
+        self._data.append(config)
+        self._dump_one(config)
+
+    def extend(self, configs: Iterable[Config]):
+        confs = []
+        for conf in configs:
+            self._dump_one(conf)
+            confs.append(conf)
+        self._data.extend(confs)
+
+    def finalize_dump(self, with_configs: Iterable[Config] = None):
+        if with_configs is not None:
+            self.extend(with_configs)
+        self._filled = True
+        dump_rt_format_to(
+            self.bench.layer_desc, self._data, self.baseline_qos,
+            self.rt_cpu_path, self.rt_gpu_path
+        )
+
+
+class TuningTime(PersistentState):
+    def __init__(self, path: Path):
+        super().__init__()
+        self.timers = {}
+        self.path = path
+
+    def _load_self(self):
+        import re
+        with self.path.open() as f:
+            lines = f.readlines()
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
+            match = re.match(r'Timer ([^=]+) = ([0-9.]+) hours', line)
+            if not match:
+                raise RuntimeError(f"File {self.path} malformed")
+            # The file stores hours; keep in-memory timers in seconds,
+            # matching what add_timer receives.
+            self.timers[match.group(1).strip()] = float(match.group(2)) * 60 * 60
+
+    def _dump_self(self):
+        for k, v in self.timers.items():
+            self._dump_one(k, v)
+
+    def _self_is_initialized(self) -> bool:
+        return bool(self.timers)
+
+    def _dump_one(self, key: str, value: float):
+        time_hrs = value / (60 * 60)
+        msg_logger.info(f"Timer {key} = {time_hrs:.3f} hours")
+        with self.path.open('a') as f:
+            f.write(f"Timer {key} = {time_hrs} hours\n")
+
+    def add_timer(self, key: str, value: float):
+        self.timers[key] = value
+        self._dump_one(key, value)
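+
+    # Round-trip example: add_timer('tuning', 7200.0) appends the line
+    # "Timer tuning = 2.0 hours"; _load_self parses it back to 7200.0 seconds.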
+
+
+class AccPair(PersistentState):
+    def __init__(self, path: Path, qos_class: Type[QoS]):
+        super().__init__()
+        self.path = path
+        self.qos_class = qos_class
+        self._data = None
+
+    @property
+    def accs(self) -> Tuple[QoS, QoS]:
+        if self._data is None:
+            raise AttributeError("Accuracy not init'ed yet")
+        return self._data
+
+    @accs.setter
+    def accs(self, value: Tuple[QoS, QoS]):
+        self._data = value
+        self._dump_self()
+
+    def _load_self(self):
+        with self.path.open() as f:
+            acc_val, acc_test = [self.qos_class.parse(s) for s in f.read().split('\n')]
+        self._data = acc_val, acc_test
+
+    def _dump_self(self):
+        with self.path.open('w') as f:
+            f.write(f"{self._data[0]}\n{self._data[1]}")
+
+    def _self_is_initialized(self) -> bool:
+        return self._data is not None
+
+
+class ExpState(PersistentState):
+    def __init__(self, bench: Benchmark, qos_class: Type[QoS], accs: Tuple[QoS, QoS] = None):
+        super().__init__()
+        self.bench = bench
+        self.baseline_accs = AccPair(bench.result_dir / 'baseline_acc.txt', qos_class)
+        self.baseline_accs.load()
+        if not self.baseline_accs.filled():
+            if accs is None:
+                raise ValueError("Provide model baseline accuracy")
+            self.baseline_accs.accs = accs
+        acc_val, acc_test = self.baseline_accs.accs
+        self.all_configs = PersistentConfigs(bench, 'all', acc_val, False, False)
+        self.filtered_configs = PersistentConfigs(bench, 'filtered', acc_val, False, False)
+        self.validated_configs = PersistentConfigs(bench, 'validated', acc_val, False, False)
+        self.tested_configs = PersistentConfigs(bench, 'tested', acc_test, False, False)
+        self.valid_configs = PersistentConfigs(bench, 'valid', acc_val, True, True)
+        self.test_configs = PersistentConfigs(bench, 'test', acc_test, True, True)
+        self.timers = TuningTime(bench.result_dir / 'tuning_time.txt')
+        super().load()
+
+    def _load_self(self):
+        pass
+
+    def _dump_self(self):
+        pass
+
+    def _self_is_initialized(self) -> bool:
+        return True
+
+    def finalize_plot(self):
+        if not self.filled():
+            raise RuntimeError("Cannot finalize before data slots are all filled")
+        plot_configs(
+            self.bench.result_dir / "all_plot.png",
+            all=self.all_configs.configs
+        )
+        plot_configs(
+            self.bench.result_dir / "validated_tested_plot.png",
+            filtered=self.filtered_configs.configs,
+            validated=self.validated_configs.configs,
+            tested=self.tested_configs.configs
+        )
+        plot_configs(
+            self.bench.result_dir / "filtered_plot.png",
+            valid=self.valid_configs.configs,
+            test=self.test_configs.configs
+        )
+
+
+with (Path(__file__).parent / 'utils/benchmarks.json').open() as f_:
+    benchmark_data = json.load(f_)
+bench_tuner_data = {k: Benchmark(v) for k, v in benchmark_data.items()}
diff --git a/llvm/projects/pred_tuner/model_params b/llvm/projects/pred_tuner/model_params
new file mode 120000
index 0000000000000000000000000000000000000000..90aaa403fdbec5110e1c02431a7df3f31fed0dbf
--- /dev/null
+++ b/llvm/projects/pred_tuner/model_params
@@ -0,0 +1 @@
+../hpvm-tensor-rt/model_params
\ No newline at end of file
diff --git a/llvm/projects/pred_tuner/models/__init__.py b/llvm/projects/pred_tuner/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..192f4b5bea17503603ba8f1208a22cea78af2897
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/__init__.py
@@ -0,0 +1,3 @@
+from .networks import networks
+from .inference import get_all_output, move_to_device_recursively, BaselineInfo
+from .domains import QoS
diff --git a/llvm/projects/pred_tuner/models/datasets/__init__.py b/llvm/projects/pred_tuner/models/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a1e35fcea0e29482abbace082f825aac6c8d608
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/datasets/__init__.py
@@ -0,0 +1,2 @@
+from .hpvm import CIFAR, CIFARImage, HPVMDataset, ImageNet, MNIST
+from .torch import get_cifar10_test_dataset, get_cifar10_test_dataloader, get_cifar10_train_dataloader
diff --git a/llvm/projects/pred_tuner/models/datasets/hpvm.py b/llvm/projects/pred_tuner/models/datasets/hpvm.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa871d89d85493a0c8ad1237ed9e5e8b0b34ac49
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/datasets/hpvm.py
@@ -0,0 +1,163 @@
+import logging
+from pathlib import Path
+from typing import Iterator, List, Tuple, TypeVar
+
+import numpy as np
+import torch
+from torch.utils.data.dataset import IterableDataset
+
+from models.hpvm import read_tensor_from_file
+
+RetT = Tuple[torch.Tensor, torch.Tensor]
+T = TypeVar('T', bound='HPVMDataset')
+msg_logger = logging.getLogger()
+
+
+class HPVMDataset(IterableDataset):
+    def __init__(self, inputs: torch.Tensor, outputs: torch.Tensor):
+        self.inputs, self.outputs = inputs, outputs
+
+    @classmethod
+    def from_file(cls, *args, **kwargs):
+        raise NotImplementedError("Subclasses define their own from_file")
+
+    @property
+    def sample_input(self):
+        inputs, outputs = next(iter(self))
+        return inputs
+
+    def __len__(self) -> int:
+        return len(self.inputs)
+
+    def __getitem__(self, idx) -> RetT:
+        if idx >= len(self):
+            raise IndexError("Dataset index out of range")
+        return self.inputs[idx], self.outputs[idx]
+
+    def __iter__(self) -> Iterator[RetT]:
+        for i in range(len(self)):
+            yield self[i]
+
+
+class HPVMDNNDataset(HPVMDataset):
+    @classmethod
+    def _from_file(
+            cls, input_file: Path, labels_file: Path, is_uint8_label: bool,
+            count: int, offset: int, *item_shapes: int
+    ):
+        # NOTE: assuming (N, *) ordering of inputs (such as NCHW, NHWC)
+        channel_size = np.prod(np.array(item_shapes))
+        input_count = count if count == -1 else count * channel_size
+        input_offset = offset * channel_size
+        inputs = read_tensor_from_file(
+            input_file, -1, *item_shapes, count=input_count, offset=input_offset,
+            use_progress_bar=True
+        )
+        # Labels hold one scalar per entry, so count/offset are used unscaled.
+        label_read_ty = np.uint8 if is_uint8_label else np.int32
+        labels = read_tensor_from_file(
+            labels_file, -1, read_ty=label_read_ty, cast_ty=np.int64,
+            count=count, offset=offset
+        )
+        if inputs.size(0) != labels.size(0):
+            raise ValueError("Input and output have different number of data points")
+        msg_logger.info(f"{inputs.shape[0]} entries loaded from dataset.")
+        return cls(inputs, labels)
+
+    @classmethod
+    def from_default_file(cls, prefix: str):
+        prefix = Path(prefix)
+        return cls.from_file(prefix / 'input.bin', prefix / 'labels.bin')
+
+
+class MNIST(HPVMDNNDataset):
+    @classmethod
+    def from_file(
+            cls, input_file: Path, labels_file: Path, count: int = -1, offset: int = 0
+    ):
+        return cls._from_file(
+            input_file, labels_file, True, count, offset, 1, 28, 28
+        )
+
+
+class CIFAR(HPVMDNNDataset):
+    @classmethod
+    def from_file(
+            cls, input_file: Path, labels_file: Path, count: int = -1, offset: int = 0
+    ):
+        return cls._from_file(
+            input_file, labels_file, True, count, offset, 3, 32, 32
+        )
+
+
+class ImageNet(HPVMDNNDataset):
+    @classmethod
+    def from_file(
+            cls, input_file: Path, labels_file: Path, count: int = -1, offset: int = 0
+    ):
+        return cls._from_file(
+            input_file, labels_file, False, count, offset, 3, 224, 224
+        )
+
+
+class HPVMImageDataset(HPVMDataset):
+    @classmethod
+    def _from_file(
+            cls, input_file: Path, output_file: Path,
+            count: int, offset: int, input_shape: List[int], output_shape: List[int]
+    ):
+        # NOTE: assuming (N, *) ordering of inputs (such as NCHW, NHWC)
+        in_channel_size = np.prod(np.array(input_shape))
+        out_channel_size = np.prod(np.array(output_shape))
+        inputs = read_tensor_from_file(
+            input_file, -1, *input_shape,
+            count=count if count == -1 else count * in_channel_size,
+            offset=offset * in_channel_size, use_progress_bar=True
+        )
+        # Outputs may have a different per-entry size, so scale separately.
+        outputs = read_tensor_from_file(
+            output_file, -1, *output_shape,
+            count=count if count == -1 else count * out_channel_size,
+            offset=offset * out_channel_size, use_progress_bar=True
+        )
+        msg_logger.info(
+            f"(input={inputs.shape[0]}, output={outputs.shape[0]}) entries loaded from dataset."
+        )
+        return cls(inputs, outputs)
+
+    @classmethod
+    def from_default_file(cls, prefix: str):
+        prefix = Path(prefix)
+        return cls.from_file(
+            prefix / 'input.bin', prefix / 'canny_input.bin',
+            prefix / 'labels.bin', prefix / 'output.bin'
+        )
+
+
+class CIFARImage(HPVMImageDataset):
+    def __init__(
+            self, inputs: torch.Tensor, outputs: torch.Tensor, cifar: CIFAR
+    ):
+        super().__init__(inputs, outputs)
+        self.cifar = cifar
+
+    @classmethod
+    def from_file(
+            cls, dnn_input_file: Path, image_input_file: Path,
+            labels_file: Path, output_file: Path,
+            batch_size: int = 100, count: int = -1, offset: int = 0
+    ):
+        classifier = CIFAR.from_file(dnn_input_file, labels_file)
+        dataset = HPVMImageDataset._from_file(
+            image_input_file, output_file, count, offset,
+            [3, 128, 128], [1, 128, 128]
+        )
+        return cls(dataset.inputs, dataset.outputs, classifier)
+
+    def sample(self: 'CIFARImage', ratio: float) -> 'CIFARImage':
+        raise NotImplementedError()
+
+    def __getitem__(self, idx):
+        if idx >= len(self):
+            raise IndexError("Dataset index out of range")
+        cifar_in, cifar_out = self.cifar[idx]
+        return (cifar_in, self.inputs[idx]), (cifar_out, self.outputs[idx])
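+
+
+# Usage sketch (hypothetical data directory laid out as expected by
+# from_default_file, i.e. containing input.bin and labels.bin):
+#
+#     dataset = CIFAR.from_default_file('model_params/alexnet2_cifar10/')
+#     image, label = dataset[0]
+#     assert image.shape == (3, 32, 32)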
diff --git a/llvm/projects/pred_tuner/models/datasets/torch.py b/llvm/projects/pred_tuner/models/datasets/torch.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b07bd17c744df733158dc5d84da3f1934e7cd3c
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/datasets/torch.py
@@ -0,0 +1,37 @@
+import logging
+
+from torch.utils.data import DataLoader
+from torchvision.datasets import CIFAR10
+from torchvision.transforms import transforms
+
+msg_logger = logging.getLogger()
+
+
+def get_cifar10_train_dataloader(root: str, batchsize: int) -> DataLoader:
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(32, padding=4),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    dl = DataLoader(
+        CIFAR10(root=root, train=True, download=True, transform=transform_train),
+        batch_size=batchsize, shuffle=True
+    )
+    msg_logger.info(f"{len(dl)} entries loaded from training dataset.")
+    return dl
+
+
+def get_cifar10_test_dataset(root: str) -> CIFAR10:
+    transform_test = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+    ])
+    dataset = CIFAR10(root=root, train=False, download=True, transform=transform_test)
+    msg_logger.info(f"{len(dataset)} entries loaded from training dataset.")
+    return dataset
+
+
+def get_cifar10_test_dataloader(root: str, batchsize: int) -> DataLoader:
+    dl = DataLoader(get_cifar10_test_dataset(root), batch_size=batchsize)
+    return dl
diff --git a/llvm/projects/pred_tuner/models/domains/__init__.py b/llvm/projects/pred_tuner/models/domains/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abe6c13a378fe61f9dee7b1c7a60950c1a58226a
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/domains/__init__.py
@@ -0,0 +1 @@
+from .qoses import QoS, Accuracy, qos_stats
diff --git a/llvm/projects/pred_tuner/models/domains/qoses.py b/llvm/projects/pred_tuner/models/domains/qoses.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a1e7f2eb1050f5adcc4e25d7b65100e3141ae8a
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/domains/qoses.py
@@ -0,0 +1,317 @@
+import abc
+from typing import Iterable, List, Optional, Tuple
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+
+
+class QoS(abc.ABC):
+    @abc.abstractmethod
+    def __sub__(self, other: 'QoS') -> 'QoS':
+        pass
+
+    @abc.abstractmethod
+    def __add__(self, other: 'QoS') -> 'QoS':
+        pass
+
+    @abc.abstractmethod
+    def __truediv__(self, other: float) -> 'QoS':
+        pass
+
+    @abc.abstractmethod
+    def __lt__(self, other: 'QoS') -> bool:
+        pass
+
+    @abc.abstractmethod
+    def __eq__(self, other: 'QoS') -> bool:
+        pass
+
+    def __gt__(self, other: 'QoS') -> bool:
+        return not self <= other
+
+    def __le__(self, other: 'QoS') -> bool:
+        return self < other or self == other
+
+    def __ge__(self, other: 'QoS') -> bool:
+        return not self < other
+
+    @abc.abstractmethod
+    def __hash__(self):
+        pass
+
+    @abc.abstractmethod
+    def __repr__(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def to_scalar(self, relative_to=None) -> float:
+        pass
+
+    @abc.abstractmethod
+    def numpy(self) -> np.ndarray:
+        pass
+
+    @abc.abstractmethod
+    def null(self) -> 'QoS':
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def parse(string: str) -> 'QoS':
+        pass
+
+    @abc.abstractmethod
+    def min_positive_loss(self) -> 'QoS':
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def suggested_tuner_thresholds(baseline: 'QoS') -> List['QoS']:
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def suggested_val_threshold(baseline: 'QoS') -> 'QoS':
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def suggested_test_threshold(baseline: 'QoS') -> 'QoS':
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def from_output(output, ground_truth) -> 'QoS':
+        pass
+
+    @classmethod
+    def combine_qoses(cls, qoses: Iterable['QoS']) -> 'QoS':
+        # Materialize the iterable first; np.array over a generator would
+        # produce a 0-d object array instead of one element per QoS.
+        qoses = np.array(list(qoses))
+        return qoses.mean()
+
+    @classmethod
+    def from_all_output(cls, outputs: List, dataloader: DataLoader) -> 'QoS':
+        if not outputs:
+            raise ValueError("Empty output has no QoS value")  # Probably can result cls.null()
+        qoses = []
+        for (_, gt_output), output in zip(dataloader, outputs):
+            qoses.append(cls.from_output(output, gt_output))
+        return cls.combine_qoses(qoses)
+
+
+class ScalarQoS(QoS, abc.ABC):
+    def __init__(self, value: float):
+        self.value = value
+
+    def __sub__(self, other: 'ScalarQoS') -> 'ScalarQoS':
+        return self.__class__(self.value - other.value)
+
+    def __add__(self, other: 'ScalarQoS') -> 'ScalarQoS':
+        return self.__class__(self.value + other.value)
+
+    def __truediv__(self, other: float):
+        return self.__class__(self.value / other)
+
+    def __lt__(self, other: 'ScalarQoS') -> bool:
+        return self.value < other.value
+
+    def __eq__(self, other: 'ScalarQoS') -> bool:
+        return self.value == other.value
+
+    def __hash__(self):
+        return hash(self.value)
+
+    def __repr__(self) -> str:
+        return repr(self.value)
+
+    def null(self) -> 'ScalarQoS':
+        return self.__class__(0.0)
+
+    def to_scalar(self, relative_to=None) -> float:
+        return self.value
+
+    def numpy(self) -> np.ndarray:
+        return np.array([self.value])
+
+    @classmethod
+    def parse(cls, string: str) -> 'ScalarQoS':
+        return cls(float(string))
+
+
+class Accuracy(ScalarQoS):
+    def __init__(self, accuracy: float):
+        super().__init__(accuracy)
+
+    def min_positive_loss(self) -> 'Accuracy':
+        return Accuracy(0.05) if self.value < 0 else self
+
+    @staticmethod
+    def suggested_tuner_thresholds(baseline: 'Accuracy') -> List['Accuracy']:
+        return [baseline - Accuracy(0.8), baseline - Accuracy(1.5), baseline - Accuracy(2.1)]
+
+    @staticmethod
+    def suggested_val_threshold(baseline: 'Accuracy') -> 'Accuracy':
+        return baseline - Accuracy(2.1)
+
+    @staticmethod
+    def suggested_test_threshold(baseline: 'Accuracy') -> 'Accuracy':
+        return baseline - Accuracy(3.0)
+
+    @staticmethod
+    def from_output(output: torch.Tensor, ground_truth: torch.Tensor) -> 'Accuracy':
+        ground_truth = ground_truth.to(output.device)
+        correct = output.argmax(dim=1).eq(ground_truth).sum().item()
+        acc = correct / ground_truth.shape[0]
+        return Accuracy(acc * 100)
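+
+    # Worked example: for logits [[0.1, 0.9], [0.8, 0.2]] and ground truth
+    # [1, 0] both row argmaxes match, so from_output returns Accuracy(100.0).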
+
+
+class PSNR(ScalarQoS):
+    artificial_max = 100
+
+    def __init__(self, psnr: float):
+        super().__init__(psnr)
+
+    def min_positive_loss(self) -> 'PSNR':
+        return PSNR(1) if self.value < 0 else self
+
+    @staticmethod
+    def suggested_tuner_thresholds(baseline: 'PSNR') -> List['PSNR']:
+        return [PSNR(30), PSNR(25), PSNR(20)]
+
+    @staticmethod
+    def suggested_val_threshold(baseline: 'PSNR') -> 'PSNR':
+        return PSNR(20)
+
+    @staticmethod
+    def suggested_test_threshold(baseline: 'PSNR') -> 'PSNR':
+        return PSNR(20)
+
+    @staticmethod
+    def from_output(output: torch.Tensor, ground_truth: torch.Tensor) -> 'PSNR':
+        ground_truth = ground_truth.to(output.device)
+        if ground_truth.shape[0] != 0:
+            max_i = ground_truth.max()
+            mse = torch.sum((output - ground_truth) ** 2) / output.nelement()
+            psnr = (20 * torch.log10(max_i) - 10 * torch.log10(mse)).item()
+        else:
+            psnr = PSNR.artificial_max
+        return PSNR(psnr)
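+
+    # This follows the standard definition PSNR = 20*log10(MAX_I)
+    # - 10*log10(MSE), with MAX_I taken as the ground-truth maximum; an
+    # empty batch falls back to artificial_max.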
+
+
+class MultiQoS(QoS, abc.ABC):
+    def __init__(self, *qoses: ScalarQoS):
+        self.qoses = qoses
+
+    def __sub__(self, other: 'MultiQoS') -> 'MultiQoS':
+        assert type(self) == type(other)
+        return self.__class__(*(x - y for x, y in zip(self.qoses, other.qoses)))
+
+    def __add__(self, other: 'MultiQoS') -> 'MultiQoS':
+        assert type(self) == type(other)
+        return self.__class__(*(x + y for x, y in zip(self.qoses, other.qoses)))
+
+    def __truediv__(self, other: float):
+        return self.__class__(*(x / other for x in self.qoses))
+
+    def __lt__(self, other: 'MultiQoS') -> bool:
+        assert type(self) == type(other)
+        return all((x < y for x, y in zip(self.qoses, other.qoses)))
+
+    def __eq__(self, other: 'MultiQoS') -> bool:
+        assert type(self) == type(other)
+        return all((x == y for x, y in zip(self.qoses, other.qoses)))
+
+    def __hash__(self):
+        return hash(self.qoses)
+
+    def __repr__(self) -> str:
+        return ','.join(repr(q) for q in self.qoses)
+
+    def null(self) -> 'MultiQoS':
+        # Use self.__class__: MultiQoS itself is abstract and cannot be instantiated.
+        return self.__class__(*(q.null() for q in self.qoses))
+
+    def numpy(self) -> np.ndarray:
+        return np.array([q.to_scalar() for q in self.qoses])
+
+    def min_positive_loss(self) -> 'MultiQoS':
+        return self.__class__(*(q.min_positive_loss() for q in self.qoses))
+
+
+PairT = Tuple[torch.Tensor, torch.Tensor]
+TripleT = Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+
+
+class AccuracyPSNR(MultiQoS):
+    def __init__(self, acc: Accuracy, psnr: PSNR):
+        super().__init__(acc, psnr)
+
+    def to_scalar(self, relative_to: 'AccuracyPSNR' = None) -> float:
+        acc, psnr = self.qoses
+        if relative_to is not None:
+            thres_acc, thres_psnr = relative_to.qoses
+            punishment = (-1 if acc < thres_acc else 0) + (-1 if psnr < thres_psnr else 0)
+        else:
+            punishment = 0
+        max_psnr = PSNR.artificial_max
+        normed_psnr = min(psnr.value, max_psnr) / max_psnr  # [0, 1], higher better
+        acc = acc.value / 100  # [0, 1], higher better
+        combined = (acc + normed_psnr) / 2  # [0, 1], higher better
+        assert 0 <= combined <= 1
+        return combined + punishment
+
+    @staticmethod
+    def parse(string: str) -> 'AccuracyPSNR':
+        acc, psnr = string.split(',')
+        return AccuracyPSNR(Accuracy.parse(acc), PSNR.parse(psnr))
+
+    # noinspection PyTypeChecker
+    @staticmethod
+    def suggested_tuner_thresholds(baseline: 'AccuracyPSNR') -> List['AccuracyPSNR']:
+        ret = []
+        for acc in Accuracy.suggested_tuner_thresholds(baseline.qoses[0]):
+            for psnr in PSNR.suggested_tuner_thresholds(baseline.qoses[1]):
+                ret.append(AccuracyPSNR(acc, psnr))
+        return ret
+
+    # noinspection PyTypeChecker
+    @staticmethod
+    def suggested_val_threshold(baseline: 'AccuracyPSNR') -> 'AccuracyPSNR':
+        return AccuracyPSNR(
+            Accuracy.suggested_val_threshold(baseline.qoses[0]),
+            PSNR.suggested_val_threshold(baseline.qoses[1])
+        )
+
+    # noinspection PyTypeChecker
+    @staticmethod
+    def suggested_test_threshold(baseline: 'AccuracyPSNR') -> 'AccuracyPSNR':
+        return AccuracyPSNR(
+            Accuracy.suggested_test_threshold(baseline.qoses[0]),
+            PSNR.suggested_test_threshold(baseline.qoses[1])
+        )
+
+    @staticmethod
+    def from_output(output: TripleT, ground_truth: PairT) -> 'AccuracyPSNR':
+        gt_labels, gt_images = ground_truth
+        labels, image_selection, images = output
+        gt_labels = gt_labels.to(labels.device)
+        gt_images = gt_images.to(images.device)
+        acc = Accuracy.from_output(labels, gt_labels)
+        gt_images = gt_images[image_selection]
+        psnr = PSNR.from_output(images, gt_images)
+        return AccuracyPSNR(acc, psnr)
+
+
+def qos_stats(qoses: List[QoS], confidence: float = None, threshold: QoS = None) -> \
+        Tuple[QoS, Optional[QoS], Optional[float]]:
+    qoses = np.array(qoses)
+    n_runs = len(qoses)
+    confidence_at_thres = np.count_nonzero(qoses > threshold) / n_runs if threshold is not None else None
+    if confidence is None:
+        qos_at_confidence = None
+    else:
+        # Sort ascending so the index below picks the QoS achieved in at
+        # least `confidence` fraction of the runs.
+        index = int((1 - confidence) * n_runs)
+        # Index the object array directly; converting to a scalar makes it
+        # np.float64 and causes trouble with opentuner.
+        qos_at_confidence = np.sort(qoses)[index]
+    mean_acc = qoses.mean()
+    return mean_acc, qos_at_confidence, confidence_at_thres
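+
+
+# Usage sketch (hypothetical values): for
+#     qoses = [Accuracy(84.0), Accuracy(85.0), Accuracy(86.0)]
+# qos_stats(qoses, confidence=0.95, threshold=Accuracy(84.5)) returns
+# (Accuracy(85.0), Accuracy(84.0), 2/3): the mean, the sorted value at index
+# int(0.05 * 3) == 0, and the fraction of runs above the threshold.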
diff --git a/llvm/projects/pred_tuner/models/hpvm/__init__.py b/llvm/projects/pred_tuner/models/hpvm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..337738c0bf41002f910acfb98b9e8073ebc10052
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/hpvm/__init__.py
@@ -0,0 +1,7 @@
+from .alexnet import AlexNet, AlexNet2, AlexNetImageNet
+from .alexnet_canny import AlexNet2Canny
+from .layers import HPVMConvBundle, HPVMDNN, HPVMDefaultModule, read_tensor_from_file
+from .lenet import LeNet
+from .mobilenet import MobileNet
+from .resnet import ResNet18, ResNet50
+from .vgg16 import VGG16Cifar10, VGG16Cifar100, VGG16ImageNet
diff --git a/llvm/projects/pred_tuner/models/hpvm/alexnet.py b/llvm/projects/pred_tuner/models/hpvm/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7c9b6c3cae1e86ac699913b3f1d09af28c52705
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/hpvm/alexnet.py
@@ -0,0 +1,49 @@
+from torch.nn import Linear, ReLU, Sequential, Tanh
+
+from .layers import HPVMConvBundle, HPVMDNN
+
+
+class AlexNet(HPVMDNN):
+    def __init__(self):
+        convs = Sequential(
+            HPVMConvBundle(3, 64, 11, Tanh, pool_size=2, padding=5),
+            HPVMConvBundle(64, 192, 5, Tanh, pool_size=2, padding=2),
+            HPVMConvBundle(192, 384, 3, Tanh, padding=1),
+            HPVMConvBundle(384, 256, 3, Tanh, padding=1),
+            HPVMConvBundle(256, 256, 3, Tanh, pool_size=2, padding=1)
+        )
+        linears = Sequential(Linear(4096, 10))
+        super().__init__(convs, linears)
+
+
+class AlexNet2(HPVMDNN):
+    def __init__(self):
+        convs = Sequential(
+            HPVMConvBundle(3, 32, 3, Tanh, padding=1),
+            HPVMConvBundle(32, 32, 3, Tanh, pool_size=2, padding=1),
+            HPVMConvBundle(32, 64, 3, Tanh, padding=1),
+            HPVMConvBundle(64, 64, 3, Tanh, pool_size=2, padding=1),
+            HPVMConvBundle(64, 128, 3, Tanh, padding=1),
+            HPVMConvBundle(128, 128, 3, Tanh, pool_size=2, padding=1)
+        )
+        linears = Sequential(Linear(2048, 10))
+        super().__init__(convs, linears)
+
+
+class AlexNetImageNet(HPVMDNN):
+    def __init__(self):
+        convs = Sequential(
+            HPVMConvBundle(3, 64, 11, ReLU, padding=2, stride=4, pool_size=3, pool_stride=2),
+            HPVMConvBundle(64, 192, 5, ReLU, padding=2, pool_size=3, pool_stride=2),
+            HPVMConvBundle(192, 384, 3, ReLU, padding=1),
+            HPVMConvBundle(384, 256, 3, ReLU, padding=1),
+            HPVMConvBundle(256, 256, 3, ReLU, padding=1, pool_size=3, pool_stride=2)
+        )
+        linears = Sequential(
+            Linear(9216, 4096),
+            ReLU(),
+            Linear(4096, 4096),
+            ReLU(),
+            Linear(4096, 1000),
+        )
+        super().__init__(convs, linears)
diff --git a/llvm/projects/pred_tuner/models/hpvm/alexnet_canny.py b/llvm/projects/pred_tuner/models/hpvm/alexnet_canny.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e610279121a5b368f4cdf64b72e0a2d6fe9289a
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/hpvm/alexnet_canny.py
@@ -0,0 +1,48 @@
+from typing import Iterable, Tuple
+
+import torch
+from torch.nn import Softmax
+
+from .alexnet import AlexNet2
+from .layers import HPVMConvBundle, HPVMDefaultModule, ReduceKind, TensorReduce
+
+
+class AlexNet2Canny(HPVMDefaultModule):
+    def __init__(self, on_classes: Iterable[int]):
+        super().__init__()
+        prototype = AlexNet2()
+        self.on_classes = list(on_classes)
+        self.convs = prototype.convs
+        self.linears = prototype.linears
+        self.softmax = Softmax(1)
+        self.reduce_1 = TensorReduce(1, ReduceKind.sum)
+        self.gaussian = HPVMConvBundle(1, 1, 5, padding=2, bias=False)
+        self.sobel_x = HPVMConvBundle(1, 1, 3, padding=1, bias=False)
+        self.sobel_y = HPVMConvBundle(1, 1, 3, padding=1, bias=False)
+        self.reduce_2 = TensorReduce(2, ReduceKind.max)
+        self.reduce_3 = TensorReduce(2, ReduceKind.max)
+
+    def canny(self, images: torch.Tensor) -> torch.Tensor:
+        assert len(images.shape) == 4  # Assuming NCHW
+        grayscale = self.reduce_1(images)
+        grayscale = grayscale.unsqueeze(1)
+        denoised = self.gaussian(grayscale)
+        grad_x = self.sobel_x(denoised)
+        grad_y = self.sobel_y(denoised)
+        grad_mag = torch.sqrt(grad_x ** 2 + grad_y ** 2)
+        grad_max_1D = self.reduce_2(grad_mag)
+        grad_max = self.reduce_3(grad_max_1D)
+        grad_max = grad_max.unsqueeze(2).unsqueeze(3)
+        grad_mag_norm = grad_mag / grad_max
+        return grad_mag_norm
+
+    def forward(self, inputs) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        from functools import reduce
+        from operator import ior
+        dnn_input, canny_input = inputs
+        conv_outputs = self.convs(dnn_input)
+        dnn_outputs = self.softmax(self.linears(conv_outputs.view(conv_outputs.shape[0], -1)))
+        classes = dnn_outputs.argmax(dim=1)
+        selection = reduce(ior, (classes == i for i in self.on_classes))
+        selected_inputs = canny_input[selection]
+        return dnn_outputs, selection, self.canny(selected_inputs)
diff --git a/llvm/projects/pred_tuner/models/hpvm/layers.py b/llvm/projects/pred_tuner/models/hpvm/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..fed66e7b1507ac4ca309de0dc0599dde9a926a8a
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/hpvm/layers.py
@@ -0,0 +1,223 @@
+from enum import Enum
+from pathlib import Path
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch.nn import AvgPool2d, BatchNorm2d, Conv2d, Linear, MaxPool2d, Module, Parameter, ReLU, Sequential, Softmax, \
+    Tanh
+
+
+def rsetattr(obj, attr, val):
+    pre, _, post = attr.rpartition('.')
+    return setattr(rgetattr(obj, pre) if pre else obj, post, val)
+
+
+def rgetattr(obj, attr, *args):
+    def _getattr(obj_, attr_):
+        return getattr(obj_, attr_, *args)
+
+    import functools
+    return functools.reduce(_getattr, attr.split('.'), obj)
+
+
+def read_tensor_from_file(
+        filename: Union[str, Path], *shape: int,
+        read_ty=np.float32, cast_ty=np.float32,
+        count: int = -1, offset: int = 0,
+        use_progress_bar: bool = False
+) -> torch.Tensor:
+    from tqdm import trange
+    block_size = 102400
+    offset = offset * read_ty().itemsize
+    mmap = np.memmap(filename, dtype=read_ty, mode='r', offset=offset)
+    n_entries = min(mmap.shape[0], count) if count != -1 else mmap.shape[0]
+    # Allocate only the entries actually read; np.empty_like(mmap) would keep
+    # the full file length and let uninitialized tail data leak into the
+    # result whenever count is given.
+    raw = np.empty(n_entries, dtype=read_ty)
+    n_blocks = int(np.ceil(n_entries / block_size))
+    iterable = trange(n_blocks) if use_progress_bar else range(n_blocks)
+    for block in iterable:
+        l, r = block * block_size, min(n_entries, (block + 1) * block_size)
+        raw[l:r] = mmap[l:r]
+    del mmap
+    if cast_ty != read_ty:
+        raw = raw.astype(cast_ty)
+    loaded_np = raw.reshape(shape)
+    return torch.from_numpy(loaded_np)
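+
+
+# Usage sketch (hypothetical file): load 100 CIFAR-sized float32 images,
+# skipping the first 50 (count/offset are in scalar elements, so callers
+# scale them by the per-entry size):
+#
+#     t = read_tensor_from_file('input.bin', -1, 3, 32, 32,
+#                               count=100 * 3 * 32 * 32,
+#                               offset=50 * 3 * 32 * 32)
+#     assert t.shape == (100, 3, 32, 32)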
+
+
+ActivT = Optional[Callable[[], Module]]
+ArgsT = Union[List, Dict]
+RangeT = Tuple[float, float]
+RangeOT = Optional[RangeT]
+
+
+class HPVMConvBundle(Module):
+    def __init__(
+            self, in_channels: int, out_channels: int, kernel_size: int,
+            activation: ActivT = None,
+            pool_size: Optional[int] = None, pool_stride: Optional[int] = None,
+            **conv_kwargs
+    ):
+        super().__init__()
+        self.conv = Conv2d(in_channels, out_channels, kernel_size, **conv_kwargs)
+        if pool_size is None:
+            self.pooling = Sequential()
+        else:
+            pool_stride = pool_stride or pool_size
+            self.pooling = MaxPool2d(pool_size, stride=pool_stride)
+        self.activation = Sequential() if activation is None else activation()
+        self.conv_ranges = None  # populated later by HPVMDefaultModule.install_quant_range
+
+    def forward(self, input_: torch.Tensor) -> torch.Tensor:
+        return self.activation(self.pooling(self.conv(input_)))
+
+    def input_to_conv(self, input_: torch.Tensor) -> torch.Tensor:
+        bias = self.conv.bias
+        self.conv.bias = None
+        conv_out = self.conv(input_)
+        self.conv.bias = bias
+        return conv_out
+
+    def conv_to_output(self, conv_output: torch.Tensor) -> torch.Tensor:
+        if self.conv.bias is not None:
+            broadcast_bias = self.conv.bias.reshape(1, -1, 1, 1)
+            return self.activation(self.pooling(conv_output + broadcast_bias))
+        else:
+            return self.activation(self.pooling(conv_output))
+
+    def __getattr__(self, item):
+        if item in ('weight', 'bias'):
+            return getattr(self.conv, item)
+        return super(HPVMConvBundle, self).__getattr__(item)
+
+    def __setattr__(self, key, value):
+        if key in ('weight', 'bias'):
+            setattr(self.conv, key, value)
+        else:
+            super(HPVMConvBundle, self).__setattr__(key, value)
+
+
+class ReduceKind(Enum):
+    sum = 1
+    max = 2
+
+
+class TensorReduce(Module):
+    def __init__(self, dim: int, kind: ReduceKind, skip_ratio: float = 0.0):
+        super().__init__()
+        self.dim = dim
+        self.kind = kind  # retained so change_skip_ratio can rebuild the module
+        self.skip_ratio = skip_ratio
+        if kind == ReduceKind.sum:
+            self.reducer = lambda x: x.sum(dim=0)  # Because we transpose the input
+            self.normalizer = lambda x: x / (1 - self.skip_ratio)
+        elif kind == ReduceKind.max:
+            self.reducer = lambda x: x.max(dim=0)[0]
+            self.normalizer = lambda x: x
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        from math import ceil
+        inputs_t = inputs.transpose(0, self.dim)
+        if len(inputs) == 0:
+            dim_reduced = torch.zeros_like(inputs_t)[0]
+        else:
+            reduce_dim_size = inputs_t.size(0)
+            approxed_dim_size = int(ceil((1 - self.skip_ratio) * reduce_dim_size))
+            # Take a contiguous chunk and reduce over it, ignore the rest
+            dim_reduced: torch.Tensor = self.normalizer(self.reducer(inputs_t[:approxed_dim_size]))
+        return dim_reduced.unsqueeze(0).transpose(0, self.dim).squeeze(self.dim)
+
+    def change_skip_ratio(self, skip_ratio: float) -> 'TensorReduce':
+        return TensorReduce(self.dim, self.kind, skip_ratio)
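+
+    # Semantics sketch: TensorReduce(1, ReduceKind.sum) applied to an
+    # (N, 3, H, W) tensor sums over dim 1 and yields (N, H, W); with
+    # skip_ratio > 0 only a leading fraction of that dimension is reduced
+    # and the sum is rescaled by 1 / (1 - skip_ratio).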
+
+
+def read_quant_ranges(prefix: Path):
+    range_file = prefix / 'quant_ranges.txt'
+    if not range_file.is_file():
+        return None
+    with range_file.open() as f:
+        return [[float(field) for field in line.strip().split()] for line in f.readlines()]
+
+
+class HPVMDefaultModule(Module):
+    @staticmethod
+    def load_into_layer(
+            layer: Module, attr_name: str, filename: str, prefix: Path,
+            is_linear_weight: bool = False
+    ):
+        tensor = rgetattr(layer, attr_name)
+        if is_linear_weight:
+            n_out, n_in = tensor.shape
+            loaded = read_tensor_from_file(prefix / filename, n_in, n_out).T
+        else:
+            loaded = read_tensor_from_file(prefix / filename, *tensor.shape)
+        if type(tensor) is Parameter:
+            loaded = Parameter(loaded, requires_grad=True)
+        rsetattr(layer, attr_name, loaded)
+
+    @staticmethod
+    def install_quant_range(module: Module, values: List[float]):
+        in_min, in_max, w_min, w_max, b_min, b_max, out_min, out_max = values
+        module.conv_ranges = (in_min, in_max), (w_min, w_max), (b_min, b_max), (out_min, out_max)
+
+    def default_load_hpvm_weights(self, prefix: str):
+        # TODO: this is probably better done with help of ModuleDAG
+        prefix = Path(prefix)
+        convs, group_convs, linears, bns = [], [], [], []
+        weightless_types = AvgPool2d, MaxPool2d, ReLU, Tanh, Softmax, TensorReduce
+        container_types = (Sequential,)
+        for module in self.modules():
+            if isinstance(module, HPVMConvBundle):
+                convs.append(module)
+            elif isinstance(module, Conv2d):
+                if module.groups != 1:
+                    group_convs.append(module)
+            elif isinstance(module, Linear):
+                linears.append(module)
+            elif isinstance(module, BatchNorm2d):
+                bns.append(module)
+            elif type(module) in weightless_types:
+                pass
+            elif type(module) in container_types or len(list(module.children())) != 0:
+                continue
+            else:
+                raise RuntimeError(f"Layer type {type(module)} not understood")
+        load = self.load_into_layer
+        quant_ranges = read_quant_ranges(prefix)
+        quant_ranges_idx = 0
+        for i, conv in enumerate(convs):
+            conv: HPVMConvBundle
+            load(conv, 'weight', f"conv2d_{i + 1}_w.bin", prefix)
+            if conv.bias is not None:
+                load(conv, 'bias', f"conv2d_{i + 1}_b.bin", prefix)
+            if quant_ranges is not None:
+                self.install_quant_range(conv, quant_ranges[quant_ranges_idx])
+                quant_ranges_idx += 1
+        for i, gconv in enumerate(group_convs):
+            load(gconv, 'weight', f"depthwise_conv2d_{i + 1}_w.bin", prefix)
+            if gconv.bias is not None:
+                load(gconv, 'bias', f"depthwise_conv2d_{i + 1}_b.bin", prefix)
+        for i, bn in enumerate(bns):
+            bn: BatchNorm2d
+            load(bn, 'weight', f"batch_normalization_{i + 1}_gamma.bin", prefix)
+            load(bn, 'bias', f"batch_normalization_{i + 1}_beta.bin", prefix)
+            load(bn, 'running_mean', f"batch_normalization_{i + 1}_mean.bin", prefix)
+            load(bn, 'running_var', f"batch_normalization_{i + 1}_variance.bin", prefix)
+        for i, linear in enumerate(linears):
+            load(linear, 'weight', f"dense_{i + 1}_w.bin", prefix, True)
+            load(linear, 'bias', f"dense_{i + 1}_b.bin", prefix)
+            if quant_ranges is not None:
+                self.install_quant_range(linear, quant_ranges[quant_ranges_idx])
+                quant_ranges_idx += 1
+        assert quant_ranges is None or len(quant_ranges) == quant_ranges_idx
+
+
+class HPVMDNN(HPVMDefaultModule):
+    def __init__(self, convs: Sequential, linears: Sequential):
+        super().__init__()
+        self.convs = convs
+        self.linears = linears
+        self.softmax = Softmax(1)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        outputs = self.convs(inputs)
+        return self.softmax(self.linears(outputs.view(outputs.shape[0], -1)))
diff --git a/llvm/projects/pred_tuner/models/hpvm/lenet.py b/llvm/projects/pred_tuner/models/hpvm/lenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..0802b5f78d2c73d352afe68b16df74689e9aec68
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/hpvm/lenet.py
@@ -0,0 +1,16 @@
+from torch.nn import Linear, Sequential, Tanh
+
+from .layers import HPVMConvBundle, HPVMDNN
+
+
+class LeNet(HPVMDNN):
+    def __init__(self):
+        convs = Sequential(
+            HPVMConvBundle(1, 32, 5, Tanh, 2, padding=2),
+            HPVMConvBundle(32, 64, 5, Tanh, 2, padding=2)
+        )
+        linears = Sequential(
+            Linear(7 * 7 * 64, 1024), Tanh(),
+            Linear(1024, 10), Tanh()
+        )
+        super().__init__(convs, linears)
diff --git a/llvm/projects/pred_tuner/models/hpvm/mobilenet.py b/llvm/projects/pred_tuner/models/hpvm/mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f48a214fc9c1d7ec52cd5a24ec0e8d82d38aaa6e
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/hpvm/mobilenet.py
@@ -0,0 +1,45 @@
+from torch.nn import AvgPool2d, BatchNorm2d, Conv2d, Linear, ReLU, Sequential
+
+from .layers import HPVMDNN, HPVMConvBundle
+
+
+def _make_seq(in_channels, out_channels, c_kernel_size, gc_stride, gc_kernel_size=3):
+    return Sequential(
+        HPVMConvBundle(
+            in_channels, out_channels, c_kernel_size,
+            bias=False, padding=(c_kernel_size - 1) // 2
+        ),
+        BatchNorm2d(out_channels, eps=0.001),
+        ReLU(),
+        Conv2d(
+            out_channels, out_channels, gc_kernel_size,
+            bias=False, stride=gc_stride, padding=(gc_kernel_size - 1) // 2, groups=out_channels
+        ),
+        BatchNorm2d(out_channels, eps=0.001),
+        ReLU()
+    )
+
+
+class MobileNet(HPVMDNN):
+    def __init__(self):
+        convs = Sequential(
+            _make_seq(3, 32, 3, 1),
+            _make_seq(32, 64, 1, 2),
+            _make_seq(64, 128, 1, 1),
+            _make_seq(128, 128, 1, 2),
+            _make_seq(128, 256, 1, 1),
+            _make_seq(256, 256, 1, 2),
+            _make_seq(256, 512, 1, 1),
+            _make_seq(512, 512, 1, 1),
+            _make_seq(512, 512, 1, 1),
+            _make_seq(512, 512, 1, 1),
+            _make_seq(512, 512, 1, 1),
+            _make_seq(512, 512, 1, 2),
+            _make_seq(512, 1024, 1, 1),
+            HPVMConvBundle(1024, 1024, 1, padding=0, bias=False),
+            BatchNorm2d(1024, eps=0.001),
+            ReLU(),
+            AvgPool2d(2)
+        )
+        linears = Sequential(Linear(1024, 10))
+        super().__init__(convs, linears)
diff --git a/llvm/projects/pred_tuner/models/hpvm/resnet.py b/llvm/projects/pred_tuner/models/hpvm/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc42a00001792b59b593b668f6cf4e8a5a230d9d
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/hpvm/resnet.py
@@ -0,0 +1,96 @@
+from torch.nn import AvgPool2d, BatchNorm2d, Linear, Module, ReLU, Sequential
+
+from .layers import HPVMConvBundle, HPVMDNN
+
+
+class BasicBlock(Module):
+    def __init__(self, ins, outs, shortcut=False):
+        super().__init__()
+        stride = 2 if shortcut else 1
+        self.mainline = Sequential(
+            HPVMConvBundle(ins, outs, 3, ReLU, padding=1, stride=stride),
+            HPVMConvBundle(outs, outs, 3, padding=1)
+        )
+        self.relu1 = ReLU()
+        self.shortcut = HPVMConvBundle(ins, outs, 1, stride=stride) \
+            if shortcut else Sequential()
+
+    def forward(self, input_):
+        return self.relu1(self.mainline(input_) + self.shortcut(input_))
+
+
+class ResNet18(HPVMDNN):
+    def __init__(self):
+        convs = Sequential(
+            HPVMConvBundle(3, 16, 3, ReLU, padding=1),
+            BasicBlock(16, 16),
+            BasicBlock(16, 16),
+            BasicBlock(16, 16),
+            BasicBlock(16, 32, True),
+            BasicBlock(32, 32),
+            BasicBlock(32, 32),
+            BasicBlock(32, 64, True),
+            BasicBlock(64, 64),
+            BasicBlock(64, 64),
+            AvgPool2d(8)
+        )
+        linears = Sequential(Linear(64, 10))
+        super().__init__(convs, linears)
+
+
+class Bottleneck(Module):
+    expansion = 4
+
+    def __init__(self, in_planes, planes, stride=1):
+        super(Bottleneck, self).__init__()
+        self.mainline = Sequential(
+            HPVMConvBundle(in_planes, planes, 1, stride=stride),
+            BatchNorm2d(planes, eps=0.001),
+            ReLU(),
+            HPVMConvBundle(planes, planes, 3, padding=1),
+            BatchNorm2d(planes, eps=0.001),
+            ReLU(),
+            HPVMConvBundle(planes, self.expansion * planes, 1),
+            BatchNorm2d(self.expansion * planes, eps=0.001)
+        )
+        self.relu1 = ReLU()
+        if stride != 1 or in_planes != self.expansion * planes:
+            self.shortcut = Sequential(
+                HPVMConvBundle(in_planes, self.expansion * planes, 1, stride=stride),
+                BatchNorm2d(self.expansion * planes, eps=0.001)
+            )
+        else:
+            self.shortcut = Sequential()
+
+    def forward(self, input_):
+        return self.relu1(self.mainline(input_) + self.shortcut(input_))
+
+
+class ResNet50(HPVMDNN):
+    def __init__(self):
+        convs = Sequential(
+            HPVMConvBundle(3, 64, 7, ReLU, pool_size=3, pool_stride=2, padding=3, stride=2),
+            BatchNorm2d(64, eps=0.001),
+            Bottleneck(64, 64),
+            Bottleneck(256, 64),
+            Bottleneck(256, 64),
+
+            Bottleneck(256, 128, stride=2),
+            Bottleneck(512, 128),
+            Bottleneck(512, 128),
+            Bottleneck(512, 128),
+
+            Bottleneck(512, 256, stride=2),
+            Bottleneck(1024, 256),
+            Bottleneck(1024, 256),
+            Bottleneck(1024, 256),
+            Bottleneck(1024, 256),
+            Bottleneck(1024, 256),
+
+            Bottleneck(1024, 512, stride=2),
+            Bottleneck(2048, 512),
+            Bottleneck(2048, 512),
+            AvgPool2d(7)
+        )
+        linears = Sequential(Linear(2048, 1000))
+        super().__init__(convs, linears)
diff --git a/llvm/projects/pred_tuner/models/hpvm/vgg16.py b/llvm/projects/pred_tuner/models/hpvm/vgg16.py
new file mode 100644
index 0000000000000000000000000000000000000000..b31c0d47ca43118cbc1f7ad43b517d6dc02dd223
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/hpvm/vgg16.py
@@ -0,0 +1,44 @@
+from typing import Iterable
+
+from torch.nn import Linear, ReLU, Sequential
+
+from .layers import HPVMConvBundle, HPVMDNN
+
+
+class _VGG16(HPVMDNN):
+    def __init__(self, linear_inouts: Iterable[int]):
+        convs = Sequential(
+            HPVMConvBundle(3, 64, 3, ReLU, padding=1),
+            HPVMConvBundle(64, 64, 3, ReLU, 2, padding=1),
+            HPVMConvBundle(64, 128, 3, ReLU, padding=1),
+            HPVMConvBundle(128, 128, 3, ReLU, 2, padding=1),
+            HPVMConvBundle(128, 256, 3, ReLU, padding=1),
+            HPVMConvBundle(256, 256, 3, ReLU, padding=1),
+            HPVMConvBundle(256, 256, 3, ReLU, 2, padding=1),
+            HPVMConvBundle(256, 512, 3, ReLU, padding=1),
+            HPVMConvBundle(512, 512, 3, ReLU, padding=1),
+            HPVMConvBundle(512, 512, 3, ReLU, 2, padding=1),
+            HPVMConvBundle(512, 512, 3, ReLU, padding=1),
+            HPVMConvBundle(512, 512, 3, ReLU, padding=1),
+            HPVMConvBundle(512, 512, 3, ReLU, 2, padding=1)
+        )
+        linear_layers = [Linear(in_, out) for in_, out in zip(linear_inouts, linear_inouts[1:])]
+        linear_relus = [ReLU() for _ in range(2 * len(linear_layers) - 1)]
+        linear_relus[::2] = linear_layers  # Linears take the even slots: one ReLU between each pair, none after the last
+        linears = Sequential(*linear_relus)
+        super().__init__(convs, linears)
+
+
+class VGG16Cifar10(_VGG16):
+    def __init__(self):
+        super().__init__([512, 512, 10])
+
+
+class VGG16Cifar100(_VGG16):
+    def __init__(self):
+        super().__init__([512, 512, 100])
+
+
+class VGG16ImageNet(_VGG16):
+    def __init__(self):
+        super().__init__([25088, 4096, 4096, 1000])
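
The slice assignment in _VGG16.__init__ is terse; a standalone repro of the interleaving (values mirror VGG16Cifar10):

    from torch.nn import Linear, ReLU, Sequential
    inouts = [512, 512, 10]
    layers = [Linear(i, o) for i, o in zip(inouts, inouts[1:])]
    relus = [ReLU() for _ in range(2 * len(layers) - 1)]
    relus[::2] = layers        # even slots take the Linears
    print(Sequential(*relus))  # Linear(512->512), ReLU, Linear(512->10)
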
diff --git a/llvm/projects/pred_tuner/models/inference.py b/llvm/projects/pred_tuner/models/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..d797e9e605d8c3363d20f09fb52eb4a78195a9ac
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/inference.py
@@ -0,0 +1,99 @@
+import logging
+from typing import Type, Union
+
+import torch
+from torch.nn import Module
+from torch.utils.data import DataLoader, Dataset, Subset
+
+from .domains import QoS
+from .hpvm import HPVMDNN, HPVMDefaultModule
+from .networks import networks
+
+msg_logger = logging.getLogger(__name__)
+
+
+def move_to_device_recursively(data: object, device_: Union[torch.device, str]):
+    if isinstance(data, torch.Tensor):
+        return data.to(device_)
+    if not hasattr(data, '__dict__'):
+        if isinstance(data, list):
+            return [move_to_device_recursively(x, device_) for x in data]
+        elif isinstance(data, tuple):
+            return tuple([move_to_device_recursively(x, device_) for x in data])
+        else:
+            raise RuntimeError(f"Don't know how to manipulate {type(data)}")
+    for key, value in data.__dict__.items():
+        data.__dict__[key] = move_to_device_recursively(value, device_)
+    return data
+
+
+def _infer_net_device(net: Module):
+    return next(iter(net.parameters())).device
+
+
+def get_all_output(net: Module, dataloader: DataLoader):
+    outputs = []
+    device = _infer_net_device(net)
+    with torch.no_grad():
+        for inputs, targets in dataloader:
+            inputs = move_to_device_recursively(inputs, device)
+            outputs.append(net(inputs))
+    return outputs
+
+
+def load_torch_checkpoint(net: Module, chpt_path: str):
+    msg_logger.info('==> Loading checkpoint..')
+    checkpoint = torch.load(chpt_path)
+    net.load_state_dict(checkpoint.pop('net'))
+    return checkpoint
+
+
+class BaselineInfo:
+    def __init__(
+            self, net: Module, val_loader: DataLoader, test_loader: DataLoader,
+            non_tensor_output: bool, qos_class: Type[QoS]
+    ):
+        self.baseline_net = net
+        self.val_loader = val_loader
+        self.test_loader = test_loader
+        self.non_tensor_output = non_tensor_output
+        self.qos_class = qos_class
+        self.val_qos = self.get_qos(net, val_loader)
+        self.test_qos = self.get_qos(net, test_loader)
+
+    def get_qos(self, net: Module, dataloader: DataLoader):
+        return self.qos_class.from_all_output(get_all_output(net, dataloader), dataloader)
+
+    @staticmethod
+    def _split_dataset(dataset: Dataset, split_at: int):
+        return (Subset(dataset, torch.arange(0, split_at)),
+                Subset(dataset, torch.arange(split_at, len(dataset))))
+
+    @classmethod
+    def init_by_name(cls, model_name: str, device) -> 'BaselineInfo':
+        msg_logger.info('==> Building model..')
+        network_factory, dataset_factory, batchsize, prefix, qos_class = networks[model_name]
+        net = network_factory()
+        # 1. Load network weights
+        msg_logger.info('==> Loading checkpoint..')
+        if isinstance(net, HPVMDefaultModule):
+            net.default_load_hpvm_weights(prefix)
+        else:
+            load_torch_checkpoint(net, prefix)
+        net = net.eval().to(device)
+        # 2. Load dataset
+        msg_logger.info('==> Loading dataset...')
+        if isinstance(net, HPVMDNN):
+            dataset = dataset_factory(prefix)
+            non_tensor_output = False
+        elif isinstance(net, HPVMDefaultModule):  # Is image benchmark
+            dataset = dataset_factory(prefix)
+            non_tensor_output = True
+        else:
+            dataset = dataset_factory('./data')
+            non_tensor_output = False
+        # 3. Split dataset
+        test_set, val_set = cls._split_dataset(dataset, 5000)
+        test_loader = DataLoader(test_set, batch_size=batchsize)
+        val_loader = DataLoader(val_set, batch_size=batchsize)
+        return cls(net, val_loader, test_loader, non_tensor_output, qos_class)
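
move_to_device_recursively handles tensors, lists, tuples, and arbitrary objects through their __dict__; a minimal CPU-only check (the usual target is a CUDA device):

    import torch
    batch = [torch.zeros(2, 3), (torch.ones(4), torch.zeros(1))]
    moved = move_to_device_recursively(batch, 'cpu')
    print(type(moved), moved[1][0].device)  # <class 'list'> cpu
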
diff --git a/llvm/projects/pred_tuner/models/networks.py b/llvm/projects/pred_tuner/models/networks.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5611bcb3e681c618cc5f8d8d188e9afc2fb5687
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/networks.py
@@ -0,0 +1,54 @@
+from . import hpvm
+from .datasets import CIFAR, CIFARImage, MNIST, get_cifar10_test_dataset
+from .domains import Accuracy
+from .domains.qoses import AccuracyPSNR
+from .torch import ResNet18, VGG
+
+
+networks = {
+    'lenet_hpvm': (
+        hpvm.LeNet, MNIST.from_default_file, 5000,
+        'model_params/lenet_mnist', Accuracy
+    ),
+    'alexnet_hpvm': (
+        hpvm.AlexNet, CIFAR.from_default_file, 2000,
+        'model_params/alexnet_cifar10', Accuracy
+    ),
+    'alexnet2_hpvm': (
+        hpvm.AlexNet2, CIFAR.from_default_file, 2000,
+        'model_params/alexnet2_cifar10', Accuracy
+    ),
+    'vgg16_cifar10_hpvm': (
+        hpvm.VGG16Cifar10, CIFAR.from_default_file, 500,
+        'model_params/vgg16_cifar10', Accuracy
+    ),
+    'vgg16_cifar100_hpvm': (
+        hpvm.VGG16Cifar100, CIFAR.from_default_file, 500,
+        'model_params/vgg16_cifar100', Accuracy
+    ),
+    'mobilenet_hpvm': (
+        hpvm.MobileNet, CIFAR.from_default_file, 1000,
+        'model_params/mobilenet', Accuracy
+    ),
+    'resnet18_hpvm': (
+        hpvm.ResNet18, CIFAR.from_default_file, 1000,
+        'model_params/resnet18_cifar10', Accuracy
+    ),
+    'alexnet_imagenet_hpvm': (
+        hpvm.AlexNetImageNet, CIFAR.from_default_file, 100,
+        'model_params/alexnet_imagenet', Accuracy
+    ),
+    'vgg16_imagenet_hpvm': (
+        hpvm.VGG16ImageNet, CIFAR.from_default_file, 50,
+        'model_params/vgg16_imagenet', Accuracy
+    ),
+    'resnet50_imagenet_hpvm': (
+        hpvm.ResNet50, CIFAR.from_default_file, 25,
+        'model_params/resnet50_imagenet', Accuracy
+    ),
+    'alexnet2_canny_hpvm': (
+        lambda: hpvm.AlexNet2Canny(on_classes=[1, 2, 3, 4, 5]),
+        CIFARImage.from_default_file, 50,
+        'model_params/alexnet2_canny', AccuracyPSNR
+    )
+}
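
Each entry is (network factory, dataset loader, batch size, weights/dataset prefix, QoS class), unpacked in exactly this order by BaselineInfo.init_by_name in inference.py. A manual-use sketch:

    network_factory, dataset_factory, batchsize, prefix, qos_class = networks['lenet_hpvm']
    net = network_factory()            # an hpvm.LeNet instance
    dataset = dataset_factory(prefix)  # reads from model_params/lenet_mnist
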
diff --git a/llvm/projects/pred_tuner/models/torch/__init__.py b/llvm/projects/pred_tuner/models/torch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aff98ce114a9f0797ed08e74db1184d727f94f2e
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/__init__.py
@@ -0,0 +1,15 @@
+from .vgg import *
+from .dpn import *
+from .lenet import *
+from .senet import *
+from .pnasnet import *
+from .densenet import *
+from .googlenet import *
+from .shufflenet import *
+from .shufflenetv2 import *
+from .resnet import *
+from .resnext import *
+from .preact_resnet import *
+from .mobilenet import *
+from .mobilenetv2 import *
+from .efficientnet import *
diff --git a/llvm/projects/pred_tuner/models/torch/densenet.py b/llvm/projects/pred_tuner/models/torch/densenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..47ebbbe08e40503d6785711acd8bd7dd2cdba768
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/densenet.py
@@ -0,0 +1,107 @@
+'''DenseNet in PyTorch.'''
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Bottleneck(nn.Module):
+    def __init__(self, in_planes, growth_rate):
+        super(Bottleneck, self).__init__()
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(4*growth_rate)
+        self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
+
+    def forward(self, x):
+        out = self.conv1(F.relu(self.bn1(x)))
+        out = self.conv2(F.relu(self.bn2(out)))
+        out = torch.cat([out,x], 1)
+        return out
+
+
+class Transition(nn.Module):
+    def __init__(self, in_planes, out_planes):
+        super(Transition, self).__init__()
+        self.bn = nn.BatchNorm2d(in_planes)
+        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)
+
+    def forward(self, x):
+        out = self.conv(F.relu(self.bn(x)))
+        out = F.avg_pool2d(out, 2)
+        return out
+
+
+class DenseNet(nn.Module):
+    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
+        super(DenseNet, self).__init__()
+        self.growth_rate = growth_rate
+
+        num_planes = 2*growth_rate
+        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)
+
+        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
+        num_planes += nblocks[0]*growth_rate
+        out_planes = int(math.floor(num_planes*reduction))
+        self.trans1 = Transition(num_planes, out_planes)
+        num_planes = out_planes
+
+        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
+        num_planes += nblocks[1]*growth_rate
+        out_planes = int(math.floor(num_planes*reduction))
+        self.trans2 = Transition(num_planes, out_planes)
+        num_planes = out_planes
+
+        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
+        num_planes += nblocks[2]*growth_rate
+        out_planes = int(math.floor(num_planes*reduction))
+        self.trans3 = Transition(num_planes, out_planes)
+        num_planes = out_planes
+
+        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])
+        num_planes += nblocks[3]*growth_rate
+
+        self.bn = nn.BatchNorm2d(num_planes)
+        self.linear = nn.Linear(num_planes, num_classes)
+
+    def _make_dense_layers(self, block, in_planes, nblock):
+        layers = []
+        for i in range(nblock):
+            layers.append(block(in_planes, self.growth_rate))
+            in_planes += self.growth_rate
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = self.trans1(self.dense1(out))
+        out = self.trans2(self.dense2(out))
+        out = self.trans3(self.dense3(out))
+        out = self.dense4(out)
+        out = F.avg_pool2d(F.relu(self.bn(out)), 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+def DenseNet121():
+    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32)
+
+def DenseNet169():
+    return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32)
+
+def DenseNet201():
+    return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32)
+
+def DenseNet161():
+    return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48)
+
+def densenet_cifar():
+    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12)
+
+def test():
+    net = densenet_cifar()
+    x = torch.randn(1,3,32,32)
+    y = net(x)
+    print(y)
+
+# test()
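
The channel bookkeeping is easiest to trace on densenet_cifar (growth_rate=12, reduction=0.5, nblocks=[6, 12, 24, 16]): each dense stage adds nblocks*growth_rate channels and each transition halves the count:

    planes = 24        # 2 * growth_rate
    planes += 6 * 12   # dense1 -> 96
    planes //= 2       # trans1 -> 48
    planes += 12 * 12  # dense2 -> 192
    planes //= 2       # trans2 -> 96
    planes += 24 * 12  # dense3 -> 384
    planes //= 2       # trans3 -> 192
    planes += 16 * 12  # dense4 -> 384, the input width of self.linear
    print(planes)      # 384
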
diff --git a/llvm/projects/pred_tuner/models/torch/dpn.py b/llvm/projects/pred_tuner/models/torch/dpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d334367fcc9876b104a94b7ae333362ea0a64469
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/dpn.py
@@ -0,0 +1,98 @@
+'''Dual Path Networks in PyTorch.'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Bottleneck(nn.Module):
+    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
+        super(Bottleneck, self).__init__()
+        self.out_planes = out_planes
+        self.dense_depth = dense_depth
+
+        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)
+        self.bn2 = nn.BatchNorm2d(in_planes)
+        self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes+dense_depth)
+
+        self.shortcut = nn.Sequential()
+        if first_layer:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_planes+dense_depth)
+            )
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        x = self.shortcut(x)
+        d = self.out_planes
+        out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1)
+        out = F.relu(out)
+        return out
+
+
+class DPN(nn.Module):
+    def __init__(self, cfg):
+        super(DPN, self).__init__()
+        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
+        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.last_planes = 64
+        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
+        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
+        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
+        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
+        self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], 10)
+
+    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
+        strides = [stride] + [1]*(num_blocks-1)
+        layers = []
+        for i,stride in enumerate(strides):
+            layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0))
+            self.last_planes = out_planes + (i+2) * dense_depth
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def DPN26():
+    cfg = {
+        'in_planes': (96,192,384,768),
+        'out_planes': (256,512,1024,2048),
+        'num_blocks': (2,2,2,2),
+        'dense_depth': (16,32,24,128)
+    }
+    return DPN(cfg)
+
+def DPN92():
+    cfg = {
+        'in_planes': (96,192,384,768),
+        'out_planes': (256,512,1024,2048),
+        'num_blocks': (3,4,20,3),
+        'dense_depth': (16,32,24,128)
+    }
+    return DPN(cfg)
+
+
+def test():
+    net = DPN92()
+    x = torch.randn(1,3,32,32)
+    y = net(x)
+    print(y)
+
+# test()
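
The torch.cat in Bottleneck.forward is the dual-path step: the first out_planes channels are summed as a residual path, while the remainder are concatenated so the dense path grows by dense_depth per block. A toy repro with hypothetical channel counts:

    import torch
    d = 4                              # out_planes
    x = torch.randn(1, d + 2, 8, 8)    # previous block: d residual + 2 dense channels
    out = torch.randn(1, d + 1, 8, 8)  # this block: d residual + dense_depth=1
    y = torch.cat([x[:, :d] + out[:, :d], x[:, d:], out[:, d:]], 1)
    print(y.shape)                     # torch.Size([1, 7, 8, 8]): d residual + 3 dense
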
diff --git a/llvm/projects/pred_tuner/models/torch/efficientnet.py b/llvm/projects/pred_tuner/models/torch/efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a10a97468b5a505d5ea4bf1b5b53859dacef233
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/efficientnet.py
@@ -0,0 +1,99 @@
+'''EfficientNet in PyTorch.
+
+Paper: "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks".
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Block(nn.Module):
+    '''expand + depthwise + pointwise + squeeze-excitation'''
+
+    def __init__(self, in_planes, out_planes, expansion, stride):
+        super(Block, self).__init__()
+        self.stride = stride
+
+        planes = expansion * in_planes
+        self.conv1 = nn.Conv2d(
+            in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
+                               stride=stride, padding=1, groups=planes, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(
+            planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+
+        self.shortcut = nn.Sequential()
+        if stride == 1 and in_planes != out_planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, out_planes, kernel_size=1,
+                          stride=1, padding=0, bias=False),
+                nn.BatchNorm2d(out_planes),
+            )
+
+        # SE layers
+        self.fc1 = nn.Conv2d(out_planes, out_planes//16, kernel_size=1)
+        self.fc2 = nn.Conv2d(out_planes//16, out_planes, kernel_size=1)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        shortcut = self.shortcut(x) if self.stride == 1 else out
+        # Squeeze-Excitation
+        w = F.avg_pool2d(out, out.size(2))
+        w = F.relu(self.fc1(w))
+        w = self.fc2(w).sigmoid()
+        out = out * w + shortcut
+        return out
+
+
+class EfficientNet(nn.Module):
+    def __init__(self, cfg, num_classes=10):
+        super(EfficientNet, self).__init__()
+        self.cfg = cfg
+        self.conv1 = nn.Conv2d(3, 32, kernel_size=3,
+                               stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.layers = self._make_layers(in_planes=32)
+        self.linear = nn.Linear(cfg[-1][1], num_classes)
+
+    def _make_layers(self, in_planes):
+        layers = []
+        for expansion, out_planes, num_blocks, stride in self.cfg:
+            strides = [stride] + [1]*(num_blocks-1)
+            for stride in strides:
+                layers.append(Block(in_planes, out_planes, expansion, stride))
+                in_planes = out_planes
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layers(out)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def EfficientNetB0():
+    # (expansion, out_planes, num_blocks, stride)
+    cfg = [(1,  16, 1, 2),
+           (6,  24, 2, 1),
+           (6,  40, 2, 2),
+           (6,  80, 3, 2),
+           (6, 112, 3, 1),
+           (6, 192, 4, 2),
+           (6, 320, 1, 2)]
+    return EfficientNet(cfg)
+
+
+def test():
+    net = EfficientNetB0()
+    x = torch.randn(2, 3, 32, 32)
+    y = net(x)
+    print(y.shape)
+
+
+# test()
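
The SE branch squeezes each channel to one statistic and rescales by a sigmoid gate; shapes only, omitting the fc1/fc2 bottleneck for brevity:

    import torch
    import torch.nn.functional as F
    out = torch.randn(2, 16, 8, 8)
    w = F.avg_pool2d(out, out.size(2))  # squeeze: (2, 16, 1, 1), one value per channel
    print((out * w.sigmoid()).shape)    # excitation broadcasts back to (2, 16, 8, 8)
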
diff --git a/llvm/projects/pred_tuner/models/torch/googlenet.py b/llvm/projects/pred_tuner/models/torch/googlenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ed8f6eb236d966f206f457e1637e11fecd44408
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/googlenet.py
@@ -0,0 +1,106 @@
+"""GoogLeNet with PyTorch."""
+import torch
+import torch.nn as nn
+
+
+class Inception(nn.Module):
+    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
+        super(Inception, self).__init__()
+        # 1x1 conv branch
+        self.b1 = nn.Sequential(
+            nn.Conv2d(in_planes, n1x1, kernel_size=1),
+            nn.BatchNorm2d(n1x1),
+            nn.ReLU(True),
+        )
+
+        # 1x1 conv -> 3x3 conv branch
+        self.b2 = nn.Sequential(
+            nn.Conv2d(in_planes, n3x3red, kernel_size=1),
+            nn.BatchNorm2d(n3x3red),
+            nn.ReLU(True),
+            nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1),
+            nn.BatchNorm2d(n3x3),
+            nn.ReLU(True),
+        )
+
+        # 1x1 conv -> 5x5 conv branch (the 5x5 is factorized as two 3x3 convs)
+        self.b3 = nn.Sequential(
+            nn.Conv2d(in_planes, n5x5red, kernel_size=1),
+            nn.BatchNorm2d(n5x5red),
+            nn.ReLU(True),
+            nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1),
+            nn.BatchNorm2d(n5x5),
+            nn.ReLU(True),
+            nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1),
+            nn.BatchNorm2d(n5x5),
+            nn.ReLU(True),
+        )
+
+        # 3x3 pool -> 1x1 conv branch
+        self.b4 = nn.Sequential(
+            nn.MaxPool2d(3, stride=1, padding=1),
+            nn.Conv2d(in_planes, pool_planes, kernel_size=1),
+            nn.BatchNorm2d(pool_planes),
+            nn.ReLU(True),
+        )
+
+    def forward(self, x):
+        y1 = self.b1(x)
+        y2 = self.b2(x)
+        y3 = self.b3(x)
+        y4 = self.b4(x)
+        return torch.cat([y1, y2, y3, y4], 1)
+
+
+class GoogLeNet(nn.Module):
+    def __init__(self):
+        super(GoogLeNet, self).__init__()
+        self.pre_layers = nn.Sequential(
+            nn.Conv2d(3, 192, kernel_size=3, padding=1),
+            nn.BatchNorm2d(192),
+            nn.ReLU(True),
+        )
+
+        self.a3 = Inception(192, 64, 96, 128, 16, 32, 32)
+        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)
+
+        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
+
+        self.a4 = Inception(480, 192, 96, 208, 16, 48, 64)
+        self.b4 = Inception(512, 160, 112, 224, 24, 64, 64)
+        self.c4 = Inception(512, 128, 128, 256, 24, 64, 64)
+        self.d4 = Inception(512, 112, 144, 288, 32, 64, 64)
+        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)
+
+        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
+        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)
+
+        self.avgpool = nn.AvgPool2d(8, stride=1)
+        self.linear = nn.Linear(1024, 10)
+
+    def forward(self, x):
+        out = self.pre_layers(x)
+        out = self.a3(out)
+        out = self.b3(out)
+        out = self.maxpool(out)
+        out = self.a4(out)
+        out = self.b4(out)
+        out = self.c4(out)
+        out = self.d4(out)
+        out = self.e4(out)
+        out = self.maxpool(out)
+        out = self.a5(out)
+        out = self.b5(out)
+        out = self.avgpool(out)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def test():
+    net = GoogLeNet()
+    x = torch.randn(1, 3, 32, 32)
+    y = net(x)
+    print(y.size())
+
+# test()
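
An Inception module emits n1x1 + n3x3 + n5x5 + pool_planes channels, which must match the next module's in_planes; checking a3 against b3:

    n1x1, n3x3, n5x5, pool_planes = 64, 128, 32, 32  # Inception(192, 64, 96, 128, 16, 32, 32)
    print(n1x1 + n3x3 + n5x5 + pool_planes)          # 256, the in_planes of b3
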
diff --git a/llvm/projects/pred_tuner/models/torch/lenet.py b/llvm/projects/pred_tuner/models/torch/lenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d657b7482a75a3058e5795f367dfbb32e948b9d5
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/lenet.py
@@ -0,0 +1,23 @@
+'''LeNet in PyTorch.'''
+import torch.nn as nn
+import torch.nn.functional as F
+
+class LeNet(nn.Module):
+    def __init__(self):
+        super(LeNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1   = nn.Linear(16*5*5, 120)
+        self.fc2   = nn.Linear(120, 84)
+        self.fc3   = nn.Linear(84, 10)
+
+    def forward(self, x):
+        out = F.relu(self.conv1(x))
+        out = F.max_pool2d(out, 2)
+        out = F.relu(self.conv2(out))
+        out = F.max_pool2d(out, 2)
+        out = out.view(out.size(0), -1)
+        out = F.relu(self.fc1(out))
+        out = F.relu(self.fc2(out))
+        out = self.fc3(out)
+        return out
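
Unlike its siblings, lenet.py ships no test() helper; an equivalent sketch:

    import torch
    net = LeNet()
    y = net(torch.randn(1, 3, 32, 32))
    print(y.size())  # torch.Size([1, 10]); spatially 32 -> 28 -> 14 -> 10 -> 5, hence fc1's 16*5*5
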
diff --git a/llvm/projects/pred_tuner/models/torch/mobilenet.py b/llvm/projects/pred_tuner/models/torch/mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..497ef1e867d2a597b9b444ebc7a6f30cd5219777
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/mobilenet.py
@@ -0,0 +1,61 @@
+'''MobileNet in PyTorch.
+
+See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
+for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Block(nn.Module):
+    '''Depthwise conv + Pointwise conv'''
+    def __init__(self, in_planes, out_planes, stride=1):
+        super(Block, self).__init__()
+        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn2 = nn.BatchNorm2d(out_planes)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = F.relu(self.bn2(self.conv2(out)))
+        return out
+
+
+class MobileNet(nn.Module):
+    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
+    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]
+
+    def __init__(self, num_classes=10):
+        super(MobileNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.layers = self._make_layers(in_planes=32)
+        self.linear = nn.Linear(1024, num_classes)
+
+    def _make_layers(self, in_planes):
+        layers = []
+        for x in self.cfg:
+            out_planes = x if isinstance(x, int) else x[0]
+            stride = 1 if isinstance(x, int) else x[1]
+            layers.append(Block(in_planes, out_planes, stride))
+            in_planes = out_planes
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layers(out)
+        out = F.avg_pool2d(out, 2)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def test():
+    net = MobileNet()
+    x = torch.randn(1,3,32,32)
+    y = net(x)
+    print(y.size())
+
+# test()
diff --git a/llvm/projects/pred_tuner/models/torch/mobilenetv2.py b/llvm/projects/pred_tuner/models/torch/mobilenetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..17e5823ef4426ceceae462782a267f89b1ecbc76
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/mobilenetv2.py
@@ -0,0 +1,86 @@
+'''MobileNetV2 in PyTorch.
+
+See the paper "Inverted Residuals and Linear Bottlenecks:
+Mobile Networks for Classification, Detection and Segmentation" for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Block(nn.Module):
+    '''expand + depthwise + pointwise'''
+    def __init__(self, in_planes, out_planes, expansion, stride):
+        super(Block, self).__init__()
+        self.stride = stride
+
+        planes = expansion * in_planes
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+
+        self.shortcut = nn.Sequential()
+        if stride == 1 and in_planes != out_planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
+                nn.BatchNorm2d(out_planes),
+            )
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        out = out + self.shortcut(x) if self.stride==1 else out
+        return out
+
+
+class MobileNetV2(nn.Module):
+    # (expansion, out_planes, num_blocks, stride)
+    cfg = [(1,  16, 1, 1),
+           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
+           (6,  32, 3, 2),
+           (6,  64, 4, 2),
+           (6,  96, 3, 1),
+           (6, 160, 3, 2),
+           (6, 320, 1, 1)]
+
+    def __init__(self, num_classes=10):
+        super(MobileNetV2, self).__init__()
+        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
+        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.layers = self._make_layers(in_planes=32)
+        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn2 = nn.BatchNorm2d(1280)
+        self.linear = nn.Linear(1280, num_classes)
+
+    def _make_layers(self, in_planes):
+        layers = []
+        for expansion, out_planes, num_blocks, stride in self.cfg:
+            strides = [stride] + [1]*(num_blocks-1)
+            for stride in strides:
+                layers.append(Block(in_planes, out_planes, expansion, stride))
+                in_planes = out_planes
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layers(out)
+        out = F.relu(self.bn2(self.conv2(out)))
+        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def test():
+    net = MobileNetV2()
+    x = torch.randn(2,3,32,32)
+    y = net(x)
+    print(y.size())
+
+# test()
diff --git a/llvm/projects/pred_tuner/models/torch/pnasnet.py b/llvm/projects/pred_tuner/models/torch/pnasnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..de8c4d51f2667f84eab86f29be9a00ea7d0ad1c3
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/pnasnet.py
@@ -0,0 +1,125 @@
+'''PNASNet in PyTorch.
+
+Paper: Progressive Neural Architecture Search
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class SepConv(nn.Module):
+    '''Separable Convolution.'''
+    def __init__(self, in_planes, out_planes, kernel_size, stride):
+        super(SepConv, self).__init__()
+        self.conv1 = nn.Conv2d(in_planes, out_planes,
+                               kernel_size, stride,
+                               padding=(kernel_size-1)//2,
+                               bias=False, groups=in_planes)
+        self.bn1 = nn.BatchNorm2d(out_planes)
+
+    def forward(self, x):
+        return self.bn1(self.conv1(x))
+
+
+class CellA(nn.Module):
+    def __init__(self, in_planes, out_planes, stride=1):
+        super(CellA, self).__init__()
+        self.stride = stride
+        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
+        if stride==2:
+            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+            self.bn1 = nn.BatchNorm2d(out_planes)
+
+    def forward(self, x):
+        y1 = self.sep_conv1(x)
+        y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
+        if self.stride==2:
+            y2 = self.bn1(self.conv1(y2))
+        return F.relu(y1+y2)
+
+class CellB(nn.Module):
+    def __init__(self, in_planes, out_planes, stride=1):
+        super(CellB, self).__init__()
+        self.stride = stride
+        # Left branch
+        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
+        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)
+        # Right branch
+        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)
+        if stride==2:
+            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+            self.bn1 = nn.BatchNorm2d(out_planes)
+        # Reduce channels
+        self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn2 = nn.BatchNorm2d(out_planes)
+
+    def forward(self, x):
+        # Left branch
+        y1 = self.sep_conv1(x)
+        y2 = self.sep_conv2(x)
+        # Right branch
+        y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
+        if self.stride==2:
+            y3 = self.bn1(self.conv1(y3))
+        y4 = self.sep_conv3(x)
+        # Concat & reduce channels
+        b1 = F.relu(y1+y2)
+        b2 = F.relu(y3+y4)
+        y = torch.cat([b1,b2], 1)
+        return F.relu(self.bn2(self.conv2(y)))
+
+class PNASNet(nn.Module):
+    def __init__(self, cell_type, num_cells, num_planes):
+        super(PNASNet, self).__init__()
+        self.in_planes = num_planes
+        self.cell_type = cell_type
+
+        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(num_planes)
+
+        self.layer1 = self._make_layer(num_planes, num_cells=6)
+        self.layer2 = self._downsample(num_planes*2)
+        self.layer3 = self._make_layer(num_planes*2, num_cells=6)
+        self.layer4 = self._downsample(num_planes*4)
+        self.layer5 = self._make_layer(num_planes*4, num_cells=6)
+
+        self.linear = nn.Linear(num_planes*4, 10)
+
+    def _make_layer(self, planes, num_cells):
+        layers = []
+        for _ in range(num_cells):
+            layers.append(self.cell_type(self.in_planes, planes, stride=1))
+            self.in_planes = planes
+        return nn.Sequential(*layers)
+
+    def _downsample(self, planes):
+        layer = self.cell_type(self.in_planes, planes, stride=2)
+        self.in_planes = planes
+        return layer
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        out = self.layer5(out)
+        out = F.avg_pool2d(out, 8)
+        out = self.linear(out.view(out.size(0), -1))
+        return out
+
+
+def PNASNetA():
+    return PNASNet(CellA, num_cells=6, num_planes=44)
+
+def PNASNetB():
+    return PNASNet(CellB, num_cells=6, num_planes=32)
+
+
+def test():
+    net = PNASNetB()
+    x = torch.randn(1,3,32,32)
+    y = net(x)
+    print(y)
+
+# test()
diff --git a/llvm/projects/pred_tuner/models/torch/preact_resnet.py b/llvm/projects/pred_tuner/models/torch/preact_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..abb1bc313c011d2ee650c353c515e2cd404503f3
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/preact_resnet.py
@@ -0,0 +1,118 @@
+'''Pre-activation ResNet in PyTorch.
+
+Reference:
+[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
+    Identity Mappings in Deep Residual Networks. arXiv:1603.05027
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class PreActBlock(nn.Module):
+    '''Pre-activation version of the BasicBlock.'''
+    expansion = 1
+
+    def __init__(self, in_planes, planes, stride=1):
+        super(PreActBlock, self).__init__()
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+
+        if stride != 1 or in_planes != self.expansion*planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
+            )
+
+    def forward(self, x):
+        out = F.relu(self.bn1(x))
+        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
+        out = self.conv1(out)
+        out = self.conv2(F.relu(self.bn2(out)))
+        out += shortcut
+        return out
+
+
+class PreActBottleneck(nn.Module):
+    '''Pre-activation version of the original Bottleneck module.'''
+    expansion = 4
+
+    def __init__(self, in_planes, planes, stride=1):
+        super(PreActBottleneck, self).__init__()
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
+
+        if stride != 1 or in_planes != self.expansion*planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
+            )
+
+    def forward(self, x):
+        out = F.relu(self.bn1(x))
+        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
+        out = self.conv1(out)
+        out = self.conv2(F.relu(self.bn2(out)))
+        out = self.conv3(F.relu(self.bn3(out)))
+        out += shortcut
+        return out
+
+
+class PreActResNet(nn.Module):
+    def __init__(self, block, num_blocks, num_classes=10):
+        super(PreActResNet, self).__init__()
+        self.in_planes = 64
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
+        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
+        self.linear = nn.Linear(512*block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        strides = [stride] + [1]*(num_blocks-1)
+        layers = []
+        for stride in strides:
+            layers.append(block(self.in_planes, planes, stride))
+            self.in_planes = planes * block.expansion
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def PreActResNet18():
+    return PreActResNet(PreActBlock, [2,2,2,2])
+
+def PreActResNet34():
+    return PreActResNet(PreActBlock, [3,4,6,3])
+
+def PreActResNet50():
+    return PreActResNet(PreActBottleneck, [3,4,6,3])
+
+def PreActResNet101():
+    return PreActResNet(PreActBottleneck, [3,4,23,3])
+
+def PreActResNet152():
+    return PreActResNet(PreActBottleneck, [3,8,36,3])
+
+
+def test():
+    net = PreActResNet18()
+    y = net((torch.randn(1,3,32,32)))
+    print(y.size())
+
+# test()
diff --git a/llvm/projects/pred_tuner/models/torch/resnet.py b/llvm/projects/pred_tuner/models/torch/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7c03ed134293e2a6a1dd373556e83978ef3d560
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/resnet.py
@@ -0,0 +1,122 @@
+"""ResNet in PyTorch.
+
+For Pre-activation ResNet, see 'preact_resnet.py'.
+
+Reference:
+[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
+    Deep Residual Learning for Image Recognition. arXiv:1512.03385
+"""
+import torch.nn as nn
+import torch.nn.functional as F
+
+from models.hpvm import HPVMConvBundle
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, in_planes, planes, stride=1):
+        super(BasicBlock, self).__init__()
+        self.conv1 = HPVMConvBundle(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu1 = nn.ReLU()
+        self.conv2 = HPVMConvBundle(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_planes != self.expansion * planes:
+            self.shortcut = nn.Sequential(
+                HPVMConvBundle(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(self.expansion * planes)
+            )
+        self.relu2 = nn.ReLU()
+
+    def forward(self, x):
+        out = self.relu1(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        out += self.shortcut(x)
+        out = self.relu2(out)
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, in_planes, planes, stride=1):
+        super(Bottleneck, self).__init__()
+        self.conv1 = HPVMConvBundle(in_planes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = HPVMConvBundle(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = HPVMConvBundle(planes, self.expansion * planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(self.expansion * planes)
+
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_planes != self.expansion * planes:
+            self.shortcut = nn.Sequential(
+                HPVMConvBundle(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(self.expansion * planes)
+            )
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        out += self.shortcut(x)
+        out = F.relu(out)
+        return out
+
+
+class ResNet(nn.Module):
+    def __init__(self, block, num_blocks, num_classes=10):
+        super(ResNet, self).__init__()
+        self.in_planes = 64
+
+        self.conv1 = HPVMConvBundle(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.relu = nn.ReLU()
+        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
+        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
+        self.avg_pool2d = nn.AvgPool2d(4)
+        self.linear = nn.Linear(512 * block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        strides = [stride] + [1] * (num_blocks - 1)
+        layers = []
+        for stride in strides:
+            layers.append(block(self.in_planes, planes, stride))
+            self.in_planes = planes * block.expansion
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        out = self.avg_pool2d(out)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def ResNet18():
+    return ResNet(BasicBlock, [2, 2, 2, 2])
+
+
+def ResNet34():
+    return ResNet(BasicBlock, [3, 4, 6, 3])
+
+
+def ResNet50():
+    return ResNet(Bottleneck, [3, 4, 6, 3])
+
+
+def ResNet101():
+    return ResNet(Bottleneck, [3, 4, 23, 3])
+
+
+def ResNet152():
+    return ResNet(Bottleneck, [3, 8, 36, 3])
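
This file also omits a test(); a sketch, again assuming HPVMConvBundle accepts plain tensors like nn.Conv2d:

    import torch
    net = ResNet18()
    y = net(torch.randn(1, 3, 32, 32))
    print(y.size())  # torch.Size([1, 10])
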
diff --git a/llvm/projects/pred_tuner/models/torch/resnext.py b/llvm/projects/pred_tuner/models/torch/resnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a08f3e7d9fdf3b65aad5b773d4d113c6b796423
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/resnext.py
@@ -0,0 +1,95 @@
+'''ResNeXt in PyTorch.
+
+See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Block(nn.Module):
+    '''Grouped convolution block.'''
+    expansion = 2
+
+    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
+        super(Block, self).__init__()
+        group_width = cardinality * bottleneck_width
+        self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(group_width)
+        self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)
+        self.bn2 = nn.BatchNorm2d(group_width)
+        self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(self.expansion*group_width)
+
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_planes != self.expansion*group_width:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(self.expansion*group_width)
+            )
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        out += self.shortcut(x)
+        out = F.relu(out)
+        return out
+
+
+class ResNeXt(nn.Module):
+    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
+        super(ResNeXt, self).__init__()
+        self.cardinality = cardinality
+        self.bottleneck_width = bottleneck_width
+        self.in_planes = 64
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.layer1 = self._make_layer(num_blocks[0], 1)
+        self.layer2 = self._make_layer(num_blocks[1], 2)
+        self.layer3 = self._make_layer(num_blocks[2], 2)
+        # self.layer4 = self._make_layer(num_blocks[3], 2)
+        self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes)
+
+    def _make_layer(self, num_blocks, stride):
+        strides = [stride] + [1]*(num_blocks-1)
+        layers = []
+        for stride in strides:
+            layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride))
+            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
+        # Double bottleneck_width after each stage.
+        self.bottleneck_width *= 2
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        # out = self.layer4(out)
+        out = F.avg_pool2d(out, 8)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def ResNeXt29_2x64d():
+    return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64)
+
+def ResNeXt29_4x64d():
+    return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64)
+
+def ResNeXt29_8x64d():
+    return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64)
+
+def ResNeXt29_32x4d():
+    return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4)
+
+def test_resnext():
+    net = ResNeXt29_2x64d()
+    x = torch.randn(1,3,32,32)
+    y = net(x)
+    print(y.size())
+
+# test_resnext()
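
The cardinality*bottleneck_width*8 fed to nn.Linear is Block.expansion (2) times the group width of the last stage, whose bottleneck_width has doubled twice; for ResNeXt29_32x4d:

    cardinality, bw, expansion = 32, 4, 2
    final_group_width = cardinality * (bw * 2 * 2)  # width doubles after stages 1 and 2
    print(expansion * final_group_width)            # 1024 == cardinality * bw * 8
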
diff --git a/llvm/projects/pred_tuner/models/torch/senet.py b/llvm/projects/pred_tuner/models/torch/senet.py
new file mode 100644
index 0000000000000000000000000000000000000000..98bfa0ca51dcd07b586432c9f9460be8d1f0b745
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/senet.py
@@ -0,0 +1,121 @@
+'''SENet in PyTorch.
+
+SENet is the winner of ImageNet-2017. Paper: "Squeeze-and-Excitation Networks" (arXiv:1709.01507).
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class BasicBlock(nn.Module):
+    def __init__(self, in_planes, planes, stride=1):
+        super(BasicBlock, self).__init__()
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_planes != planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes)
+            )
+
+        # SE layers
+        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
+        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+
+        # Squeeze
+        w = F.avg_pool2d(out, out.size(2))
+        w = F.relu(self.fc1(w))
+        w = torch.sigmoid(self.fc2(w))  # F.sigmoid is deprecated
+        # Excitation
+        out = out * w  # New broadcasting feature from v0.2!
+
+        out += self.shortcut(x)
+        out = F.relu(out)
+        return out
+
+
+class PreActBlock(nn.Module):
+    def __init__(self, in_planes, planes, stride=1):
+        super(PreActBlock, self).__init__()
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+
+        if stride != 1 or in_planes != planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)
+            )
+
+        # SE layers
+        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
+        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(x))
+        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
+        out = self.conv1(out)
+        out = self.conv2(F.relu(self.bn2(out)))
+
+        # Squeeze
+        w = F.avg_pool2d(out, out.size(2))
+        w = F.relu(self.fc1(w))
+        w = torch.sigmoid(self.fc2(w))  # F.sigmoid is deprecated
+        # Excitation
+        out = out * w
+
+        out += shortcut
+        return out
+
+
+class SENet(nn.Module):
+    def __init__(self, block, num_blocks, num_classes=10):
+        super(SENet, self).__init__()
+        self.in_planes = 64
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)
+        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
+        self.linear = nn.Linear(512, num_classes)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        strides = [stride] + [1]*(num_blocks-1)
+        layers = []
+        for stride in strides:
+            layers.append(block(self.in_planes, planes, stride))
+            self.in_planes = planes
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def SENet18():
+    return SENet(PreActBlock, [2,2,2,2])
+
+
+def test():
+    net = SENet18()
+    y = net(torch.randn(1,3,32,32))
+    print(y.size())
+
+# test()
diff --git a/llvm/projects/pred_tuner/models/torch/shufflenet.py b/llvm/projects/pred_tuner/models/torch/shufflenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..acff6f78266c55bb93f5b12a6306a5647ebb0769
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/shufflenet.py
@@ -0,0 +1,109 @@
+'''ShuffleNet in PyTorch.
+
+See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ShuffleBlock(nn.Module):
+    def __init__(self, groups):
+        super(ShuffleBlock, self).__init__()
+        self.groups = groups
+
+    def forward(self, x):
+        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
+        N,C,H,W = x.size()
+        g = self.groups
+        return x.view(N,g,C//g,H,W).permute(0,2,1,3,4).reshape(N,C,H,W)
+
+
+class Bottleneck(nn.Module):
+    def __init__(self, in_planes, out_planes, stride, groups):
+        super(Bottleneck, self).__init__()
+        self.stride = stride
+
+        mid_planes = out_planes // 4  # must be an int for nn.Conv2d channel counts
+        g = 1 if in_planes==24 else groups
+        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
+        self.bn1 = nn.BatchNorm2d(mid_planes)
+        self.shuffle1 = ShuffleBlock(groups=g)
+        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)
+        self.bn2 = nn.BatchNorm2d(mid_planes)
+        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+
+        self.shortcut = nn.Sequential()
+        if stride == 2:
+            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.shuffle1(out)
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        res = self.shortcut(x)
+        out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)
+        return out
+
+
+class ShuffleNet(nn.Module):
+    def __init__(self, cfg):
+        super(ShuffleNet, self).__init__()
+        out_planes = cfg['out_planes']
+        num_blocks = cfg['num_blocks']
+        groups = cfg['groups']
+
+        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(24)
+        self.in_planes = 24
+        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)
+        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)
+        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
+        self.linear = nn.Linear(out_planes[2], 10)
+
+    def _make_layer(self, out_planes, num_blocks, groups):
+        layers = []
+        for i in range(num_blocks):
+            stride = 2 if i == 0 else 1
+            cat_planes = self.in_planes if i == 0 else 0
+            layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups))
+            self.in_planes = out_planes
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+def ShuffleNetG2():
+    cfg = {
+        'out_planes': [200,400,800],
+        'num_blocks': [4,8,4],
+        'groups': 2
+    }
+    return ShuffleNet(cfg)
+
+def ShuffleNetG3():
+    cfg = {
+        'out_planes': [240,480,960],
+        'num_blocks': [4,8,4],
+        'groups': 3
+    }
+    return ShuffleNet(cfg)
+
+
+def test():
+    net = ShuffleNetG2()
+    x = torch.randn(1,3,32,32)
+    y = net(x)
+    print(y)
+
+# test()
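
The view/permute/reshape in ShuffleBlock interleaves channels across groups; a tiny trace with 8 labeled channels and g=2:

    import torch
    x = torch.arange(8).view(1, 8, 1, 1)  # channels labeled 0..7
    N, C, H, W = x.size()
    g = 2
    y = x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W)
    print(y.flatten().tolist())           # [0, 4, 1, 5, 2, 6, 3, 7]
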
diff --git a/llvm/projects/pred_tuner/models/torch/shufflenetv2.py b/llvm/projects/pred_tuner/models/torch/shufflenetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..eefcda32059f0b8575148098c78ff5d84effd388
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/shufflenetv2.py
@@ -0,0 +1,162 @@
+'''ShuffleNetV2 in PyTorch.
+
+See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ShuffleBlock(nn.Module):
+    def __init__(self, groups=2):
+        super(ShuffleBlock, self).__init__()
+        self.groups = groups
+
+    def forward(self, x):
+        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
+        N, C, H, W = x.size()
+        g = self.groups
+        return x.view(N, g, C//g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W)
+
+
+class SplitBlock(nn.Module):
+    def __init__(self, ratio):
+        super(SplitBlock, self).__init__()
+        self.ratio = ratio
+
+    def forward(self, x):
+        c = int(x.size(1) * self.ratio)
+        return x[:, :c, :, :], x[:, c:, :, :]
+
+
+class BasicBlock(nn.Module):
+    def __init__(self, in_channels, split_ratio=0.5):
+        super(BasicBlock, self).__init__()
+        self.split = SplitBlock(split_ratio)
+        in_channels = int(in_channels * split_ratio)
+        self.conv1 = nn.Conv2d(in_channels, in_channels,
+                               kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(in_channels)
+        self.conv2 = nn.Conv2d(in_channels, in_channels,
+                               kernel_size=3, stride=1, padding=1, groups=in_channels, bias=False)
+        self.bn2 = nn.BatchNorm2d(in_channels)
+        self.conv3 = nn.Conv2d(in_channels, in_channels,
+                               kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(in_channels)
+        self.shuffle = ShuffleBlock()
+
+    def forward(self, x):
+        x1, x2 = self.split(x)
+        out = F.relu(self.bn1(self.conv1(x2)))
+        out = self.bn2(self.conv2(out))
+        out = F.relu(self.bn3(self.conv3(out)))
+        out = torch.cat([x1, out], 1)
+        out = self.shuffle(out)
+        return out
+
+
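+# ShuffleNetV2 downsampling unit: two stride-2 branches (depthwise 3x3 then
+# 1x1 on the left; 1x1, depthwise 3x3, 1x1 on the right) are concatenated and
+# shuffled, halving the spatial size and going from in_channels to
+# out_channels.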
+class DownBlock(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(DownBlock, self).__init__()
+        mid_channels = out_channels // 2
+        # left
+        self.conv1 = nn.Conv2d(in_channels, in_channels,
+                               kernel_size=3, stride=2, padding=1, groups=in_channels, bias=False)
+        self.bn1 = nn.BatchNorm2d(in_channels)
+        self.conv2 = nn.Conv2d(in_channels, mid_channels,
+                               kernel_size=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(mid_channels)
+        # right
+        self.conv3 = nn.Conv2d(in_channels, mid_channels,
+                               kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(mid_channels)
+        self.conv4 = nn.Conv2d(mid_channels, mid_channels,
+                               kernel_size=3, stride=2, padding=1, groups=mid_channels, bias=False)
+        self.bn4 = nn.BatchNorm2d(mid_channels)
+        self.conv5 = nn.Conv2d(mid_channels, mid_channels,
+                               kernel_size=1, bias=False)
+        self.bn5 = nn.BatchNorm2d(mid_channels)
+
+        self.shuffle = ShuffleBlock()
+
+    def forward(self, x):
+        # left
+        out1 = self.bn1(self.conv1(x))
+        out1 = F.relu(self.bn2(self.conv2(out1)))
+        # right
+        out2 = F.relu(self.bn3(self.conv3(x)))
+        out2 = self.bn4(self.conv4(out2))
+        out2 = F.relu(self.bn5(self.conv5(out2)))
+        # concat
+        out = torch.cat([out1, out2], 1)
+        out = self.shuffle(out)
+        return out
+
+
+class ShuffleNetV2(nn.Module):
+    def __init__(self, net_size):
+        super(ShuffleNetV2, self).__init__()
+        out_channels = configs[net_size]['out_channels']
+        num_blocks = configs[net_size]['num_blocks']
+
+        self.conv1 = nn.Conv2d(3, 24, kernel_size=3,
+                               stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(24)
+        self.in_channels = 24
+        self.layer1 = self._make_layer(out_channels[0], num_blocks[0])
+        self.layer2 = self._make_layer(out_channels[1], num_blocks[1])
+        self.layer3 = self._make_layer(out_channels[2], num_blocks[2])
+        self.conv2 = nn.Conv2d(out_channels[2], out_channels[3],
+                               kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn2 = nn.BatchNorm2d(out_channels[3])
+        self.linear = nn.Linear(out_channels[3], 10)
+
+    def _make_layer(self, out_channels, num_blocks):
+        layers = [DownBlock(self.in_channels, out_channels)]
+        for _ in range(num_blocks):
+            layers.append(BasicBlock(out_channels))
+        self.in_channels = out_channels
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        # out = F.max_pool2d(out, 3, stride=2, padding=1)
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+configs = {
+    0.5: {
+        'out_channels': (48, 96, 192, 1024),
+        'num_blocks': (3, 7, 3)
+    },
+    1: {
+        'out_channels': (116, 232, 464, 1024),
+        'num_blocks': (3, 7, 3)
+    },
+    1.5: {
+        'out_channels': (176, 352, 704, 1024),
+        'num_blocks': (3, 7, 3)
+    },
+    2: {
+        'out_channels': (224, 488, 976, 2048),
+        'num_blocks': (3, 7, 3)
+    }
+}
+
+
+def test():
+    net = ShuffleNetV2(net_size=0.5)
+    x = torch.randn(3, 3, 32, 32)
+    y = net(x)
+    print(y.shape)
+
+
+# test()
diff --git a/llvm/projects/pred_tuner/models/torch/vgg.py b/llvm/projects/pred_tuner/models/torch/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..2650d2f4859bedcef0de53a60c58c36b706148af
--- /dev/null
+++ b/llvm/projects/pred_tuner/models/torch/vgg.py
@@ -0,0 +1,39 @@
+"""VGG11/13/16/19 in Pytorch."""
+import torch.nn as nn
+from models.hpvm import HPVMConvBundle
+
+
+cfg = {
+    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
+    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
+}
+
+
+class VGG(nn.Module):
+    def __init__(self, vgg_name):
+        super(VGG, self).__init__()
+        self.features = self._make_layers(cfg[vgg_name])
+        self.classifier = nn.Linear(512, 10)
+
+    def forward(self, x):
+        out = self.features(x)
+        out = out.view(out.size(0), -1)
+        out = self.classifier(out)
+        return out
+
+    @staticmethod
+    def _make_layers(config):
+        layers = []
+        in_channels = 3
+        for x in config:
+            if x == 'M':
+                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
+            else:
+                layers += [HPVMConvBundle(in_channels, x, kernel_size=3, padding=1),
+                           nn.BatchNorm2d(x),
+                           nn.ReLU(inplace=True)]
+                in_channels = x
+        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
+        return nn.Sequential(*layers)
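+
+
+# Illustrative usage (a sketch, assuming 32x32 CIFAR-style inputs):
+#   net = VGG('VGG16')
+#   logits = net(torch.randn(1, 3, 32, 32))  # -> shape [1, 10]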
diff --git a/llvm/projects/pred_tuner/run_tuner.py b/llvm/projects/pred_tuner/run_tuner.py
new file mode 100644
index 0000000000000000000000000000000000000000..5470763ae01b73b51702c413bd18254f4c5b0d2f
--- /dev/null
+++ b/llvm/projects/pred_tuner/run_tuner.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python
+#
+# Development-time tuner with algorithmic approximations.
+# Approximations: perforation and sampling, with varying knobs for rate and
+# skip offset.
+import copy
+import logging
+import os
+import shutil
+import time
+from pathlib import Path
+from typing import List, Tuple
+
+import numpy as np
+import opentuner
+from opentuner import ConfigurationManipulator, EnumParameter, MeasurementInterface
+from opentuner.measurement.inputmanager import FixedInputManager
+from opentuner.search.objective import ThresholdAccuracyMinimizeTime
+from opentuner.tuningrunmain import TuningRunMain
+from torch.nn import Module
+from tqdm import tqdm
+
+from exp import Benchmark, ConfigMeasurer, ExpState, TuningTime, batch_id, bench_tuner_data, is_dev_time
+from models import get_all_output, networks, QoS
+from toolkit import ConfigT
+from toolkit.estimators import WeightedLinearQoSEstimator
+from utils import Config, config, reapply_last_config
+
+msg_logger = logging.getLogger(__name__)
+use_proxy = False
+n_promise_valid_runs = 30
+confidence_level = 0.95
+
+
+def init_proxy(ni: ConfigMeasurer, pickle_path: Path):
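+    # Builds a proxy QoS estimator over the validation set: `threshold_eval`
+    # accepts a batch of candidate outputs only if the mean QoS degradation
+    # from the baseline validation QoS stays under 3.0.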
+    def acc_crit(inputs_):
+        return ni.get_qos(inputs_, ni.val_loader)
+
+    def threshold_eval(inputs_):
+        accs = np.array([acc_crit(x) for x in inputs_])
+        return ni.val_qos - accs.mean() < 3.0
+
+    def run_model(net: Module):
+        return get_all_output(net, ni.val_loader)
+
+    return WeightedLinearQoSEstimator(
+        ni.nas, run_model, acc_crit, threshold_eval, confidence_level, storage=pickle_path
+    )
+
+
+class Timer:
+    def __init__(self, timer_state: TuningTime, timer_name: str):
+        self.timer_state = timer_state
+        self.name = timer_name
+        self.start = None
+
+    def __enter__(self):
+        self.start = time.time()
+        return self
+
+    def __exit__(self, *args):
+        end = time.time()
+        interval = end - self.start
+        self.timer_state.add_timer(self.name, interval)
+
+
+class TunerDriver:
+    def __init__(self, bench: Benchmark):
+        self.bench = bench
+        msg_logger.info(f"Tuning for model {self.bench.model_name}")
+        # Initialize folder.
+        self._init_folder(bench)
+        # Take a snapshot of current code.
+        self.take_code_snapshot()
+        # Initialize network information and qos thresholds
+        self.net_info = ConfigMeasurer.init_from_bench(self.bench)
+        qoses = self.net_info.val_qos, self.net_info.test_qos
+        qos_type = self.net_info.val_qos.__class__
+        self.tuner_thres = qos_type.suggested_tuner_thresholds(self.net_info.val_qos)
+        self.val_thres = qos_type.suggested_val_threshold(self.net_info.val_qos)
+        self.test_thres = qos_type.suggested_test_threshold(self.net_info.test_qos)
+        # Tuner states.
+        self.states = ExpState(bench, qos_type, qoses)
+        # Current # of iteration. `ProxyTuner` will use this.
+        self.run_id, self.iter = 0, 0
+        # Initialize proxy.
+        if use_proxy:
+            self.proxy = init_proxy(self.net_info, self.bench.result_dir / 'proxy.pkl')
+        else:
+            self.proxy = None
+
+    @staticmethod
+    def _init_folder(bench: Benchmark):
+        def remove_file_or_folder(path: Path):
+            if path.is_dir():
+                shutil.rmtree(path)
+            elif path.is_file():
+                path.unlink()  # Removes file despite the surprising name
+
+        pickle_path = bench.result_dir / 'proxy.pkl'
+        # Remove everything in result folder except pickle file
+        if bench.result_dir.is_dir():
+            msg_logger.warning(f"!Cleaning existing result dir = {bench.result_dir}")
+            for child in bench.result_dir.glob('*'):
+                if child == pickle_path:
+                    continue
+                msg_logger.info(f"  !Removing {child}")
+                remove_file_or_folder(child)
+        # Create result folder if it doesn't exist
+        if not bench.result_dir.is_dir():
+            msg_logger.info(f"Creating output directory = {bench.result_dir}")
+            os.makedirs(bench.result_dir)
+
+    def get_default_args(self):
+        args = opentuner.default_argparser().parse_args()
+        args.database = f"opentuner.db/{batch_id}.db"
+        args.test_limit = self.bench.autotuner_runs
+        parent = Path(args.database).parent
+        if not parent.is_dir():
+            os.makedirs(parent, exist_ok=True)
+        return args
+
+    def tuner_exec(self):
+        # Get default opentuner args
+        args = self.get_default_args()
+        # Start tuning for each threshold
+        for i, thres in enumerate(self.tuner_thres):
+            with Timer(self.states.timers, f"tuning_{i}"):
+                msg_logger.info(
+                    f"Tuning goal: qos >= {thres}; keeping configs with qos >= {self.val_thres}"
+                )
+                tuner = ProxyTuner(args, self, thres, self.val_thres)
+                # TuningRunMain.__init__ initializes its own logger, so we'll reapply our settings.
+                tuning_main = TuningRunMain(tuner, args)
+                reapply_last_config()
+                # Unleash the tuner!
+                tuning_main.main()
+                # Remove tuner progress bar
+                tuner.pbar.close()
+                self.run_id += 1
+                self.iter = 0
+        # Postprocess configs
+        self.process_configs()
+
+    def calibrate_write_configs(self, configs: List[Config], is_test_set: bool):
+        write_to = self.states.tested_configs if is_test_set else self.states.validated_configs
+        gold_acc = self.net_info.test_qos if is_test_set else self.net_info.val_qos
+        for cfg in tqdm(configs, leave=False):
+            cfg: Config = copy.deepcopy(cfg)
+            flags = dict(enumerate(cfg.flags))
+            measured_acc, confidence = self.net_info.actual_measure(
+                flags, cfg.total_runs, is_test_set, threshold=self.val_thres
+            )
+            prev_acc = cfg.avg_qos
+            cfg.update_acc(measured_acc, confidence, gold_acc)
+            new_acc = cfg.avg_qos
+            msg_logger.debug(f"{prev_acc} (mean) -> {new_acc} (mean)")
+            write_to.append(cfg)
+        write_to.finalize_dump()
+
+    @staticmethod
+    def filter_configs(
+            validation: List[Config], test: List[Config],
+            vali_threshold: QoS, test_threshold: QoS
+    ) -> Tuple[List[Config], List[Config]]:
+        # Filter validation and test set by their respective thresholds
+        filtered_validation = [
+            c for c in validation if c.avg_loss <= vali_threshold
+        ]
+        filtered_test = [
+            c for c in test if c.avg_loss <= test_threshold
+        ]
+        # Test configs also need to be a subset of validation configs.
+        name_to_filtered = {x.fname: x for x in filtered_test}
+        intersect_names = set(name_to_filtered).intersection(
+            x.fname for x in filtered_validation
+        )
+        filtered_test_ = [name_to_filtered[fname] for fname in intersect_names]
+        return filtered_validation, filtered_test_
+
+    def process_configs(self):
+        # Finalize all configs because tuning is done.
+        # (this may not do anything now but will in the future)
+        self.states.all_configs.finalize_dump()
+        all_configs = self.states.all_configs.configs
+        # Pre-filter configs by a wide pareto margin
+        filtered_configs = config.is_pareto_efficient(all_configs, ratio=0.05, n_min=50, n_max=50)
+        msg_logger.info(f"Prefilter yields {len(filtered_configs)} configs from {len(all_configs)}")
+        self.states.filtered_configs.finalize_dump(with_configs=filtered_configs)
+        # Calibrate prefiltered configs (validation step)
+        with Timer(self.states.timers, "validate"):
+            self.calibrate_write_configs(filtered_configs, is_test_set=False)
+            validated_configs = self.states.validated_configs.configs
+        # Calibrate prefiltered configs on test set (test step)
+        with Timer(self.states.timers, "test"):
+            self.calibrate_write_configs(filtered_configs, is_test_set=True)
+            tested_configs = self.states.tested_configs.configs
+        # Filter valid and test set configs by thresholds
+        valid_configs, test_configs = self.filter_configs(
+            validated_configs, tested_configs, self.val_thres, self.test_thres
+        )
+        self.states.valid_configs.finalize_dump(valid_configs)
+        self.states.test_configs.finalize_dump(test_configs)
+        # Finalize data input and plot everything.
+        self.states.finalize_plot()
+
+    def take_code_snapshot(self):
+        import git
+        msg_logger.info(f"Taking git snapshot")
+        ref_dir = self.bench.result_dir / "references"
+        os.mkdir(ref_dir)
+        # Write current git commit (SHA id)
+        repo = git.Repo(search_parent_directories=True)
+        sha = repo.head.object.hexsha
+        msg_logger.info(f"Current code is at commit {sha}")
+        with (ref_dir / 'git_commit.txt').open('w') as f:
+            f.write(sha)
+        # Also put all outstanding code change in a diff file.
+        # This way changes in all git-tracked files are captured.
+        t = repo.head.commit.tree
+        with (ref_dir / 'diff.txt').open('w') as f:
+            f.write(repo.git.diff(t))
+
+    def make_config_name(self) -> str:
+        return f"{self.bench.model_name}_{self.run_id}_{self.iter}"
+
+    def get_accuracy(self, cfg: ConfigT) -> Tuple[QoS, QoS, int]:
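+        # Knob IDs 1..7 are assumed to be the non-deterministic (PROMISE)
+        # knobs; configs that use any of them are validated over
+        # n_promise_valid_runs runs instead of a single run.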
+        has_promise_flags = set(cfg.values()).intersection(set(range(1, 7 + 1)))
+        config_validation_runs = n_promise_valid_runs if has_promise_flags else 1
+        if use_proxy:
+            mean_acc, confidence_acc = self.net_info.proxy_estimate(cfg, self.proxy)
+            assert has_promise_flags or (mean_acc == confidence_acc)
+        else:
+            mean_acc, _ = self.net_info.actual_measure(cfg, 1, is_test_set=False)
+            confidence_acc = mean_acc
+        return mean_acc, confidence_acc, config_validation_runs
+
+
+class ProxyTuner(MeasurementInterface):
+    def __init__(self, args, driver: TunerDriver, tuner_thres: QoS, accept_thres: QoS):
+        self.tuner_driver = driver
+        self.model_info = driver.net_info
+        self.bench = driver.bench
+        self.tuner_thres = tuner_thres
+        self.all_configs = driver.states.all_configs
+        self.pbar = tqdm(total=args.test_limit, leave=False)
+        objective = ThresholdAccuracyMinimizeTime(tuner_thres.to_scalar())
+        input_manager = FixedInputManager(size=driver.bench.get_n_layers())
+        super(ProxyTuner, self).__init__(
+            args, program_name=self.bench.model_name,
+            input_manager=input_manager, objective=objective
+        )
+        self.accept_thres = accept_thres
+
+    def manipulator(self) -> ConfigurationManipulator:
+        """Define the search space by creating a ConfigurationManipulator."""
+        manipulator = ConfigurationManipulator()
+        for ext_layer_id, knobs in self.model_info.get_knobs().items():
+            manipulator.add_parameter(EnumParameter(ext_layer_id, knobs))
+        return manipulator
+
+    def seed_configurations(self):
+        """Provide baseline config as seed if model uses seed."""
+        return [self.bench.get_baseline_config(not is_dev_time)] if self.bench.use_seed else []
+
+    def run(self, desired_result, input_, limit):
+        """Run a given configuration then return performance and accuracy."""
+        cfg: ConfigT = desired_result.configuration.data
+        # get_accuracy gives estimation of mean accuracy and 95% confident accuracy
+        mean_acc, confident_acc, n_runs = self.tuner_driver.get_accuracy(cfg)
+        # getConfigCost returns the cost associated with the selected configuration
+        total_comps, speedup = self.bench.compute_config_cost(cfg)
+        Result = opentuner.resultsdb.models.Result()
+        Result.time = total_comps
+        # Convert QoS to scalar, because opentuner does not support custom comparable datatype
+        Result.accuracy = confident_acc.to_scalar(relative_to=self.tuner_thres)
+
+        # If accuracy is acceptable, write this config
+        if confident_acc > self.accept_thres:
+            config_name = self.tuner_driver.make_config_name()
+            cfg_values = [cfg[layer] for layer in sorted(cfg.keys())]
+            writing_config = Config(
+                mean_acc, self.model_info.val_qos, config_name, cfg_values,
+                n_runs, 95.0, total_comps, speedup
+            )
+            self.all_configs.append(writing_config)
+            msg_logger.debug(
+                f"Config chosen with accuracy (mean) = {mean_acc}, (95%) = {confident_acc} "
+                f"and speedup = {speedup}"
+            )
+        self.tuner_driver.iter += 1
+        self.pbar.update()
+        return Result
+
+    def save_final_config(self, configuration):
+        """Print final configuration."""
+        msg_logger.info(f"Final configuration {configuration.data}")
+        msg_logger.info("Done with Autotuning run")
+
+
+if __name__ == '__main__':
+    assert set(networks.keys()).issubset(set(bench_tuner_data.keys()))
+    for network in ('alexnet2_hpvm',):
+        bench_: Benchmark = bench_tuner_data[network]
+        TunerDriver(bench_).tuner_exec()
diff --git a/llvm/projects/pred_tuner/tests/data/1_1_output.json b/llvm/projects/pred_tuner/tests/data/1_1_output.json
new file mode 100644
index 0000000000000000000000000000000000000000..3892ae9622a1af68e92b11408372e3d88278ed6a
--- /dev/null
+++ b/llvm/projects/pred_tuner/tests/data/1_1_output.json
@@ -0,0 +1,98 @@
+{
+  "('0', '0', '1', '1', '2', '0')": {
+    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvSampSim": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,",
+    "ConvApprox": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,",
+    "ConvApproxHalf2": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,"
+  },
+  "('0', '0', '1', '1', '2', '1')": {
+    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvSampSim": "40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,",
+    "ConvApprox": "40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,",
+    "ConvApproxHalf2": "40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,"
+  },
+  "('0', '0', '1', '1', '3', '0')": {
+    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvApprox": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvApproxHalf2": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,"
+  },
+  "('0', '0', '1', '1', '3', '1')": {
+    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvApprox": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvApproxHalf2": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,"
+  },
+  "('0', '0', '1', '1', '4', '0')": {
+    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvSampSim": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,",
+    "ConvApprox": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,",
+    "ConvApproxHalf2": "31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,"
+  },
+  "('0', '0', '1', '1', '4', '1')": {
+    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
+    "ConvSampSim": "37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,",
+    "ConvApprox": "37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,",
+    "ConvApproxHalf2": "37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,"
+  },
+  "('1', '1', '1', '1', '2', '0')": {
+    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"
+  },
+  "('1', '1', '1', '1', '2', '1')": {
+    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"
+  },
+  "('1', '1', '1', '1', '3', '0')": {
+    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"
+  },
+  "('1', '1', '1', '1', '3', '1')": {
+    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"
+  },
+  "('1', '1', '1', '1', '4', '0')": {
+    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,"
+  },
+  "('1', '1', '1', '1', '4', '1')": {
+    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,",
+    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,"
+  }
+}
diff --git a/llvm/projects/pred_tuner/tests/data/3_3_output.json b/llvm/projects/pred_tuner/tests/data/3_3_output.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ccb23c01c7faff1e1c296f5d5bb667633327687
--- /dev/null
+++ b/llvm/projects/pred_tuner/tests/data/3_3_output.json
@@ -0,0 +1,146 @@
+{
+  "('0', '0', '1', '1', '2', '0')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "26.000000,26.000000,26.000000,26.000000,",
+    "ConvApprox": "26.000000,26.000000,26.000000,26.000000,",
+    "ConvApproxHalf2": "26.000000,26.000000,26.000000,26.000000,"
+  },
+  "('0', '0', '1', '1', '2', '1')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "56.000000,56.000000,56.000000,56.000000,",
+    "ConvApprox": "56.000000,56.000000,56.000000,56.000000,",
+    "ConvApproxHalf2": "56.000000,56.000000,56.000000,56.000000,"
+  },
+  "('0', '0', '1', '1', '3', '0')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "39.000000,39.000000,39.000000,39.000000,",
+    "ConvApprox": "39.000000,39.000000,39.000000,39.000000,",
+    "ConvApproxHalf2": "39.000000,39.000000,39.000000,39.000000,"
+  },
+  "('0', '0', '1', '1', '3', '1')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "42.000000,42.000000,42.000000,42.000000,",
+    "ConvApprox": "42.000000,42.000000,42.000000,42.000000,",
+    "ConvApproxHalf2": "42.000000,42.000000,42.000000,42.000000,"
+  },
+  "('0', '0', '1', '1', '4', '0')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,",
+    "ConvApprox": "36.000000,36.000000,36.000000,36.000000,",
+    "ConvApproxHalf2": "35.968750,35.968750,35.968750,35.968750,"
+  },
+  "('0', '0', '1', '1', '4', '1')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "45.333336,45.333336,45.333336,45.333336,",
+    "ConvApprox": "45.333336,45.333336,45.333336,45.333336,",
+    "ConvApproxHalf2": "45.312500,45.312500,45.312500,45.312500,"
+  },
+  "('1', '1', '1', '1', '2', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,",
+    "ConvApprox": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,",
+    "ConvApproxHalf2": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,"
+  },
+  "('1', '1', '1', '1', '2', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,",
+    "ConvApprox": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,",
+    "ConvApproxHalf2": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,"
+  },
+  "('1', '1', '1', '1', '3', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvApprox": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvApproxHalf2": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,"
+  },
+  "('1', '1', '1', '1', '3', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvApprox": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvApproxHalf2": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,"
+  },
+  "('1', '1', '1', '1', '4', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "16.000000,22.666666,22.666666,13.333333,25.333334,36.000000,36.000000,22.666668,25.333334,36.000000,36.000000,22.666668,18.666666,25.333334,25.333334,16.000000,",
+    "ConvApprox": "16.000000,22.666666,22.666666,13.333333,25.333334,36.000000,36.000000,22.666668,25.333334,36.000000,36.000000,22.666668,18.666666,25.333334,25.333334,16.000000,",
+    "ConvApproxHalf2": "16.000000,22.671875,22.671875,13.328125,25.328125,35.968750,35.968750,22.656250,25.328125,35.968750,35.968750,22.656250,18.671875,25.328125,25.328125,16.000000,"
+  },
+  "('1', '1', '1', '1', '4', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "18.666668,29.333332,29.333332,20.000000,29.333332,45.333336,45.333336,29.333332,29.333332,45.333336,45.333336,29.333332,20.000000,29.333332,29.333332,18.666668,",
+    "ConvApprox": "18.666668,29.333332,29.333332,20.000000,29.333332,45.333336,45.333336,29.333332,29.333332,45.333336,45.333336,29.333332,20.000000,29.333332,29.333332,18.666668,",
+    "ConvApproxHalf2": "18.656250,29.343750,29.343750,20.000000,29.328125,45.312500,45.312500,29.343750,29.328125,45.312500,45.312500,29.343750,20.000000,29.328125,29.328125,18.656250,"
+  },
+  "('1', '1', '2', '2', '2', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "12.000000,18.000000,18.000000,26.000000,",
+    "ConvApprox": "12.000000,18.000000,18.000000,26.000000,",
+    "ConvApproxHalf2": "12.000000,18.000000,18.000000,26.000000,"
+  },
+  "('1', '1', '2', '2', '2', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "24.000000,36.000000,36.000000,56.000000,",
+    "ConvApprox": "24.000000,36.000000,36.000000,56.000000,",
+    "ConvApproxHalf2": "24.000000,36.000000,36.000000,56.000000,"
+  },
+  "('1', '1', '2', '2', '3', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "18.000000,27.000000,25.500000,39.000000,",
+    "ConvApprox": "18.000000,27.000000,25.500000,39.000000,",
+    "ConvApproxHalf2": "18.000000,27.000000,25.500000,39.000000,"
+  },
+  "('1', '1', '2', '2', '3', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "18.000000,27.000000,28.500000,42.000000,",
+    "ConvApprox": "18.000000,27.000000,28.500000,42.000000,",
+    "ConvApproxHalf2": "18.000000,27.000000,28.500000,42.000000,"
+  },
+  "('1', '1', '2', '2', '4', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "16.000000,22.666666,25.333334,36.000000,",
+    "ConvApprox": "16.000000,22.666666,25.333334,36.000000,",
+    "ConvApproxHalf2": "16.000000,22.671875,25.328125,35.968750,"
+  },
+  "('1', '1', '2', '2', '4', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "18.666668,29.333332,29.333332,45.333336,",
+    "ConvApprox": "18.666668,29.333332,29.333332,45.333336,",
+    "ConvApproxHalf2": "18.656250,29.343750,29.328125,45.312500,"
+  }
+}
\ No newline at end of file
diff --git a/llvm/projects/pred_tuner/tests/data/promise.json b/llvm/projects/pred_tuner/tests/data/promise.json
new file mode 100644
index 0000000000000000000000000000000000000000..331ff8527a17a4ff26965e7252cc49a4c409375a
--- /dev/null
+++ b/llvm/projects/pred_tuner/tests/data/promise.json
@@ -0,0 +1,121 @@
+{
+  "1": [
+    [
+      -0.980938,
+      -1.976522,
+      -2.999873,
+      -4.095768,
+      -5.115182,
+      0.0,
+      5.075658,
+      3.972848,
+      2.912783,
+      2.051733,
+      1.004169,
+      1.002379
+    ],
+    45.213196
+  ],
+  "2": [
+    [
+      -1.017428,
+      -2.01491,
+      -2.951011,
+      -4.042611,
+      -4.954911,
+      0.0,
+      5.05412,
+      3.951638,
+      2.94989,
+      1.99723,
+      1.001167,
+      0.98796
+    ],
+    12.535809
+  ],
+  "3": [
+    [
+      -1.003108,
+      -2.006269,
+      -3.00263,
+      -3.97216,
+      -4.969401,
+      0.0,
+      5.012199,
+      4.028375,
+      2.950729,
+      2.004691,
+      1.004823,
+      0.991805
+    ],
+    4.886813
+  ],
+  "4": [
+    [
+      -1.006497,
+      -1.975768,
+      -3.031142,
+      -4.02248,
+      -5.061712,
+      0.0,
+      5.017349,
+      3.992676,
+      2.998843,
+      2.002693,
+      0.997514,
+      1.00649
+    ],
+    3.129643
+  ],
+  "5": [
+    [
+      -1.001629,
+      -1.976943,
+      -2.982565,
+      -3.964559,
+      -4.99636,
+      0.0,
+      4.992359,
+      3.984341,
+      2.990126,
+      2.005831,
+      1.000539,
+      1.003548
+    ],
+    2.181237
+  ],
+  "6": [
+    [
+      -1.003159,
+      -1.985892,
+      -3.005964,
+      -4.008651,
+      -4.992874,
+      0.0,
+      4.996098,
+      4.012099,
+      3.001986,
+      2.001431,
+      0.996138,
+      0.997394
+    ],
+    1.362949
+  ],
+  "7": [
+    [
+      -1.003133,
+      -1.99733,
+      -3.00755,
+      -4.007799,
+      -5.003314,
+      0.0,
+      5.000926,
+      3.993208,
+      2.988745,
+      2.00329,
+      0.99986,
+      0.995669
+    ],
+    0.6926
+  ]
+}
\ No newline at end of file
diff --git a/llvm/projects/pred_tuner/tests/data/quantization.json b/llvm/projects/pred_tuner/tests/data/quantization.json
new file mode 100644
index 0000000000000000000000000000000000000000..723eaa2b55bc067689beae34829d27d478a0c727
--- /dev/null
+++ b/llvm/projects/pred_tuner/tests/data/quantization.json
@@ -0,0 +1,58 @@
+{
+  "(-4, 6)": [
+    -0.132812,
+    -4.0,
+    0.179688,
+    -0.40625,
+    1.664062,
+    -2.90625,
+    0.6875,
+    0.960938,
+    6.0,
+    6.0,
+    2.484375,
+    2.992188
+  ],
+  "(-2, 2)": [
+    -0.109375,
+    -2.0,
+    0.1875,
+    -0.40625,
+    1.6875,
+    -2.0,
+    0.6875,
+    0.984375,
+    2.0,
+    2.0,
+    2.0,
+    2.0
+  ],
+  "(-25, 8)": [
+    -0.121094,
+    -25.0,
+    0.136719,
+    -0.507812,
+    1.683594,
+    -2.957031,
+    0.652344,
+    0.910156,
+    6.96875,
+    7.097656,
+    2.457031,
+    2.972656
+  ],
+  "(-10, 10)": [
+    -0.15625,
+    -10.0,
+    0.15625,
+    -0.46875,
+    1.640625,
+    -2.96875,
+    0.625,
+    0.9375,
+    6.953125,
+    7.1875,
+    2.5,
+    2.96875
+  ]
+}
\ No newline at end of file
diff --git a/llvm/projects/pred_tuner/tests/promise.py b/llvm/projects/pred_tuner/tests/promise.py
new file mode 100644
index 0000000000000000000000000000000000000000..59506d94251bfac4909b2236dc9480eb17b9ed70
--- /dev/null
+++ b/llvm/projects/pred_tuner/tests/promise.py
@@ -0,0 +1,87 @@
+import json
+from pathlib import Path
+
+import torch
+
+from toolkit import ModuleIndexer, NetApproxSelector
+from toolkit.approxdnn import PromiseSim, quantize_256
+from utils import compute_accuracy, init_by_name, run_concat_output
+
+eps = 1e-5
+delta = 0.05  # Allow for some variance in promise testing
+
+
+def gt_eps(tensor: torch.Tensor) -> bool:
+    return torch.any(tensor.abs() > eps).item()
+
+
+def compare_quant(groundtruth: dict):
+    input_tensor = torch.tensor([-0.1, -25, 0.2, -0.4, 1.7, -2.9, 0.7, 0.99, 7, 7.2, 2.5, 3])
+    for k, v in groundtruth.items():
+        from ast import literal_eval as make_tuple
+        gt = torch.tensor(v)
+        ours = quantize_256(input_tensor, *make_tuple(k))
+        if gt_eps(gt - ours):
+            print(
+                f"Quantization results differ by more than eps = {eps};\n"
+                f"parameters = {k}\ngroundtruth = {gt}\nours = {ours}"
+            )
+            raise RuntimeError
+
+
+def compare_promise(groundtruth: dict):
+    input_tensor = torch.tensor([-1, -2, -3, -4, -5, 0, 5, 4, 3, 2, 1, 1], dtype=torch.float)
+    N = 1000
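+    # Monte-Carlo check: average the noisy outputs and accumulate the mean squared
+    # error over N runs before comparing against the recorded groundtruth error.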
+    for k, (gt_avg, gt_error) in groundtruth.items():
+        gt_avg = torch.tensor(gt_avg)
+        sum_, our_error = torch.zeros_like(input_tensor, dtype=torch.float), 0
+        for _ in range(N):
+            out = PromiseSim.add_promise_noise(input_tensor, int(k))
+            sum_ += out
+            our_error += torch.sum((out - input_tensor) ** 2).item()
+        our_avg = sum_ / N
+        our_error = our_error / N
+        print("groundtruth avg:", gt_avg, "our avg:", our_avg)
+        if abs(our_error - gt_error) > delta * max(our_error, gt_error):
+            print(
+                f"Promise results differ by more than delta = {delta * 100:.1f}%;\n"
+                f"swing = {k}, groundtruth error = {gt_error}\nours = {our_error}"
+            )
+            raise RuntimeError
+
+
+def is_in_range(mean1: float, std1: float, mean2: float) -> bool:
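+    # Accept `mean2` if it lies within 3 standard deviations (`std1`) of `mean1`.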
+    return mean1 - 3.0 * std1 < mean2 < mean1 + 3.0 * std1
+
+
+def compare_accuracy():
+    baseline, testloader, _, shapes = init_by_name('lenet_hpvm')
+    baseline_dag = ModuleIndexer(baseline)
+    nas = NetApproxSelector(baseline_dag, dev_time_only=False)
+    # {0: 1} -> 98.4808 0.1195
+    approx1 = nas.apply_approx_by_config({3: 1})
+    acc1 = compute_accuracy(run_concat_output(approx1.module, testloader), testloader)
+    assert is_in_range(0.984808, 0.001195, acc1)
+    # {0: 2} -> 99.5933 0.0519
+    approx2 = nas.apply_approx_by_config({3: 2})
+    acc2 = compute_accuracy(run_concat_output(approx2.module, testloader), testloader)
+    assert is_in_range(0.995933, 0.000519, acc2)
+    # {0: 3} -> 99.6723 0.0347
+    approx3 = nas.apply_approx_by_config({3: 3})
+    acc3 = compute_accuracy(run_concat_output(approx3.module, testloader), testloader)
+    assert is_in_range(0.996723, 0.000347, acc3)
+    print("Accuracy test passed.")
+
+
+def main():
+    data_folder = Path(__file__).parent / 'data'
+    with open(data_folder / 'quantization.json') as f:
+        compare_quant(json.load(f))
+    with open(data_folder / 'promise.json') as f:
+        compare_promise(json.load(f))
+    compare_accuracy()
+    print("Tests passed.")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/tests/resnet50.py b/llvm/projects/pred_tuner/tests/resnet50.py
new file mode 100644
index 0000000000000000000000000000000000000000..71711fbfd099d47ba047471ddde3423b297d0f56
--- /dev/null
+++ b/llvm/projects/pred_tuner/tests/resnet50.py
@@ -0,0 +1,33 @@
+from toolkit import ModuleIndexer, NetApproxSelector
+from utils import compute_accuracy, init_by_name, run_concat_output
+
+
+def float_eq(f1, f2):
+    return abs(f1 - f2) < 1e-5
+
+
+def main():
+    baseline, testloader, _, shapes = init_by_name('resnet50_imagenet_hpvm')
+    baseline_dag = ModuleIndexer(baseline)
+    nas = NetApproxSelector(baseline_dag)
+    # baseline
+    baseline_output = run_concat_output(baseline_dag.module, testloader)
+    baseline_acc = compute_accuracy(baseline_output, testloader)
+    assert float_eq(baseline_acc, 0.773)
+    # {13: 242} -> 75.5
+    approx1 = nas.apply_approx_by_config({82: 242})
+    acc1 = compute_accuracy(run_concat_output(approx1.module, testloader), testloader)
+    assert float_eq(acc1, 0.755)
+    # {13: 242, 17: 247} -> 74.6
+    approx2 = nas.apply_approx_by_config({82: 242, 108: 247})
+    acc2 = compute_accuracy(run_concat_output(approx2.module, testloader), testloader)
+    assert float_eq(acc2, 0.746)
+    # {9: 237, 13: 242, 17: 247} -> 74.1
+    approx3 = nas.apply_approx_by_config({55: 237, 82: 242, 108: 247})
+    acc3 = compute_accuracy(run_concat_output(approx3.module, testloader), testloader)
+    assert float_eq(acc3, 0.741)
+    print("Accuracy test passed.")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/tests/sampling.py b/llvm/projects/pred_tuner/tests/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..707506ef7b8312fda02ca646bd04d034c3eff6ea
--- /dev/null
+++ b/llvm/projects/pred_tuner/tests/sampling.py
@@ -0,0 +1,90 @@
+import json
+from copy import deepcopy
+from pathlib import Path
+from typing import Tuple
+
+import torch
+
+from models.hpvm import HPVMConvBundle
+from toolkit import Conv2dSampling, Conv2dSamplingFP16, FP16Approx
+
+eps = 1e-5, 0.05  # (fp32 tolerance, fp16 tolerance)
+
+
+def sampling_3_3_consts() -> Tuple[torch.Tensor, torch.Tensor]:
+    input_tensor = torch.ones(1, 3, 4, 4)
+    # Filter has value [2, 1, 2, 1, 2, 1...]
+    filter_tensor = torch.ones(1, 3, 3, 3)
+    filter_tensor.view(-1)[::2] = 2
+    return input_tensor, filter_tensor
+
+
+def sampling_1_1_consts() -> Tuple[torch.Tensor, torch.Tensor]:
+    input_tensor = torch.ones(1, 9, 2, 2) * 2
+    filter_tensor = torch.ones(4, 9, 1, 1) * 2
+    return input_tensor, filter_tensor
+
+
+def parse_tensor_str(string: str) -> torch.Tensor:
+    # String has an extra ',' at the end, so skipping an empty string after split
+    entries = [float(s) for s in string.split(',')[:-1]]
+    return torch.tensor(entries).cuda()
+
+
+def compare_to_groundtruth(groundtruth: dict, const_func):
+    input_tensor, filter_tensor = const_func()
+    input_tensor = input_tensor.cuda()
+    o_ch, i_ch, h, w = filter_tensor.size()
+    assert h == w
+    for k, v in groundtruth.items():
+        def compare(groundtruth_t: torch.Tensor, ours_t: torch.Tensor, is_fp16: bool):
+            diff = groundtruth_t - ours_t
+            eps_ = eps[1] if is_fp16 else eps[0]
+            is_diff = torch.any(diff.abs() > eps_).item()
+            if is_diff:
+                print(
+                    f"Results differ by more than eps = {eps};\n"
+                    f"parameters = {k}\n"
+                    f"groundtruth = {groundtruth_t}\n"
+                    f"ours = {ours_t}"
+                )
+                raise RuntimeError
+
+        from ast import literal_eval as make_tuple
+        pad_h, pad_w, stride_h, stride_w, skip_every, offset = [int(s) for s in make_tuple(k)]
+        conv_layer = HPVMConvBundle(
+            i_ch, o_ch, h, stride=(stride_h, stride_w), padding=(pad_h, pad_w)
+        )
+        conv_layer.weight.data = filter_tensor
+        conv_layer.bias.data = torch.zeros_like(conv_layer.bias.data)
+        conv_layer = conv_layer.cuda()
+        our_baseline = conv_layer(input_tensor).flatten()
+        fp16 = FP16Approx(deepcopy(conv_layer))
+        our_fp16 = fp16(input_tensor).flatten()
+        sampling = Conv2dSampling(skip_every, offset, 1.0, deepcopy(conv_layer))
+        our_sampled = sampling(input_tensor).flatten()
+        sampling_fp16 = Conv2dSamplingFP16(skip_every, offset, 1.0, deepcopy(conv_layer))
+        our_sampled_fp16 = sampling_fp16(input_tensor).float().flatten()
+        groundtruth_baseline = parse_tensor_str(v['Baseline'])
+        compare(groundtruth_baseline, our_baseline, False)
+        groundtruth_sampled1 = parse_tensor_str(v['ConvApprox'])
+        compare(groundtruth_sampled1, our_sampled, False)
+        groundtruth_sampled2 = parse_tensor_str(v['ConvSampSim'])
+        compare(groundtruth_sampled2, our_sampled, False)
+        groundtruth_baseline_fp16 = parse_tensor_str(v['FP16_Baseline'])
+        compare(groundtruth_baseline_fp16, our_fp16, True)
+        groundtruth_sampled_fp16 = parse_tensor_str(v['ConvApproxHalf2'])
+        compare(groundtruth_sampled_fp16, our_sampled_fp16, True)
+
+
+def main():
+    data_folder = Path(__file__).parent / 'data'
+    with open(data_folder / '1_1_output.json') as f:
+        compare_to_groundtruth(json.load(f), sampling_1_1_consts)
+    with open(data_folder / '3_3_output.json') as f:
+        compare_to_groundtruth(json.load(f), sampling_3_3_consts)
+    print("Tests passed.")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/llvm/projects/pred_tuner/toolkit/__init__.py b/llvm/projects/pred_tuner/toolkit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..892b8c154269c99b7446c70182886b2ee92fc499
--- /dev/null
+++ b/llvm/projects/pred_tuner/toolkit/__init__.py
@@ -0,0 +1,4 @@
+from .approxdnn import Approximation, AvailableApproximations, Conv2dSampling, FP16Approx, \
+    PerforateConv2dStride, PromiseSim
+from .estimators import LinearCombEstimator, LinearEstimator, LinearQoSEstimator, WeightedLinearCombEstimator
+from .transform import ConfigT, NetApproxSelector, StateCapturer
diff --git a/llvm/projects/pred_tuner/toolkit/approxdnn.py b/llvm/projects/pred_tuner/toolkit/approxdnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..06abca85d521326749902e0058b8a88e3571a611
--- /dev/null
+++ b/llvm/projects/pred_tuner/toolkit/approxdnn.py
@@ -0,0 +1,442 @@
+"""All approximation techniques for torch.nn layers."""
+import abc
+from typing import Dict, Iterable, List, Optional, Type
+
+import torch
+from torch.nn import Linear, Module
+
+from models.hpvm import HPVMConvBundle
+from utils import get_tensorrt_dir
+
+
+def interpolate_first_dim(tensor: torch.Tensor, interp_indices: Iterable[int]):
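+    """Replace the rows of `tensor` (along dim 0) listed in `interp_indices` with values
+    interpolated from their neighbors: the first and last rows copy their single neighbor,
+    middle rows take the average of both neighbors, and a missing neighbor counts as zero.
+    Raises IndexError if an index is out of range or a needed neighbor is itself interpolated.
+    """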
+    def tensor_at(idx_: int):
+        if idx_ in interp_indices:
+            raise IndexError
+        if idx_ < 0 or idx_ >= tensor.size()[0]:
+            return torch.zeros_like(tensor[0])
+        return tensor[idx_]
+
+    for idx in interp_indices:
+        if idx < 0 or idx >= tensor.size()[0]:
+            raise IndexError
+        elif idx == 0:  # First row
+            tensor[idx] = tensor_at(1)
+        elif idx == tensor.size()[0] - 1:  # Last row
+            tensor[idx] = tensor_at(idx - 1)
+        else:  # Middle rows
+            tensor[idx] = (tensor_at(idx - 1) + tensor_at(idx + 1)) / 2.0
+    return tensor
+
+
+class Approximation(abc.ABC):
+    @property
+    @abc.abstractmethod
+    def deterministic(self) -> bool:
+        pass
+
+    @property
+    @abc.abstractmethod
+    def devtime(self) -> bool:
+        pass
+
+    @property
+    @abc.abstractmethod
+    def fp32(self) -> bool:
+        pass
+
+    @abc.abstractmethod
+    def apply(self, module: Module) -> Module:
+        pass
+
+    @abc.abstractmethod
+    def is_less_approx(self, other: 'Approximation') -> Optional[bool]:
+        pass
+
+    def __repr__(self):
+        return f"{self.__class__}({self.__dict__})"
+
+
+class PerforateConv2dStride(Approximation):
+    r"""Simulation of strided perforated convolution for `torch.nn.Conv2d`.
+
+        Perforated convolution skips computing some entries in the output and instead interpolates
+        these values, to reduce the number of float-ops needed to complete a convolution op.
+        In this implementation, selected rows or columns of the output are discarded and replaced
+        with linearly interpolated values from the neighboring rows or columns. Each channel is
+        considered independently.
+        This implementation gives the same output as actual perforated convolution but without the
+        performance benefit.
+
+        Parameters
+        ----------
+        direction_is_row : bool
+            If True, discard and interpolate rows, otherwise columns.
+        stride : int :math:`\in [2, +\infty)`
+            Skip (and interpolate) 1 row/column of the convolution output per `stride` rows/columns.
+        offset : int :math:`\in [0, stride)`
+            Index of the first skipped row/column.
+
+        Attributes
+        ----------
+        interp_axis : int :math:`\in \{2, 3\}`
+            The axis that will be perforated over. As the input is an NCHW tensor, if
+            `direction_is_row` then `interp_axis = 2`, otherwise `interp_axis = 3`.
+        stride : int :math:`\in [2, +\infty)`
+            Equal to parameter `stride`.
+        offset : int :math:`\in [0, stride)`
+            Equal to parameter `offset`.
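+
+        Examples
+        --------
+        (Illustrative, not from the original docs.) With `direction_is_row=True`,
+        `stride=2`, and `offset=1`, output rows 1, 3, 5, ... of every channel are
+        replaced by the average of their neighboring rows.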
+        """
+
+    def __init__(self, direction_is_row: bool, stride: int, offset: int, use_fp16: bool):
+        assert stride >= 2
+        assert 0 <= offset < stride
+        self.interp_axis = 2 if direction_is_row else 3
+        self.stride = stride
+        self.offset = offset
+        self.fp16 = use_fp16
+
+    @property
+    def deterministic(self) -> bool:
+        return True
+
+    @property
+    def devtime(self) -> bool:
+        return not self.fp16
+
+    @property
+    def fp32(self) -> bool:
+        return not self.fp16
+
+    def is_less_approx(self, other: Approximation) -> Optional[bool]:
+        return None
+
+    class PerforateConv2dStrideModule(Module):
+        def __init__(self, conv: HPVMConvBundle, approx: 'PerforateConv2dStride'):
+            super().__init__()
+            self.conv = conv
+            self.approx = approx
+            if self.approx.fp16:
+                self.conv = self.conv.half()
+
+        def forward(self, x: torch.Tensor):
+            if self.approx.fp16:
+                x = x.half()
+            x = self.conv.input_to_conv(x)
+            assert x.dim() == 4
+            # Put self.approx.interp_axis to first axis temporarily
+            x = x.transpose(0, self.approx.interp_axis)
+            interp_indices = torch.tensor(range(self.approx.offset, x.size(0), self.approx.stride))
+            x = interpolate_first_dim(x, interp_indices)
+            # Putting axes back
+            x = x.transpose(0, self.approx.interp_axis)
+            x = self.conv.conv_to_output(x)
+            if self.approx.fp16:
+                assert x.dtype == torch.float16
+            return x.float()
+
+    def apply(self, module: HPVMConvBundle) -> PerforateConv2dStrideModule:
+        return self.PerforateConv2dStrideModule(module, self)
+
+
+class Conv2dSampling(Approximation):
+    r"""Simulation of sampled convolution for `torch.nn.Conv2d`.
+
+    Skips some elements of the convolution kernel in a uniform, strided manner,
+    to reduce the amount of float-ops needed to compute each output entry.
+    This implementation gives the same output as actual sampled convolution but without the
+    performance benefit.
+
+    Parameters
+    ----------
+    skip_every : int
+        Skip 1 element in the convolution kernel per `skip_every` elements.
+    skip_offset : int :math:`\in [0, +\infty)`
+        Index of the first element to be skipped.
+        For example, if `skip_every = 3` and `skip_offset = 1`, the skipped indices
+        will be [1, 4, 7, ...].
+    interp_rate : float
+        The weight will be compensated ("interpolated") with a ratio after skipping elements,
+        which is naturally equal to :math:`1 + 1 / (skip_every - 1)`.
+        `interp_rate` modifies this rate to :math:`1 + (1 / (skip_every - 1)) \times interp_rate`.
+    use_fp16 : bool
+        Whether to use fp16 weight/input or not.
+    """
+
+    def __init__(
+            self, skip_every: int, skip_offset: int, interp_rate: float, use_fp16: bool
+    ):
+        assert skip_every >= 2 and skip_offset >= 0
+        self.skip_every = skip_every
+        self.skip_offset = skip_offset
+        self.interp_rate = interp_rate
+        self.fp16 = use_fp16
+
+    @property
+    def deterministic(self) -> bool:
+        return True
+
+    @property
+    def devtime(self) -> bool:
+        return not self.fp16
+
+    @property
+    def fp32(self) -> bool:
+        return not self.fp16
+
+    def is_less_approx(self, other: Approximation) -> Optional[bool]:
+        return None
+
+    @staticmethod
+    def sample_conv_weight(
+            interp_rate: float, skip_every: int, skip_offset: int, weight: torch.Tensor
+    ):
+        r"""Samples (skips & interpolates) convolution kernel according to parameters.
+
+        For a given `weight` tensor of shape `(C1, C2, H, W)`, sample each output channel
+        (on axis 0) independently.
+        Flatten each output channel tensor into 1 dim.
+        In normal cases, set elements at indices ``range(skip_offset, C_2 * H * W, skip_every)``
+        to 0.
+        However, if `skip_every` == `h` == `w` == 3, we may end up skipping the same whole rows for
+        each input channel, which is undesirable.
+        Instead, increment the offset by 1 for each input channel.
+        Finally, scale the kernel up by the interpolation ratio to compensate for the dropped elements.
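+
+        For example (illustrative), with `skip_every = 2`, `skip_offset = 1`, and a
+        non-3x3 kernel, flat indices 1, 3, 5, ... of each output channel are zeroed
+        and the remaining weights are scaled by
+        :math:`1 + (1 / (skip_every - 1)) \times interp_rate`.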
+        """
+        if len(weight.shape) != 4:
+            raise ValueError("Conv2d weight should be 4-dimensional")
+        c1, c2, h, w = weight.shape
+        if skip_every == h == w == 3:
+            # Indices (0..h*w) to skip for each input channel
+            per_chan_skip_indices = [
+                range((i_chan + skip_offset) % skip_every, h * w, skip_every)
+                for i_chan in range(c2)
+            ]
+            # Indices (0..c2*h*w) for each output channel, created by adding i*h*w for ith channel.
+            skip_indices = torch.tensor([
+                x + i * h * w for i, per_chan in enumerate(per_chan_skip_indices)
+                for x in per_chan
+            ])
+        else:
+            # Indices (0..c2*h*w) to skip for each output channel
+            skip_indices = torch.arange(skip_offset, c2 * h * w, skip_every)
+        flat_weight = weight.reshape(c1, -1)
+        flat_weight[:, skip_indices] = 0
+        interp_rate = 1 + (1 / (skip_every - 1) * interp_rate)
+        flat_weight *= interp_rate
+        return flat_weight.reshape_as(weight)
+
+    def apply(self, module: HPVMConvBundle) -> HPVMConvBundle:
+        # Not copying the weight tensor leads to a memory leak.
+        cloned_conv_w = module.weight.clone().detach()
+        module.weight.data = self.sample_conv_weight(
+            self.interp_rate, self.skip_every, self.skip_offset, cloned_conv_w
+        )
+        return module
+
+
+def quantize_256(tensor: torch.Tensor, range_min: float, range_max: float) -> torch.Tensor:
+    """Quantize a tensor so that only 256 unique float value exists."""
+    quantize_range = 256
+    input_range = range_max - range_min
+    mul = input_range / quantize_range
+    # Map tensor into [0, 256] range.
+    affined = (tensor - range_min) / mul
+    # Convert the tensor to int and back to float so it takes at most 257 unique
+    # values in [0, 256] (one extra boundary value, following the HPVM implementation),
+    # then reverse the affine map back to the original range.
+    quanted = torch.floor(affined).to(torch.int).to(torch.float)
+    quanted_float = quanted * mul + range_min
+    # Clip tensor
+    return torch.clamp(quanted_float, range_min, range_max)
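+# Worked example (illustrative, not from the original code): with range (0.0, 1.0)
+# the bucket width is 1/256, so an input of 0.3 maps to floor(0.3 * 256) = 76 and
+# back to 76 / 256 = 0.296875.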
+
+
+class PromiseSim(Approximation):
+    scaling_values = [0.75, 0.64, 0.336, 0.21, 0.168, 0.14, 0.11, 0.0784, 0.005]
+
+    def __init__(self, noise_level: int):
+        super().__init__()
+        self.noise_level = noise_level
+
+    @property
+    def deterministic(self) -> bool:
+        return False
+
+    @property
+    def devtime(self) -> bool:
+        return False
+
+    @property
+    def fp32(self) -> bool:
+        return False
+
+    def is_less_approx(self, other: Approximation) -> Optional[bool]:
+        if isinstance(other, PromiseSim):
+            return self.noise_level > other.noise_level
+        return None
+
+    def add_promise_noise(self, tensor: torch.Tensor):
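+        # Multiplicative Gaussian noise: returns x * (1 + N(0, scale)), with `scale`
+        # looked up from `scaling_values` by the PROMISE noise (swing) level.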
+        scale = self.scaling_values[self.noise_level]
+        noise = torch.normal(
+            mean=0.0, std=scale, size=tensor.size(), device=tensor.device
+        )
+        return noise * tensor + tensor
+
+    class PromiseSimModule(Module):
+        def __init__(self, module: HPVMConvBundle, approx: 'PromiseSim'):
+            super().__init__()
+            self.input_r, weight_r, bias_r, self.output_r = module.conv_ranges
+            module.weight.data = quantize_256(module.weight, *weight_r)
+            if module.bias is not None:
+                module.bias.data = quantize_256(module.bias, *bias_r)
+            self.module = module
+            self.approx = approx
+
+        def forward(self, input_: torch.Tensor) -> torch.Tensor:
+            # Quantize input, weight, bias (see __init__), and add noise to input.
+            input_ = quantize_256(input_, *self.input_r)
+            input_ = self.approx.add_promise_noise(input_)
+            output = self.module(input_)
+            # Then again, quantize output.
+            return quantize_256(output, *self.output_r)
+
+    def apply(self, module: HPVMConvBundle) -> PromiseSimModule:
+        return self.PromiseSimModule(module, self)
+
+
+class FP16Approx(Approximation):
+    def __init__(self):
+        super().__init__()
+
+    @property
+    def deterministic(self) -> bool:
+        return True
+
+    @property
+    def devtime(self) -> bool:
+        return False
+
+    @property
+    def fp32(self) -> bool:
+        return False
+
+    def is_less_approx(self, other: Approximation) -> Optional[bool]:
+        return None
+
+    class FP16ApproxModule(Module):
+        def __init__(self, module: Module):
+            super().__init__()
+            self.module = module.half()
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            x: torch.Tensor = self.module(x.half())
+            assert x.dtype == torch.float16
+            return x.float()
+
+    def apply(self, module: Module) -> FP16ApproxModule:
+        return self.FP16ApproxModule(module)
+
+
+AllApproxesT = Dict[int, Approximation]
+TypeApproxesT = Dict[Type[Module], List[int]]
+
+
+class AvailableApproximations:
+    r"""Holds a list of all available "approximation info": approximation + properties.
+
+        For properties see `Approximation`.
+
+        Parameters
+        ----------
+        all_knobs: Dict[int, Approximation]
+            A dict from int index to (approximation, is_dev_time) pair.
+            Also see class function `from_global_knobs_file`.
+
+        Attributes
+        ----------
+        all_knobs : Dict[int, Approximation]
+            A mapping from approximation index to approximation info pair `(approximation, is_dev_time)`.
+        type_to_knobs : Dict[Type[Module], List[int]]
+            A mapping from network layer type (subtype of `torch.nn.Module`) to a list of indexes of
+            applicable approximations. Values of `type_to_knobs` are always valid keys in `all_knobs`.
+        """
+
+    def __init__(self, all_knobs: Dict[int, Approximation], type_to_knobs: TypeApproxesT):
+        self.all_knobs = all_knobs
+        self.type_to_knobs = type_to_knobs
+
+    @classmethod
+    def from_global_knobs_file(cls) -> 'AvailableApproximations':
+        """Read and parse global_knobs.txt to provide all knobs supported and their indexes.
+
+        Returns two things:
+        * Dict of indexes to (approximations, is_dev_time). Approximation is in the form of functions
+        with a layer input; see `ModuleReplacerT`.
+        * Dict of type of torch.nn.Module to a list of approximation indexes that can be applied to this
+        type of layer.
+        """
+        with (get_tensorrt_dir() / 'autotuner/data/global_knobs.txt').open() as f:
+            lines = f.readlines()
+        all_knobs = {}
+        promise_and_fp16 = []
+        for line in lines:
+            desc, knobs, _, _, _, _, _ = line.rstrip().split()
+            category, index = desc.split(',')
+            index = int(index)
+            if category in ('perf', 'perf_fp16'):
+                row, col, offset = [int(s) for s in knobs.split(',')]
+                if row > 1 and col > 1:
+                    raise ValueError("Perforation on both row and column is not supported")
+                if col == 1:
+                    direction_is_row, stride = True, row
+                else:
+                    direction_is_row, stride = False, col
+                all_knobs[index] = PerforateConv2dStride(
+                    direction_is_row, stride, offset, 'fp16' in category
+                )
+            elif category in ('samp', 'samp_fp16'):
+                stride, offset, interp_rate = knobs.split(',')
+                stride, offset, interp_rate = int(stride), int(offset), float(interp_rate)
+                all_knobs[index] = Conv2dSampling(
+                    stride, offset, interp_rate, 'fp16' in category
+                )
+            elif category == 'swing_level':
+                all_knobs[index] = PromiseSim(index)
+                promise_and_fp16.append(index)
+            elif category == 'fp16':
+                all_knobs[index] = FP16Approx()
+                promise_and_fp16.append(index)
+        type_to_knobs = {
+            HPVMConvBundle: list(all_knobs.keys()),
+            Linear: promise_and_fp16
+        }
+        return cls(all_knobs, type_to_knobs)
+
+    def items(self, dev_time: bool, ignore_fp32: bool) -> Dict[Type[Module], List[int]]:
+        """Give a list of applicable approximations for each layer type.
+
+        If dev_time is True, returns only devtime approximations, otherwise all approximations.
+        """
+
+        def remove_non_dev(type_to_knobs: TypeApproxesT) -> TypeApproxesT:
+            return {
+                k: [v for v in vs if self.all_knobs[v].devtime]
+                for k, vs in type_to_knobs.items()
+            }
+
+        def remove_fp32(type_to_knobs: TypeApproxesT) -> TypeApproxesT:
+            return {
+                k: [v for v in vs if not self.all_knobs[v].fp32]
+                for k, vs in type_to_knobs.items()
+            }
+
+        type_to_knobs_ = self.type_to_knobs
+        if dev_time:
+            type_to_knobs_ = remove_non_dev(type_to_knobs_)
+        if ignore_fp32:
+            type_to_knobs_ = remove_fp32(type_to_knobs_)
+        return type_to_knobs_
+
+    def __getitem__(self, item: int) -> Approximation:
+        """Returns the approximation info for given approximation index."""
+        return self.all_knobs[item]
diff --git a/llvm/projects/pred_tuner/toolkit/estimators.py b/llvm/projects/pred_tuner/toolkit/estimators.py
new file mode 100644
index 0000000000000000000000000000000000000000..acd35331693c706df336a6e3a33d1c6098a6cb50
--- /dev/null
+++ b/llvm/projects/pred_tuner/toolkit/estimators.py
@@ -0,0 +1,383 @@
+import abc
+import gc
+import logging
+import pickle
+from math import sqrt
+from pathlib import Path
+from typing import Callable, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar
+
+import numpy as np
+import torch
+from torch.nn import Module
+from tqdm import tqdm, trange
+
+from models.domains import QoS, qos_stats
+from .transform import ConfigT, NetApproxSelector
+
+ProfT = TypeVar('ProfT')
+NetOutputT = TypeVar('NetOutputT')
+QoST = Callable[[NetOutputT], QoS]
+ThresholdEvalT = Callable[[NetOutputT], bool]
+ExeT = Callable[[Module], NetOutputT]
+KeyT = Tuple[int, int]
+KVT = Tuple[KeyT, NetOutputT]
+EstmT = Tuple[QoS, QoS]
+
+msg_logger = logging.getLogger(__name__)
+
+
+class LinearEstimator(abc.ABC):
+    """Estimate QoS of a config by linearly adding "something" from each approximation of config, and
+    then applying QoS metric.
+
+    That "something" could be QoS itself (see `LinearQoSEstimator`), or the direct tensor output from
+    the model (see `LinearTensorEstimator`).
+    In initialization phase, run the model for each 1-approximation config and store the quantity to
+    be linearly summed in a table.
+
+    Parameters
+    ----------
+    nas: NetApproxSelector
+        `NetApproxSelector` instance is used to select all 1-approximation configs and evaluate them.
+    qos: QoST
+        Quality of Service measure (such as accuracy). Takes the model output and returns a QoS value.
+    independent_init: bool
+        If False, don't initialize `self.profile_table`; `coinit_estimators` must then be called
+        manually to fill in the profile.
+
+    Attributes
+    ----------
+    qos : Callable[[torch.Tensor], float]
+        Same as parameter `qos`.
+    baseline_profile : T
+        Profile value of the baseline model.
+    profile_table : Dict[KeyT, T]
+        A mapping from (`layer_idx`, `approx_idx`) to the profile value, with only this approximation
+        applied (in other words, with configuration ``{layer_idx: approx_idx}`` applied).
+    """
+
+    n_nondeterm_runs = 10
+
+    def __init__(
+            self, nas: NetApproxSelector, executor: ExeT, qos: QoST,
+            threshold_eval: ThresholdEvalT, confidence_level: float,
+            independent_init: bool = True, storage: Path = None
+    ):
+        self.nas = nas
+        self.qos = qos
+        self.executor = executor
+        self.storage = storage
+        self.baseline_profile: ProfT = self.get_baseline_profile()
+        self.profile_table: Dict[KeyT, ProfT] = {}
+        self.confidence_level = confidence_level
+        if independent_init:
+            for (k, i), output in self._get_all_outputs(nas, self.executor, threshold_eval, storage):
+                self.profile_table[k, i] = self.handle_output(output)
+
+    @staticmethod
+    def _load_from_pickle(storage: Path) -> Iterator[KVT]:
+        if not storage.is_file():
+            return
+        msg_logger.info(f"Found pickle at {storage}")
+        with storage.open('rb') as f:
+            while True:
+                try:
+                    key, tensor = pickle.load(f)
+                    yield key, tensor
+                except EOFError:
+                    return
+
+    @classmethod
+    def run_model(cls, nas: NetApproxSelector, config: ConfigT, executor: ExeT) -> torch.Tensor:
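+        # A deterministic config is run once (stacked to shape [1, ...]); a nondeterministic
+        # one is run `n_nondeterm_runs` times and the outputs are stacked along dim 0.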
+        is_deterministic = nas.is_deterministic(config)
+        model = nas.apply_approx_by_config(config).module
+        if is_deterministic:
+            ret = executor(model).unsqueeze(0).cpu()
+        else:
+            assert cls.n_nondeterm_runs > 0
+            ret = torch.stack([
+                executor(model)
+                for _ in trange(cls.n_nondeterm_runs, leave=False)
+            ]).cpu()
+        gc.collect()
+        return ret
+
+    @classmethod
+    def _get_all_outputs(
+            cls, nas: NetApproxSelector, executor: ExeT,
+            threshold_eval: ThresholdEvalT, storage: Path = None
+    ) -> Iterator[KVT]:
+        preloaded_acceptable = {}
+        if storage is not None:
+            bar = tqdm(cls._load_from_pickle(storage))
+            for key, tensor in bar:
+                bar.set_postfix(key=key)
+                preloaded_acceptable[key] = threshold_eval(tensor)
+                yield key, tensor
+
+        def evaluate(k: int, i: int) -> Tuple[bool, Optional[KVT]]:
+            if (k, i) in preloaded_acceptable:
+                msg_logger.debug(f"Key {(k, i)} is preloaded.")
+                return preloaded_acceptable[(k, i)], None
+            outputs = cls.run_model(nas, {k: i}, executor)
+            if storage is not None:
+                with storage.open('ab') as f:
+                    pickle.dump(((k, i), outputs), f)
+            return threshold_eval(outputs), ((k, i), outputs)
+
+        for key_outputs in nas.filter_approxes(evaluate):
+            # key_outputs is None means corresponding key has been preloaded (we can't see the key)
+            if key_outputs is None:
+                continue
+            yield key_outputs
+
+    @classmethod
+    def coinit_estimators(
+            cls, nas: NetApproxSelector, executor: ExeT, threshold_eval: ThresholdEvalT,
+            *estm_insts: 'LinearEstimator', storage: Path = None
+    ):
+        for (k, i), output in cls._get_all_outputs(nas, executor, threshold_eval, storage):
+            for inst in estm_insts:
+                inst.profile_table[(k, i)] = inst.handle_output(output)
+
+    @abc.abstractmethod
+    def get_baseline_profile(self) -> ProfT:
+        pass
+
+    @abc.abstractmethod
+    def handle_output(self, outputs: torch.Tensor) -> ProfT:
+        pass
+
+    @abc.abstractmethod
+    def estimate(self, config: ConfigT) -> EstmT:
+        pass
+
+
+class LinearQoSEstimator(LinearEstimator):
+    """Estimate QoS of a config by linearly adding QoS value. See `LinearEstimator`.
+
+    ProfT = Tuple[QoS(mean), QoS(std)]
+    NetOutputT = torch.Tensor
+    """
+
+    def estimate(self, config: ConfigT) -> EstmT:
+        baseline_mean: QoS = self.baseline_profile[0]
+        if not config:
+            return baseline_mean, baseline_mean
+        # N * 2 array
+        profiles = np.array([self.profile_table[kv] for kv in config.items()])
+        profiles[:, 0] -= baseline_mean
+        estm_qos = profiles[:, 0].sum() + baseline_mean
+        estm_std = sqrt(np.sum(profiles[:, 1] ** 2))
+        # We're hardcoding 95% confidence interval here.
+        assert self.confidence_level == 0.95
+        normal_dist_95 = 1.644854
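+        # (1.644854 is the one-sided 95th percentile of the standard normal distribution.)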
+        r1, r2 = estm_qos, estm_qos - normal_dist_95 * estm_std
+        return float(r1), float(r2)
+
+    def handle_output(self, outputs: torch.Tensor) -> Tuple[QoS, QoS]:
+        qoses = np.array([self.qos(o) for o in outputs])
+        msg_logger.debug(f"Handled {qoses.mean(), qoses.std()}")
+        return qoses.mean(), qoses.std()
+
+    def get_baseline_profile(self) -> Tuple[QoS, QoS]:
+        mean_qos = self.qos(self.run_model(self.nas, {}, self.executor)[0])
+        return mean_qos, mean_qos.null()
+
+
+class LinearCombEstimator(LinearEstimator):
+    """Estimate QoS of a config by linearly adding tensor output from network. See `LinearEstimator`.
+
+    On estimation, sums over the delta in tensor output (compared to baseline output) for each
+    approximation, and then the baseline tensor output is added back.
+    This works as an estimation of tensor output for this configuration, which is then sent to QoS
+    metric to get the final QoS.
+
+    QoST = float
+    ProfT = torch.Tensor (2 * n_inputs * n_classes)
+    NetOutputT = torch.Tensor (n_inputs * n_classes)
+    """
+
+    def estimate(self, config) -> EstmT:
+        if not config:
+            baseline_qos = self.qos(self.baseline_profile)
+            return baseline_qos, baseline_qos
+        # 4D tensor: n_approx * 2 * n_inputs * n_classes
+        profiles = torch.stack([self.profile_table[kv] for kv in config.items()])
+        profiles -= self.baseline_profile
+        mean_tensor, confidence_tensor = profiles.sum(dim=0) + self.baseline_profile
+        estm_mean_qos = self.qos(mean_tensor)
+        estm_confidence_qos = self.qos(confidence_tensor)
+        return estm_mean_qos, estm_confidence_qos
+
+    def handle_output(self, outputs: torch.Tensor) -> torch.Tensor:
+        if len(outputs) == 1:
+            return torch.stack((outputs[0], outputs[0]))
+        qoses = np.array([self.qos(o) for o in outputs])
+        percentile_pos = int(self.n_nondeterm_runs * (1 - self.confidence_level))
+        assert 0 <= percentile_pos < self.n_nondeterm_runs
+        mean_pos = np.searchsorted(qoses, qoses.mean(), 'right')
+        assert 0 <= mean_pos <= self.n_nondeterm_runs
+        if mean_pos == self.n_nondeterm_runs:
+            mean_pos = self.n_nondeterm_runs - 1
+        return torch.stack((outputs[mean_pos], outputs[percentile_pos]))
+
+    def get_baseline_profile(self) -> torch.Tensor:
+        return self.run_model(self.nas, {}, self.executor)[0]
+
+
+class TrainableEstimator(LinearEstimator, abc.ABC):
+    """
+    QoST = float
+    ProfT = ProfT
+    NetOutputT = torch.Tensor (n_inputs * n_classes)
+    """
+    n_train_confs = 50
+    weight_range = 0.8, 1.2, 20
+    n_cold_start = 500
+    accept_threshold = 5
+    penalize_overestm = 1.0
+
+    def __init__(
+            self, nas: NetApproxSelector, executor: ExeT, qos: QoST,
+            threshold_eval: ThresholdEvalT, confidence_level: float,
+            independent_init: bool = True, storage: Path = None
+    ):
+        super().__init__(nas, executor, qos, threshold_eval, confidence_level, independent_init, storage)
+        self.r_cands = np.linspace(*self.weight_range)
+        self.r_error = np.zeros((len(self.r_cands), self.n_train_confs))
+        self.r = self.weight_range[1]
+        self.trained_iters = 0
+        self.cold_start = 0
+
+    def update_r(self):
+        mean_error = np.mean(self.r_error, axis=1)
+        best_idx = np.argmin(mean_error)
+        self.r = self.r_cands[best_idx]
+        if best_idx == len(mean_error) - 1 or best_idx == 0:
+            msg_logger.warning(f"Parameter value r = {self.r} has reached the boundary. Consider a larger range.")
+
+    def get_qos_for_config(self, config: ConfigT) -> EstmT:
+        is_deterministic = self.nas.is_deterministic(config)
+        net = self.nas.apply_approx_by_config(config).module
+        n_runs = 1 if is_deterministic else self.n_nondeterm_runs
+        qoses = [self.qos(self.executor(net)) for _ in trange(n_runs, leave=False)]
+        mean_qos, qos_at_confidence, _ = qos_stats(qoses, confidence=self.confidence_level)
+        return mean_qos, qos_at_confidence
+
+    @abc.abstractmethod
+    def real_estimate(self, config, rs: Iterable[float] = None) -> List[EstmT]:
+        pass
+
+    def estimate(self, config) -> EstmT:
+        estm = self.real_estimate(config)[0]
+        if self.cold_start < self.n_cold_start:
+            self.cold_start += 1
+            if self.cold_start % 50 == 0:
+                msg_logger.info(f"WeightedLinearCombEstimator cold start {self.cold_start} / {self.n_cold_start}")
+            return estm
+        if self.trained_iters >= self.n_train_confs:
+            return estm
+        log_info_freq = 10
+        log_level = logging.INFO if self.trained_iters % log_info_freq == 0 else logging.DEBUG
+        msg_logger.log(
+            log_level,
+            f"{self.__class__} train iter {self.trained_iters} / {self.n_train_confs}"
+        )
+        mean_qos, qos_at_confidence = self.get_qos_for_config(config)
+        estm_conf_qoses = np.array(self.real_estimate(config, rs=self.r_cands))[:, 1]
+        diff_conf_qoses = qos_at_confidence - estm_conf_qoses
+        old_r = self.r
+        self.r_error[:, self.trained_iters] = np.where(
+            diff_conf_qoses > 0, diff_conf_qoses * self.penalize_overestm,
+            -diff_conf_qoses
+        )
+        self.trained_iters += 1
+        self.update_r()
+        msg_logger.debug(
+            f"{self.__class__} real mean qos = {mean_qos}, real conf qos = {qos_at_confidence}, "
+            f"estm conf qos = {estm[1]}, r: {old_r} -> {self.r}"
+        )
+        return mean_qos, qos_at_confidence
+
+
+class WeightedLinearCombEstimator(TrainableEstimator, LinearCombEstimator):
+    """
+    QoST = float
+    ProfT = torch.Tensor
+    NetOutputT = torch.Tensor (n_inputs * n_classes), logged
+    """
+
+    def __init__(
+            self, nas: NetApproxSelector, executor: ExeT, qos: QoST,
+            threshold_eval: ThresholdEvalT, confidence_level: float,
+            independent_init: bool = True, storage: Path = None
+    ):
+        log_qos = lambda x: qos(torch.exp(x))
+        super().__init__(nas, executor, log_qos, threshold_eval, confidence_level, independent_init, storage)
+
+    @staticmethod
+    def tensor_log(tensor: torch.Tensor) -> torch.Tensor:
+        # TODO: don't take log if there's no SoftMax layer.
+        eps = torch.ones_like(tensor) * 1e-10
+        return torch.log(torch.max(tensor, eps))
+
+    def real_estimate(self, config, rs: Iterable[float] = None) -> List[EstmT]:
+        # 3D tensor: 2 * n_inputs * n_classes
+        if config:
+            estm_delta_output = torch.sum(
+                torch.stack([self.profile_table[kv] for kv in config.items()]) - self.baseline_profile,
+                dim=0
+            )
+        else:
+            n_in, n_out = self.baseline_profile.shape
+            estm_delta_output = torch.zeros(2, n_in, n_out)
+        rets = []
+        rs = rs if rs is not None else [self.r]
+        for r in rs:
+            mean_tensor, confidence_tensor = estm_delta_output * r + self.baseline_profile
+            rets.append((self.qos(mean_tensor), self.qos(confidence_tensor)))
+        return rets
+
+    def handle_output(self, outputs: torch.Tensor) -> torch.Tensor:
+        return LinearCombEstimator.handle_output(self, self.tensor_log(outputs))
+
+    def get_baseline_profile(self) -> torch.Tensor:
+        return self.tensor_log(LinearCombEstimator.get_baseline_profile(self))
+
+
+class WeightedLinearQoSEstimator(TrainableEstimator, LinearQoSEstimator):
+    """
+    QoST = float
+    ProfT = torch.Tensor
+    NetOutputT = torch.Tensor (n_inputs * n_classes), logged
+    """
+
+    weight_range = 0.5, 5, 50
+
+    def estimate(self, config) -> EstmT:
+        ret = super().estimate(config)
+        msg_logger.debug(f"Config {config} -> estimation {ret}")
+        return ret
+
+    def real_estimate(self, config, rs: Iterable[float] = None) -> List[EstmT]:
+        baseline_mean_qos = self.baseline_profile[0]
+        if config:
+            # N * 2 array
+            profiles = np.array([self.profile_table[kv] for kv in config.items()])
+            profiles[:, 0] -= baseline_mean_qos
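+            # Clamp apparent QoS improvements to zero so that only degradations accumulate.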
+            profiles[:, 0][profiles[:, 0] > 0] = 0
+            estm_mean_qos_delta = profiles[:, 0].sum()
+            estm_std = sqrt(np.sum(profiles[:, 1] ** 2))
+        else:
+            estm_mean_qos_delta = estm_std = 0.0
+        rets = []
+        rs = rs if rs is not None else [self.r]
+        for r in rs:
+            estm_mean_qos = float(estm_mean_qos_delta * r + baseline_mean_qos)
+            # We're hardcoding 95% confidence interval here.
+            assert self.confidence_level == 0.95
+            normal_dist_95 = 1.644854
+            estm_conf_qos = estm_mean_qos - normal_dist_95 * estm_std
+            rets.append((estm_mean_qos, estm_conf_qos))
+        return rets
diff --git a/llvm/projects/pred_tuner/toolkit/indexing.py b/llvm/projects/pred_tuner/toolkit/indexing.py
new file mode 100644
index 0000000000000000000000000000000000000000..27500c152ac5130f6df787f16f53e84c3099bcf6
--- /dev/null
+++ b/llvm/projects/pred_tuner/toolkit/indexing.py
@@ -0,0 +1,55 @@
+from typing import Callable, Iterator, Optional, Set
+
+import torch
+from torch.nn import Module, Sequential
+
+UnaryForwardT = Callable[[torch.Tensor], torch.Tensor]
+ReplacedForwardT = Callable[[Module, UnaryForwardT, torch.Tensor], torch.Tensor]
+
+
+class ModuleIndexer:
+    def __init__(self, module: Module, ignore_module: Callable[[Module], bool]):
+        self.module_to_index = {}
+        for i, submodule in enumerate(module.modules()):
+            if ignore_module(submodule):
+                continue
+            self.module_to_index[submodule] = i
+        self.index_to_module = {i: m for m, i in self.module_to_index.items()}
+        self.module = module
+        self.layer_parents = self.find_layers_parent_info(module, set(self.all_modules))
+
+    @staticmethod
+    def find_layers_parent_info(net: Module, layers: Set[Module]):
+        ret = {}
+        for name, submodule in net.named_children():
+            if submodule in layers:
+                ret[submodule] = net, name
+            ret = {**ret, **ModuleIndexer.find_layers_parent_info(submodule, layers)}
+        return ret
+
+    @property
+    def all_modules(self) -> Iterator[Module]:
+        return iter(self.module_to_index.keys())
+
+    def find(self, module: Module) -> Optional[int]:
+        return self.module_to_index.get(module, None)
+
+    def __getitem__(self, item: int) -> Module:
+        return self.index_to_module[item]
+
+    def __setitem__(self, key: int, value: Module):
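+        # Swap `old` for `value` in both index maps and rebind the attribute on the
+        # parent module, so the replacement is visible to the network itself.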
+        old = self.index_to_module[key]
+        if value != old:
+            self.index_to_module[key] = value
+            self.module_to_index[value] = self.module_to_index[old]
+            self.module_to_index.pop(old)
+            parent, name = self.layer_parents[old]
+            self.layer_parents[value] = parent, name
+            self.layer_parents.pop(old)
+            parent.__setattr__(name, value)
+
+    def __iter__(self) -> Iterator[Module]:
+        return self.all_modules
+
+    def __len__(self):
+        return len(self.module_to_index)
diff --git a/llvm/projects/pred_tuner/toolkit/transform.py b/llvm/projects/pred_tuner/toolkit/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..f19554181a9bb9ac10ee9261cd908c2003f18d48
--- /dev/null
+++ b/llvm/projects/pred_tuner/toolkit/transform.py
@@ -0,0 +1,186 @@
+import copy
+import logging
+from collections import defaultdict
+from typing import Callable, Dict, Generic, Iterator, List, Tuple, TypeVar
+
+from torch.nn import Module
+
+from .approxdnn import Approximation, AvailableApproximations
+from .indexing import ModuleIndexer
+
+msg_logger = logging.getLogger(__name__)
+
+
+T1 = TypeVar('T1')
+T2 = TypeVar('T2')
+TransformerCT = Callable[[int, T1], T2]
+
+
+class StateCapturer(Module, Generic[T2]):
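+    """Wraps an indexed network and records submodule outputs during forward passes:
+    a forward hook appends each submodule's (optionally transformed) output to
+    `net_state`, keyed by the submodule's index."""
+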
+    @staticmethod
+    def _id(_, x):
+        return x.clone().cpu().detach()
+
+    def __init__(self, net_index: ModuleIndexer, state_transformer: TransformerCT = None):
+        super().__init__()
+        self.net_state: Dict[int, List[T2]] = defaultdict(list)
+        self.state_transformer = state_transformer or self._id
+        self.net_index = net_index
+        for submodule in net_index.module.modules():
+            submodule.register_forward_hook(self.forward_hook)
+        self._output = None
+
+    @property
+    def module(self):
+        return self.net_index.module
+
+    @property
+    def output(self):
+        if self._output is None:
+            raise RuntimeError("Cannot get output before inference happens")
+        return self._output
+
+    def forward_hook(self, module: Module, _, outputs):
+        module_idx = self.net_index.find(module)
+        if module_idx is None:
+            raise RuntimeError("Cannot find module; module may have changed externally")
+        self.net_state[module_idx].append(self.state_transformer(module_idx, outputs))
+
+    def forward(self, *args, **kwargs):
+        return self.module.forward(*args, **kwargs)
+
+    def get_output_state(self) -> List[T2]:
+        return self.net_state[self.injected.output_loc()]
+
+
+T = TypeVar('T')
+ConfigT = Dict[int, int]
+EvaluatorT = Callable[[int, int], Tuple[bool, T]]
+
+
+class NetApproxSelector:
+    r"""List all 1-approximation configurations, and apply configurations to a `ModuleDAG` network.
+
+    Computes a list of available approximations for each layer of the network, given info on available
+    approximations in the system (in the form of an `AvailableApproximations` instance).
+    Capable of listing all single-approximation configurations, and apply a given configuration to the network.
+    A configuration is a dict from layer indices to approximation for these layers, one for each.
+    See `ConfigT`.
+
+    Parameters
+    ----------
+    net : Module
+        The network to be approximated.
+    dev_time_only : bool
+        If True, use only dev-time approximations; otherwise use all available approximations.
+    ignore_fp32 : bool
+        If True, exclude fp32 knobs when collecting available approximations.
+    aa : AvailableApproximations
+        A container describing the available approximations, the layer type each approximation
+        applies to, etc.
+
+    Attributes
+    ----------
+    net_index : ModuleIndexer
+        An index over the layers of the network being approximated (built from parameter `net`).
+    net_approxes : Dict[int, List[int]]
+        The available approximation indices per layer index.
+    available_approx : AvailableApproximations
+        Available approximations (parameter `aa`).
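+
+    Examples
+    --------
+    A hypothetical usage sketch (assumes a PyTorch model `net`; layer indices and knob
+    ids are examples only)::
+
+        nas = NetApproxSelector(net)
+        for layer_idx, knob_id, approx in nas.list_single_approxes():
+            print(layer_idx, knob_id, approx)
+        approx_net = nas.apply_approx_by_config({0: 151, 2: 155}).module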
+    """
+
+    class ApproximationGraph:
+        """Naive O(n^2) sort for a list of partially-ordered approximations."""
+
+        def __init__(self, approx_indices: List[int], aa: AvailableApproximations):
+            import networkx as nx
+            self.dep_graph = nx.DiGraph()
+            self.dep_graph.add_nodes_from(approx_indices)
+            for i, x in enumerate(approx_indices):
+                for y in approx_indices[i + 1:]:
+                    approx_x, approx_y = aa[x], aa[y]
+                    cmp = approx_x.is_less_approx(approx_y)
+                    if cmp is None:  # Not comparable
+                        continue
+                    if cmp:
+                        self.dep_graph.add_edge(x, y)
+                    else:
+                        self.dep_graph.add_edge(y, x)
+            self.sorted_indices = list(nx.algorithms.topological_sort(self.dep_graph))
+
+        def __len__(self) -> int:
+            return len(self.sorted_indices)
+
+        def __iter__(self) -> Iterator[int]:
+            return iter(self.sorted_indices)
+
+    def __init__(
+            self, net: Module, dev_time_only: bool = True, ignore_fp32: bool = False,
+            aa: AvailableApproximations = None
+    ):
+        self.available_approx = aa or AvailableApproximations.from_global_knobs_file()
+        self.type_approxes = self.available_approx.items(dev_time=dev_time_only, ignore_fp32=ignore_fp32)
+        approximable_types = tuple(self.type_approxes.keys())
+        self.net_index = ModuleIndexer(net, lambda m: not isinstance(m, approximable_types))
+        self.dev_time_only = dev_time_only
+        self.net_approxes: Dict[int, List[int]] = defaultdict(list)
+        for i, layer in self.net_index.index_to_module.items():
+            for t, approxes in self.type_approxes.items():
+                if isinstance(layer, t):
+                    self.net_approxes[i].extend(approxes)
+
+    def apply_approx_by_config(self, config: ConfigT) -> ModuleIndexer:
+        """Applies given `config` to network."""
+        new_dag = copy.deepcopy(self.net_index)
+        for layer_idx, config_idx in config.items():
+            layer = new_dag[layer_idx]
+            new_dag[layer_idx] = self.available_approx[config_idx].apply(layer)
+        return new_dag
+
+    def list_single_approxes(self) -> Iterator[Tuple[int, int, Approximation]]:
+        for k, vs in self.net_approxes.items():
+            for v in vs:
+                yield k, v, self.available_approx[v]
+
+    def filter_approxes(self, evaluator: EvaluatorT) -> Iterator[T]:
+        """Enumerate through and apply each single-approximation configuration."""
+        net_approxes_graph: Dict[int, NetApproxSelector.ApproximationGraph] = {
+            k: self.ApproximationGraph(vs, self.available_approx) for k, vs in self.net_approxes.items()
+        }
+        from tqdm import tqdm
+        from utils import gpu_mem_mb
+        bar1 = tqdm(net_approxes_graph.items(), total=len(net_approxes_graph))
+        for k, graph in bar1:
+            bar1.set_postfix(layer=k)
+            bar2 = tqdm(graph, leave=None)
+            unacceptable_approx = None
+            filtered_layer_approxes = []
+            for approx_id in bar2:
+                approx = self.available_approx[approx_id]
+                if unacceptable_approx is not None:
+                    cmp = unacceptable_approx.is_less_approx(approx)
+                    if cmp:
+                        msg_logger.debug(f"{approx} is worse than unacceptable approx {unacceptable_approx}")
+                        continue
+                    else:
+                        unacceptable_approx = None
+                bar2.set_postfix(approx_id=approx_id, mem=gpu_mem_mb())
+                acceptable, ret_val = evaluator(k, approx_id)
+                if not acceptable:
+                    unacceptable_approx = approx
+                    msg_logger.debug(f"{approx} is unacceptable")
+                    continue
+                filtered_layer_approxes.append(approx_id)
+                yield ret_val
+            self.net_approxes[k] = filtered_layer_approxes
+
+    def get_baseline(self) -> Module:
+        return self.net_index.module
+
+    def get_layer_approxes(self) -> Dict[Module, List[int]]:
+        """Expose available knobs for autotuner usage."""
+        return {
+            self.net_index[layer_k]: approxes
+            for layer_k, approxes in self.net_approxes.items()
+        }
+
+    def is_deterministic(self, config: ConfigT):
+        return all(self.available_approx[knob_id].deterministic for knob_id in config.values())
diff --git a/llvm/projects/pred_tuner/utils/__init__.py b/llvm/projects/pred_tuner/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f06b4ae222c3a8a56d4ab4516031e4c91dfa0d2
--- /dev/null
+++ b/llvm/projects/pred_tuner/utils/__init__.py
@@ -0,0 +1,3 @@
+from .config import Config
+from .logging import config_pylogger, reapply_last_config
+from .utils import device, get_knob_config_file, get_tensorrt_dir, gpu_mem_mb
diff --git a/llvm/projects/pred_tuner/utils/benchmarks.json b/llvm/projects/pred_tuner/utils/benchmarks.json
new file mode 100644
index 0000000000000000000000000000000000000000..57184872a07de661c1c9ee4064ec01652e9966ff
--- /dev/null
+++ b/llvm/projects/pred_tuner/utils/benchmarks.json
@@ -0,0 +1,100 @@
+{
+  "lenet_hpvm": {
+    "model_name": "lenet_hpvm",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/lenet_keras/",
+    "layer_file": "autotuner/data/lenet/lenet_layers.txt",
+    "cost_file": "autotuner/data/lenet/op_cost.txt"
+  },
+  "alexnet_hpvm": {
+    "model_name": "alexnet_hpvm",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/alexnet_cifar10/",
+    "layer_file": "autotuner/data/alexnet/alexnet_layers.txt",
+    "cost_file": "autotuner/data/alexnet/op_cost.txt"
+  },
+  "alexnet2_hpvm": {
+    "model_name": "alexnet2_hpvm",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/alexnet2_cifar10/",
+    "layer_file": "autotuner/data/alexnet2/alexnet2_layers.txt",
+    "cost_file": "autotuner/data/alexnet2/op_cost.txt"
+  },
+  "vgg16_cifar10_hpvm": {
+    "model_name": "vgg16_cifar10_hpvm",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/vgg16_cifar10/",
+    "layer_file": "autotuner/data/vgg16_cifar10/vgg16_layers.txt",
+    "cost_file": "autotuner/data/vgg16_cifar10/op_cost.txt"
+  },
+  "vgg16_cifar100_hpvm": {
+    "model_name": "vgg16_cifar100_hpvm",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/vgg16_cifar100/",
+    "layer_file": "autotuner/data/vgg16_cifar100/vgg16_layers.txt",
+    "cost_file": "autotuner/data/vgg16_cifar100/op_cost.txt"
+  },
+  "vgg16_imagenet_hpvm": {
+    "model_name": "vgg16_imagenet_hpvm",
+    "autotuner_runs": 20000,
+    "base_dir": "tuner_results/vgg16_imagenet/",
+    "layer_file": "autotuner/data/vgg16_imagenet/vgg16_layers.txt",
+    "cost_file": "autotuner/data/vgg16_imagenet/op_cost.txt"
+  },
+  "resnet18_hpvm": {
+    "model_name": "resnet18_hpvm",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/resnet18_cifar10/",
+    "layer_file": "autotuner/data/resnet/resnet_layers.txt",
+    "cost_file": "autotuner/data/resnet/op_cost.txt"
+  },
+  "resnet50_imagenet_hpvm": {
+    "model_name": "resnet50_imagenet_hpvm",
+    "autotuner_runs": 30000,
+    "base_dir": "tuner_results/resnet50_imagenet/",
+    "layer_file": "autotuner/data/resnet50_imagenet/resnet50_layers.txt",
+    "cost_file": "autotuner/data/resnet50_imagenet/op_cost.txt"
+  },
+  "mobilenet_hpvm": {
+    "model_name": "mobilenet_hpvm",
+    "autotuner_runs": 20000,
+    "base_dir": "tuner_results/mobilenet/",
+    "layer_file": "autotuner/data/mobilenet/mobilenet_layer_comp.txt",
+    "cost_file": "autotuner/data/mobilenet/op_cost.txt"
+  },
+  "__unused_mobilenet_shallow": {
+    "model_name": "mobilenet_shallow_hpvm",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/mobilenet_shallow/",
+    "layer_file": "autotuner/data/mobilenet_shallow/mobilenet_shallow_layer_comp.txt",
+    "cost_file": "autotuner/data/mobilenet_shallow/op_cost.txt"
+  },
+  "alexnet_imagenet_hpvm": {
+    "model_name": "alexnet_imagenet_hpvm",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/alexnet_imagenet/",
+    "layer_file": "autotuner/data/alexnet_imagenet/layer_composition.txt",
+    "cost_file": "autotuner/data/alexnet_imagenet/op_cost.txt"
+  },
+  "alexnet2_canny_hpvm": {
+    "model_name": "alexnet2_canny_hpvm",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/alexnet2_canny_hpvm/",
+    "layer_file": "autotuner/data/alexnet2_canny_hpvm/layers.txt",
+    "cost_file": "autotuner/data/alexnet2_canny_hpvm/op_cost.txt"
+  },
+  "resnet18_torch": {
+    "model_name": "resnet18_torch",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/resnet18_cifar10_torch/",
+    "layer_file": "autotuner/data/resnet18_torch/resnet_layers.txt",
+    "cost_file": "autotuner/data/resnet18_torch/op_cost.txt"
+  },
+  "vgg16_torch": {
+    "model_name": "vgg16_torch",
+    "autotuner_runs": 10000,
+    "base_dir": "tuner_results/resnet18_cifar10_torch/",
+    "layer_file": "autotuner/data/resnet/resnet_layers.txt",
+    "cost_file": "autotuner/data/resnet/op_cost.txt"
+  }
+}
\ No newline at end of file
diff --git a/llvm/projects/pred_tuner/utils/config.py b/llvm/projects/pred_tuner/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..fced1a4d462ad9bb4c828f2bbc264bb4b4755081
--- /dev/null
+++ b/llvm/projects/pred_tuner/utils/config.py
@@ -0,0 +1,318 @@
+from pathlib import Path
+from typing import Dict, Iterable, List, Union
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from models.domains import QoS
+from models.domains.qoses import Accuracy, AccuracyPSNR
+from .utils import get_knob_config_file
+
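+# Maps op names appearing in the layer-description files (e.g. layers.txt) to the
+# op names used in the HPVM runtime configuration format.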
+op_mapping = {
+    "conv": "conv", "depthwise_conv": "group_conv", "dense": "mul", "batchnorm": "batchnorm",
+    "pool": "pool_max", "pool_mean": "pool_mean", "activation": "relu", "tanh": "tanh", "add": "add",
+    "reduce": "red_samp"
+}
+
+approx_map: Dict[str, str] = {}
+PathLike = Union[str, Path]
+
+
+def initializeApproxMap(knobs_file_path):
+    # Each line's first (tab-separated) column is "<approx_type>,<knob_id>".
+    with open(knobs_file_path, "r") as f:
+        for line in f:
+            approx_type, knob_id = line.split("\t")[0].split(",")[:2]
+            approx_map[knob_id] = f"{approx_type} {knob_id}"
+
+
+initializeApproxMap(get_knob_config_file())
+
+# TODO: fix hardcoding
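+# Knob ids 121-138 and 231-248 are fp32 knobs whose fp16 counterparts in
+# global_knobs.txt appear to be offset by +30 (e.g. 135 -> 165); ids 11 and 12
+# are taken to be the fp32 and fp16 baseline knobs.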
+fp32_to_fp16 = {
+    **{k: k + 30 for k in range(121, 138 + 1)},
+    **{k: k + 30 for k in range(231, 248 + 1)},
+    11: 12
+}
+fp16_to_fp32 = {v: k for k, v in fp32_to_fp16.items()}
+
+
+class Config:
+    def __init__(
+            self, avg_accuracy: QoS, baseline_accuracy: QoS, fname: str, flags: List[int],
+            total_runs: int, confidence: float, config_cost: float, speedup: float
+    ):
+        self.total_runs = total_runs
+        self.confidence = confidence
+        self.config_cost = config_cost
+        self.speedup = speedup
+        self.avg_qos = avg_accuracy
+        self.baseline_qos = baseline_accuracy
+        self.fname = fname
+        self.flags = flags
+        self.avg_loss = self.avg_loss.min_positive_loss()
+
+    @property
+    def avg_loss(self):
+        return self.baseline_qos - self.avg_qos
+
+    @avg_loss.setter
+    def avg_loss(self, value: QoS):
+        self.avg_qos = self.baseline_qos - value
+
+    def __repr__(self):
+        return repr((self.fname, self.speedup, self.avg_qos, self.avg_loss, self.flags))
+
+    @staticmethod
+    def qos_speedup_points(configs: Iterable['Config']) -> np.ndarray:
+        return np.array([[*conf.avg_qos.numpy(), conf.speedup] for conf in configs])
+
+    def update_acc(self, acc: QoS, confidence: float, baseline_acc: QoS = None):
+        if baseline_acc:
+            self.baseline_qos = baseline_acc
+        self.avg_qos = acc
+        self.avg_loss = self.avg_loss.min_positive_loss()
+        self.confidence = confidence
+
+    def to_fp16(self) -> 'Config':
+        import copy
+        fp16_conf = copy.copy(self)
+        fp16_conf.flags = [fp32_to_fp16.get(x, x) for x in self.flags]
+        return fp16_conf
+
+    def to_fp32(self) -> 'Config':
+        import copy
+        fp32_conf = copy.copy(self)
+        fp32_conf.flags = [fp16_to_fp32.get(x, x) for x in self.flags]
+        return fp32_conf
+
+    def to_rt_format(self, idx: int, bench_layer_composition, hardware_target: str):
+        config_str = build_config_str(self.flags, bench_layer_composition, hardware_target)
+        return (
+            "+++++\n"
+            f"conf{idx} {self.speedup} 0 {self.avg_qos} {self.avg_loss}\n"
+            f"{config_str}"
+            "-----\n"
+        )
+
+    def to_tuner_format(self):
+        topline = (
+            f"total_runs={self.total_runs}\tconfidence={self.confidence}\t"
+            f"avg_accuracy={self.avg_qos}\tconfig_cost={self.config_cost}\tspeedup={self.speedup}"
+        )
+        flags_lines = [str(x) for x in self.flags]
+        return '\n'.join([topline] + flags_lines)
+
+    @classmethod
+    def from_tuner_format(cls, lines: List[str], fname: str, baseline_accuracy: QoS):
+        def parseTopLine(x: str) -> Dict[str, str]:
+            toks = x.split()
+            fields = {}
+            for tok in toks:
+                field, value = tok.split('=')
+                fields[field] = value
+            return fields
+
+        top_line = parseTopLine(lines[0])
+        total_runs = int(top_line['total_runs'])
+        confidence = float(top_line['confidence'])
+        avg_accuracy = baseline_accuracy.parse(top_line['avg_accuracy'])
+        config_cost = float(top_line['config_cost'])
+        speedup = float(top_line['speedup'])
+        flags = [int(line.strip()) for line in lines[1:] if line.strip()]
+        return cls(avg_accuracy, baseline_accuracy, fname, flags, total_runs, confidence, config_cost, speedup)
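+
+    # Sketch of the tuner file format that `to_tuner_format` writes and
+    # `from_tuner_format` parses (values illustrative; top-line fields are
+    # tab-separated, followed by one knob id per layer):
+    #
+    #   total_runs=100  confidence=95.0  avg_accuracy=84.5  config_cost=1234.5  speedup=1.5
+    #   151
+    #   11
+    #   158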
+
+
+def genScatterPlotFromConfigs(configs, file_path):
+    speedups, accuracy_losses = [c.speedup for c in configs], [c.avg_loss for c in configs]
+    plt.scatter(accuracy_losses, speedups)
+    plt.xlabel("accuracy_loss")
+    plt.ylabel("speedup")
+    plt.xlim(left=-0.05)
+    plt.ylim(bottom=1)
+    plt.savefig(file_path)
+    plt.close()
+
+
+def _find_distance_to(points: np.ndarray, ref_points: np.ndarray) -> np.ndarray:
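+    """Distance of each point in `points` to the piecewise-linear curve through `ref_points`.
+
+    For each point, the segment whose x-range contains the point's x is selected,
+    and the perpendicular distance to the line through that segment is returned.
+    """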
+    n_ref = len(ref_points)
+    if n_ref == 0:
+        return np.zeros(0)
+    if n_ref == 1:
+        return np.linalg.norm(points - ref_points, axis=1)
+    ref_points = np.array(sorted(ref_points, key=lambda p: p[0]))
+    px = points.T[0]
+    rx = ref_points.T[0]
+    local_unit_vecs = ref_points[1:] - ref_points[:-1]
+    dists = []
+    bins = np.digitize(px, rx) - 1
+    for point, left_ref_p in zip(points, bins):
+        if left_ref_p == -1:
+            left_ref_p = 0
+        to_left_ref = ref_points[left_ref_p] - point
+        local_unit_vec = local_unit_vecs[-1] if left_ref_p >= n_ref - 1 else local_unit_vecs[left_ref_p]
+        projection = np.dot(local_unit_vec, to_left_ref) / np.linalg.norm(local_unit_vec)
+        dist = np.sqrt(np.linalg.norm(to_left_ref) ** 2 - projection ** 2)
+        dists.append(dist)
+    return np.array(dists)
+
+
+def is_pareto_efficient(
+        configs: List[Config], margin: float = None,
+        ratio: float = None, n_min: int = None, n_max: int = None
+) -> List[Config]:
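+    """Keep the Pareto-efficient configs in the (QoS, speedup) plane, plus near-frontier configs.
+
+    Near-frontier configs are those within distance `margin` of the frontier, or,
+    if `ratio` is given, the nearest ones such that roughly a `ratio` fraction of the
+    non-frontier configs (clamped to [n_min, n_max]) is kept in total.
+    """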
+    configs = np.array(configs)
+    acc_speedup = Config.qos_speedup_points(configs)
+    is_efficient = np.ones(acc_speedup.shape[0], dtype=bool)
+    for idx, c in enumerate(acc_speedup):
+        if is_efficient[idx]:
+            # Keep any point with a higher value
+            is_efficient[is_efficient] = np.any(acc_speedup[is_efficient] > c, axis=1)
+            is_efficient[idx] = True  # And keep self
+    pareto_acc_speedup = acc_speedup[is_efficient]
+    pareto_configs = configs[is_efficient]
+    non_pareto_acc_speedup = acc_speedup[np.logical_not(is_efficient)]
+    non_pareto_configs = configs[np.logical_not(is_efficient)]
+    dist_to_pareto = _find_distance_to(non_pareto_acc_speedup, pareto_acc_speedup)
+    if margin is not None:
+        marginal_accepted = non_pareto_configs[dist_to_pareto < margin]
+    elif ratio is not None:
+        dist_order = np.argsort(dist_to_pareto)
+        take_n = int(len(dist_to_pareto) * ratio)
+        if n_min is not None:
+            take_n = max(take_n, n_min)
+        if n_max is not None:
+            take_n = min(take_n, n_max)
+        take_n -= len(pareto_configs)
+        marginal_accepted = non_pareto_configs[dist_order[:take_n]]
+    else:
+        raise ValueError("Must provide margin or ratio")
+    return pareto_configs.tolist() + marginal_accepted.tolist()
+
+
+def print_layer_info(flag: int, hardware_target: str, layer_comp):
+    approx_tech = approx_map[str(flag)]
+    if flag <= 7:
+        # If is PROMISE
+        return f"promise {approx_tech}"
+    # If is GPU / CPU
+    op0 = op_mapping[layer_comp[0]]
+    config_str = f"{hardware_target} {op0} {approx_tech} "
+    for op in layer_comp[1:]:
+        op_name = op_mapping[op]
+        fp = "fp32" if is_fp32(flag) else "fp16"
+        config_str += f"{op_name} {fp} 1 "
+    return config_str
+
+
+def build_config_str(flags: List[int], layer_desc: List[List[str]], hardware_target: str):
+    lines = []
+    assert len(flags) == len(layer_desc)
+    for index, (flag, layer_comp) in enumerate(zip(flags, layer_desc), start=1):
+        layer_str = print_layer_info(flag, hardware_target, layer_comp)
+        config_str = f"{index} {layer_str}"
+        lines.append(config_str)
+    lines.append(f"{len(layer_desc) + 1} {hardware_target} softmax fp32 1\n")
+    return '\n'.join(lines)
+
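+# Illustrative output of `build_config_str` for a two-layer network targeting
+# the GPU (layer ops and knob ids are examples, not from a real model):
+#
+#   1 gpu conv perf_fp16 151 add fp16 1 tanh fp16 1
+#   2 gpu mul fp16 12 add fp16 1
+#   3 gpu softmax fp32 1
+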
+
+def is_fp32(flag: int):
+    return flag in fp32_to_fp16
+
+
+def dump_configs_to_rt(
+        layer_desc, configs: List[Config],
+        config_out_path: PathLike, baseline_acc: QoS, hardware_target: str
+):
+    baseline_flag = 11
+    baseline_config = Config(
+        baseline_acc, baseline_acc, '', [baseline_flag for _ in layer_desc],
+        1, 100.0, 0.0, 1.0
+    )
+    baseline_str = baseline_config.to_rt_format(1, layer_desc, hardware_target)
+    with Path(config_out_path).open("w") as f:
+        f.write(baseline_str)
+        for it, config in enumerate(configs, start=2):
+            f.write(config.to_rt_format(it, layer_desc, hardware_target))
+
+
+# Public Interfaces
+def dump_rt_format_to(
+        layer_desc, configs: List[Config], gold_acc: QoS,
+        rt_cpu_path: PathLike = None, rt_gpu_path: PathLike = None
+):
+    if configs:
+        assert len(set([conf.baseline_qos for conf in configs])) == 1
+    # Sort configs
+    sorted_configs = sorted(configs, key=lambda conf: (conf.avg_loss, conf.speedup, conf.flags))
+    if rt_gpu_path is not None:
+        # Remap to fp16 for gpu.
+        fp16_configs = [conf.to_fp16() for conf in sorted_configs]
+        dump_configs_to_rt(
+            layer_desc, fp16_configs, rt_gpu_path, gold_acc, 'gpu'
+        )
+    if rt_cpu_path is not None:
+        # Remap to fp32 for cpu.
+        fp32_configs = [conf.to_fp32() for conf in sorted_configs]
+        dump_configs_to_rt(
+            layer_desc, fp32_configs, rt_cpu_path, gold_acc, 'cpu'
+        )
+
+
+def plot_configs(file_path: Path, **kw_configs: List[Config]):
+    from mpl_toolkits.mplot3d import Axes3D
+    # Decide 2D or 3D plot:
+    qos_type = None
+    for label, confs in kw_configs.items():
+        if not confs:
+            continue
+        if not qos_type:
+            qos_type = type(confs[0].avg_qos)
+        else:
+            assert qos_type == type(confs[0].avg_qos)
+    if qos_type is None:
+        return
+    if qos_type is AccuracyPSNR:
+        fig: plt.Figure = plt.figure()
+        ax: Axes3D = fig.add_subplot(111, projection='3d')
+        for label, confs in kw_configs.items():
+            data = np.array([
+                [c.avg_loss.qoses[0].to_scalar(), c.avg_qos.qoses[1].to_scalar(), c.speedup]
+                for c in confs]
+            )
+            x, y, z = data.T
+            ax.scatter(x, y, z, label=label)
+        ax.set_xlabel("accuracy_loss")
+        ax.set_ylabel("psnr")
+        ax.set_zlabel("speedup")
+        ax.set_xlim(left=-0.05)
+        ax.set_zlim(bottom=1)
+    elif qos_type is Accuracy:
+        fig, ax = plt.subplots()
+        fig: plt.Figure
+        ax: plt.Axes
+        for label, confs in kw_configs.items():
+            data = np.array([[c.avg_loss.to_scalar(), c.speedup] for c in confs])
+            x, y = data.T
+            ax.scatter(x, y, label=label)
+        ax.set_xlabel("accuracy_loss")
+        ax.set_ylabel("speedup")
+        ax.set_xlim(left=-0.05)
+        ax.set_ylim(bottom=1)
+    else:
+        raise ValueError(f"QoS type {qos_type} unsupported in plotting.")
+    ax.legend()
+    fig.savefig(file_path)
+    plt.close(fig)
+
+
+def load_configs_from_dir(result_dir: PathLike, baseline_accuracy: QoS):
+    config_arr = []
+    for path in Path(result_dir).glob('*'):
+        with path.open() as f:
+            lines = f.readlines()
+        config_arr.append(Config.from_tuner_format(lines, path.name, baseline_accuracy))
+    return config_arr
diff --git a/llvm/projects/pred_tuner/utils/logging.py b/llvm/projects/pred_tuner/utils/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b6904bd2e0a0683ccc6905994f645fa6856ad4d
--- /dev/null
+++ b/llvm/projects/pred_tuner/utils/logging.py
@@ -0,0 +1,87 @@
+import logging
+from logging import config
+from pathlib import Path
+
+import tqdm
+
+
+class TqdmStreamHandler(logging.Handler):
+    """tqdm-friendly logging handler. Uses tqdm.write instead of print for logging."""
+
+    def __init__(self, level=logging.NOTSET):
+        super().__init__(level)
+
+    def emit(self, record):
+        try:
+            msg = self.format(record)
+            tqdm.tqdm.write(msg)
+            self.flush()
+        except (KeyboardInterrupt, SystemExit, RecursionError):
+            raise
+        except:
+            self.handleError(record)
+
+
+_last_applied_config = None
+
+
+def config_pylogger(filename: str = None, output_dir: Path = None, verbose: bool = False) -> logging.Logger:
+    """Configure the Python logger.
+
+    Each execution of the application gets a unique log file. By default the file is
+    named with the current date and time so that log files sort by recency; a filename
+    and an output directory may also be given explicitly.
+    """
+    import time
+    timestr = time.strftime("%Y.%m.%d-%H%M%S")
+    filename = filename or timestr
+    output_dir = output_dir or Path('.')
+    output_dir.mkdir(parents=True, exist_ok=True)
+    file_path = output_dir / filename
+
+    global _last_applied_config
+    _last_applied_config = d = {
+        'version': 1,
+        'disable_existing_loggers': False,
+        'formatters': {
+            'simple': {
+                'format': '%(levelname)s %(name)s: '
+                          '%(message)s'
+            },
+            'detailed': {
+                'format': '[%(asctime)-15s] '
+                          '%(levelname)7s %(name)s: '
+                          '%(message)s '
+                          '@%(filename)s:%(lineno)d'
+            }
+        },
+        'handlers': {
+            'console': {
+                '()': TqdmStreamHandler,
+                'level': 'INFO',
+                'formatter': 'simple'
+            },
+            'file': {
+                'class': 'logging.FileHandler',
+                'filename': file_path.as_posix(),
+                'mode': 'a',  # Because we may apply this config again, want to keep existing content
+                'formatter': 'detailed',
+            },
+        },
+        'root': {
+            'level': 'DEBUG' if verbose else 'INFO',
+            'handlers': ['console', 'file']
+        },
+    }
+    config.dictConfig(d)
+
+    msglogger = logging.getLogger()
+    msglogger.info(f"Log file for this run: {file_path}")
+    return msglogger
+
+
+def reapply_last_config():
+    if _last_applied_config is not None:
+        config.dictConfig(_last_applied_config)
diff --git a/llvm/projects/pred_tuner/utils/utils.py b/llvm/projects/pred_tuner/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..16165574662ca91320784f827468002fbae21fa8
--- /dev/null
+++ b/llvm/projects/pred_tuner/utils/utils.py
@@ -0,0 +1,26 @@
+import logging
+import os
+from pathlib import Path
+
+import torch
+
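+# Use the last visible CUDA device when GPUs are available, otherwise fall back to CPU.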
+device = f'cuda:{torch.cuda.device_count() - 1}' if torch.cuda.is_available() else 'cpu'
+n_cpu_threads = 12 if device == 'cuda:0' else 35
+torch.set_num_threads(n_cpu_threads)
+
+msg_logger = logging.getLogger(__name__)
+
+
+def gpu_mem_mb():
+    # noinspection PyTypeChecker
+    return torch.cuda.memory_allocated(device) / 1024 ** 2
+
+
+def get_tensorrt_dir() -> Path:
+    if 'LLVM_SRC_ROOT' not in os.environ:
+        return Path('.')
+    return Path(os.environ['LLVM_SRC_ROOT']) / "projects/hpvm-tensor-rt"
+
+
+def get_knob_config_file() -> Path:
+    return get_tensorrt_dir() / "autotuner/data/global_knobs.txt"