diff --git a/examples/agp-pruning/word_lang_model.LARGE_70.schedule_agp.yaml b/examples/agp-pruning/word_lang_model.LARGE_70.schedule_agp.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..e5958314024b6269d99c4a3413ac4f960b22e8c8
--- /dev/null
+++ b/examples/agp-pruning/word_lang_model.LARGE_70.schedule_agp.yaml
@@ -0,0 +1,57 @@
+# Fine-grained (element-wise) pruning of PyTorch's word language model example, using Automated Gradual Pruner (AGP) scheduling.
+#
+# The README of PyTorch's word language model example promises that this configuration will produce a test perplexity
+# of 72.30, but I was only able to reach 84.23, so I use that as the baseline for comparison.
+#
+# time python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied
+#
+# =========================================================================================
+# | End of training | test loss 4.43 | test ppl 84.23
+# =========================================================================================
+#
+# To save you time, you can download a pretrained model from here:
+# https://s3-us-west-1.amazonaws.com/nndistiller/agp-pruning/word_language_model/model.emsize1500.nhid1500.dropout065.tied.pt
+#
+# With the same configuration and the pruning schedule below, we get comparable perplexity results:
+#
+# python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied --compress=../../examples/agp-pruning/word_lang_model.LARGE_70.schedule_agp.yaml
+#
+
+version: 1
+pruners:
+  l0_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.70
+    weights: [rnn.weight_ih_l0, rnn.weight_hh_l0]
+
+  l1_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.70
+    weights: [rnn.weight_ih_l1, rnn.weight_hh_l1]
+
+  embedding_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.70
+    weights: [encoder.weight]
+
+policies:
+  - pruner:
+      instance_name : l0_rnn_pruner
+    starting_epoch: 2
+    ending_epoch: 20
+    frequency: 1
+
+  - pruner:
+      instance_name : l1_rnn_pruner
+    starting_epoch: 2
+    ending_epoch: 20
+    frequency: 1
+
+  - pruner:
+      instance_name : embedding_pruner
+    starting_epoch: 3
+    ending_epoch: 21
+    frequency: 1
diff --git a/examples/agp-pruning/word_lang_model.LARGE_70B.schedule_agp.yaml b/examples/agp-pruning/word_lang_model.LARGE_70B.schedule_agp.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..bf3a53f1dd4bca350306a93d0a7f5dcdbf7195f3
--- /dev/null
+++ b/examples/agp-pruning/word_lang_model.LARGE_70B.schedule_agp.yaml
@@ -0,0 +1,57 @@
+# Fine-grained (element-wise) pruning of PyTorch's word language model example, using Automated Gradual Pruner (AGP) scheduling.
+#
+# The README of PyTorch's word language model example promises that this configuration will produce a test perplexity
+# of 72.30, but I was only able to reach 84.23, so I use that as the baseline for comparison.
+#
+# time python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied
+#
+# =========================================================================================
+# | End of training | test loss 4.43 | test ppl 84.23
+# =========================================================================================
+#
+# To save you time, you can download a pretrained model from here:
+# https://s3-us-west-1.amazonaws.com/nndistiller/agp-pruning/word_language_model/model.emsize1500.nhid1500.dropout065.tied.pt
+#
+# With the same configuration and the pruning schedule below, we get comparable perplexity results:
+#
+# python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied --compress=../../examples/agp-pruning/word_lang_model.LARGE_70B.schedule_agp.yaml
+#
+
+version: 1
+pruners:
+  l0_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.50
+    weights: [rnn.weight_ih_l0, rnn.weight_hh_l0]
+
+  l1_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.50
+    weights: [rnn.weight_ih_l1, rnn.weight_hh_l1]
+
+  embedding_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.85
+    weights: [encoder.weight]
+
+policies:
+  - pruner:
+      instance_name : l0_rnn_pruner
+    starting_epoch: 1
+    ending_epoch: 25
+    frequency: 3
+
+  - pruner:
+      instance_name : l1_rnn_pruner
+    starting_epoch: 2
+    ending_epoch: 25
+    frequency: 3
+
+  - pruner:
+      instance_name : embedding_pruner
+    starting_epoch: 3
+    ending_epoch: 26
+    frequency: 3
diff --git a/examples/agp-pruning/word_lang_model.LARGE_80.schedule_agp.yaml b/examples/agp-pruning/word_lang_model.LARGE_80.schedule_agp.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..d86eedd71980c9e99be3baba79ef267a1d31a514
--- /dev/null
+++ b/examples/agp-pruning/word_lang_model.LARGE_80.schedule_agp.yaml
@@ -0,0 +1,57 @@
+# Fine-grained (element-wise) pruning of PyTorch's word language model example, using Automated Gradual Pruner (AGP) scheduling.
+#
+# The README of PyTorch's word language model example promises that this configuration will produce a test perplexity
+# of 72.30, but I was only able to reach 84.23, so I use that as the baseline for comparison.
+#
+# time python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied
+#
+# =========================================================================================
+# | End of training | test loss 4.43 | test ppl 84.23
+# =========================================================================================
+#
+# To save you time, you can download a pretrained model from here:
+# https://s3-us-west-1.amazonaws.com/nndistiller/agp-pruning/word_language_model/model.emsize1500.nhid1500.dropout065.tied.pt
+#
+# With the same configuration and the pruning schedule below, we get comparable perplexity results:
+#
+# python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied --compress=../../examples/agp-pruning/word_lang_model.LARGE_80.schedule_agp.yaml
+#
+
+version: 1
+pruners:
+  l0_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.74
+    weights: [rnn.weight_ih_l0, rnn.weight_hh_l0]
+
+  l1_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.74
+    weights: [rnn.weight_ih_l1, rnn.weight_hh_l1]
+
+  embedding_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.85
+    weights: [encoder.weight]
+
+policies:
+  - pruner:
+      instance_name : l0_rnn_pruner
+    starting_epoch: 1
+    ending_epoch: 25
+    frequency: 3
+
+  - pruner:
+      instance_name : l1_rnn_pruner
+    starting_epoch: 2
+    ending_epoch: 25
+    frequency: 3
+
+  - pruner:
+      instance_name : embedding_pruner
+    starting_epoch: 3
+    ending_epoch: 26
+    frequency: 3
diff --git a/examples/agp-pruning/word_lang_model.LARGE_90.schedule_agp.yaml b/examples/agp-pruning/word_lang_model.LARGE_90.schedule_agp.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..307233199be9cc22d8cba0db0af0cbef7e034065
--- /dev/null
+++ b/examples/agp-pruning/word_lang_model.LARGE_90.schedule_agp.yaml
@@ -0,0 +1,57 @@
+# Fine-grained (element-wise) pruning of PyTorch's word language model example, using Automated Gradual Pruner (AGP) scheduling.
+#
+# The README of PyTorch's word language model example promises that this configuration will produce a test perplexity
+# of 72.30, but I was only able to reach 84.23, so I use that as the baseline for comparison.
+#
+# time python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied
+#
+# =========================================================================================
+# | End of training | test loss 4.43 | test ppl 84.23
+# =========================================================================================
+#
+# To save you time, you can download a pretrained model from here:
+# https://s3-us-west-1.amazonaws.com/nndistiller/agp-pruning/word_language_model/model.emsize1500.nhid1500.dropout065.tied.pt
+#
+# With the same configuration and the pruning schedule below, we get comparable perplexity results:
+#
+# python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied --compress=../../examples/agp-pruning/word_lang_model.LARGE_90.schedule_agp.yaml
+#
+
+version: 1
+pruners:
+  l0_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.90
+    weights: [rnn.weight_ih_l0, rnn.weight_hh_l0]
+
+  l1_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.90
+    weights: [rnn.weight_ih_l1, rnn.weight_hh_l1]
+
+  embedding_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.90
+    weights: [encoder.weight]
+
+policies:
+  - pruner:
+      instance_name : l0_rnn_pruner
+    starting_epoch: 1
+    ending_epoch: 25
+    frequency: 3
+
+  - pruner:
+      instance_name : l1_rnn_pruner
+    starting_epoch: 2
+    ending_epoch: 25
+    frequency: 3
+
+  - pruner:
+      instance_name : embedding_pruner
+    starting_epoch: 3
+    ending_epoch: 26
+    frequency: 3
diff --git a/examples/agp-pruning/word_lang_model.LARGE_95.schedule_agp.yaml b/examples/agp-pruning/word_lang_model.LARGE_95.schedule_agp.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..e2e9c4c235c700b9319dbee8ae19f51642a7aeb1
--- /dev/null
+++ b/examples/agp-pruning/word_lang_model.LARGE_95.schedule_agp.yaml
@@ -0,0 +1,57 @@
+# Fine-grained (element-wise) pruning of PyTorch's word language model example, using Automated Gradual Pruner (AGP) scheduling.
+#
+# The README of PyTorch's word language model example promises that this configuration will produce a test perplexity
+# of 72.30, but I was only able to reach 84.23, so I use that as the baseline for comparison.
+#
+# time python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied
+#
+# =========================================================================================
+# | End of training | test loss 4.43 | test ppl 84.23
+# =========================================================================================
+#
+# To save you time, you can download a pretrained model from here:
+# https://s3-us-west-1.amazonaws.com/nndistiller/agp-pruning/word_language_model/model.emsize1500.nhid1500.dropout065.tied.pt
+#
+# With the same configuration and the pruning schedule below, we get comparable perplexity results:
+#
+# python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied --compress=../../examples/agp-pruning/word_lang_model.LARGE_95.schedule_agp.yaml
+#
+
+version: 1
+pruners:
+  l0_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.95
+    weights: [rnn.weight_ih_l0, rnn.weight_hh_l0]
+
+  l1_rnn_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.95
+    weights: [rnn.weight_ih_l1, rnn.weight_hh_l1]
+
+  embedding_pruner:
+    class: AutomatedGradualPruner
+    initial_sparsity : 0.05
+    final_sparsity: 0.95
+    weights: [encoder.weight]
+
+policies:
+  - pruner:
+      instance_name : l0_rnn_pruner
+    starting_epoch: 1
+    ending_epoch: 25
+    frequency: 3
+
+  - pruner:
+      instance_name : l1_rnn_pruner
+    starting_epoch: 2
+    ending_epoch: 25
+    frequency: 3
+
+  - pruner:
+      instance_name : embedding_pruner
+    starting_epoch: 3
+    ending_epoch: 26
+    frequency: 3
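
All five schedules above share the same three pruners (the two stacked LSTM layers and the encoder.weight embedding) and differ only in their final sparsity targets and in how often the pruning masks are recomputed. To make the meaning of initial_sparsity, final_sparsity, starting_epoch, ending_epoch and frequency concrete, here is a minimal sketch of the cubic sparsity ramp from Zhu & Gupta's "To prune, or not to prune" paper, which the AutomatedGradualPruner follows. The helper name agp_sparsity and the quantization of progress to whole pruning steps are illustrative assumptions, not part of Distiller's API.

# Minimal sketch of the AGP cubic sparsity schedule (Zhu & Gupta, 2017).
# `agp_sparsity` is a hypothetical helper used here for illustration only;
# it is not a function provided by Distiller.

def agp_sparsity(epoch, initial_sparsity, final_sparsity,
                 starting_epoch, ending_epoch, frequency):
    """Target sparsity for a given epoch under the cubic AGP ramp."""
    if epoch < starting_epoch:
        return 0.0                      # pruning has not started yet
    epoch = min(epoch, ending_epoch)    # the mask is held at final_sparsity afterwards
    # Fraction of the scheduled pruning steps that have already run.
    steps_done = (epoch - starting_epoch) // frequency
    total_steps = (ending_epoch - starting_epoch) // frequency
    progress = steps_done / total_steps
    return final_sparsity + (initial_sparsity - final_sparsity) * (1.0 - progress) ** 3


if __name__ == "__main__":
    # LARGE_70 RNN pruners: 5% -> 70% sparsity between epochs 2 and 20, pruning every epoch.
    for epoch in range(2, 21):
        print(epoch, round(agp_sparsity(epoch, 0.05, 0.70, 2, 20, 1), 3))

Running the sketch with the LARGE_70 RNN parameters shows the characteristic AGP shape: roughly half of the weights are already masked a third of the way through the schedule, and the final epochs add only small increments as the target of 0.70 is approached, so most of the pruning happens while the network still has spare capacity and the last steps act more like fine-tuning.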