From e65ec8fce890049fb421aa7d9d32cad5b075cd87 Mon Sep 17 00:00:00 2001
From: Guy Jacob <guy.jacob@intel.com>
Date: Wed, 7 Aug 2019 13:42:59 +0300
Subject: [PATCH] Add experiment details in AlexNet BN yamls (FP32 and DoReFa)

---
 .../fp32_baselines/alexnet_bn_base_fp32.yaml  | 26 ++++++++++++++
 .../quant_aware_train/alexnet_bn_dorefa.yaml  | 34 ++++++++++++++++++-
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/examples/quantization/fp32_baselines/alexnet_bn_base_fp32.yaml b/examples/quantization/fp32_baselines/alexnet_bn_base_fp32.yaml
index 89b5723..a48f1a5 100644
--- a/examples/quantization/fp32_baselines/alexnet_bn_base_fp32.yaml
+++ b/examples/quantization/fp32_baselines/alexnet_bn_base_fp32.yaml
@@ -1,3 +1,29 @@
+# Scheduler for training an FP32 baseline of AlexNet with Batch-Norm
+#
+# IMPORTANT NOTES:
+# ----------------
+#  1. Note that this is not the original AlexNet, but AlexNet with batch normalization layers.
+#     See the model implementation in <distiller-root>/distiller/models/imagenet/alexnet_batchnorm.py
+#  2. The best results are achieved with the Adam optimizer. As-is, the image classification sample
+#     uses SGD and the optimizer is not configurable, so we need to edit the code:
+#     * Open <distiller-root>/distiller/apputils/image_classifier.py
+#     * In the function "_init_learner(args)", find the following snippet:
+#         optimizer = torch.optim.SGD(model.parameters(),
+#             lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
+#       And replace it with:
+#         optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
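+#     If you prefer to keep SGD available as well, a possible variation of this edit (purely a
+#     sketch, using a hypothetical 'args.optimizer' attribute that you would add yourself; it is
+#     not an existing flag in the sample) is:
+#         # Hypothetical switch between Adam and SGD; defaults to Adam when the attribute is absent
+#         if getattr(args, 'optimizer', 'adam') == 'adam':
+#             optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
+#         else:
+#             optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
+#                                         weight_decay=args.weight_decay)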
+#
+# Command line for training (running from the compress_classifier.py directory):
+# python compress_classifier.py --arch alexnet_bn <path_to_imagenet_dataset> -p=50 --epochs=110 --compress=../quantization/fp32_baselines/alexnet_bn_base_fp32.yaml -j 22 --lr 0.0002 --wd 0.0001 --vs 0
+#
+# After 110 epochs we get:
+# --- test ---------------------
+# 50000 samples (256 per mini-batch)
+# Test: [   50/  195]    Loss 1.608579    Top1 61.757812    Top5 83.789062
+# Test: [  100/  195]    Loss 1.605091    Top1 61.964844    Top5 83.988281
+# Test: [  150/  195]    Loss 1.612654    Top1 61.950521    Top5 83.864583
+# ==> Top1: 61.914    Top5: 83.838    Loss: 1.618
+
 lr_schedulers:
   training_lr:
     class: MultiStepLR
diff --git a/examples/quantization/quant_aware_train/alexnet_bn_dorefa.yaml b/examples/quantization/quant_aware_train/alexnet_bn_dorefa.yaml
index 7a955d2..36d1927 100644
--- a/examples/quantization/quant_aware_train/alexnet_bn_dorefa.yaml
+++ b/examples/quantization/quant_aware_train/alexnet_bn_dorefa.yaml
@@ -1,8 +1,40 @@
+# Scheduler for training AlexNet with Batch-Norm, quantized using the DoReFa scheme
+# Activations: 8-bits, Weights: 4-bits
+# See:
+#  https://nervanasystems.github.io/distiller/algo_quantization/index.html#dorefa
+#  https://arxiv.org/abs/1606.06160
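+#
+# For intuition, here is a rough PyTorch sketch of the k-bit weight quantization described in the
+# paper. It is illustrative only: the DorefaQuantizer configured below applies the equivalent
+# transformation internally, and this is not how the quantizer is invoked.
+#
+#   import torch
+#
+#   def quantize_k(x, k):
+#       # Uniformly quantize a tensor with values in [0, 1] to 2^k discrete levels
+#       n = 2 ** k - 1
+#       return torch.round(x * n) / n
+#
+#   def dorefa_quantize_weights(w, k=4):
+#       # Map weights into [0, 1] via tanh normalization, quantize, then map back to [-1, 1]
+#       w_norm = torch.tanh(w) / (2 * torch.max(torch.abs(torch.tanh(w)))) + 0.5
+#       return 2 * quantize_k(w_norm, k) - 1
+#
+#   # Activations are handled similarly: quantize_k is applied to activations clipped to [0, 1]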
+#
+# IMPORTANT NOTES:
+# ----------------
+#  1. Note that this is not the original AlexNet, but AlexNet with batch normalization layers.
+#     See the model implementation in <distiller-root>/distiller/models/imagenet/alexnet_batchnorm.py
+#  2. The best results are achieved with the Adam optimizer. As-is, the image classification sample
+#     uses SGD and the optimizer is not configurable, so we need to edit the code:
+#     * Open <distiller-root>/distiller/apputils/image_classifier.py
+#     * In the function "_init_learner(args)", find the following snippet:
+#         optimizer = torch.optim.SGD(model.parameters(),
+#             lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
+#       And replace it with:
+#         optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
+#
+# Command line for training (running from the compress_classifier.py directory):
+# python compress_classifier.py --arch alexnet_bn <path_to_imagenet_dataset> -p=50 --epochs=110 --compress=../quantization/quant_aware_train/alexnet_bn_dorefa.yaml -j 22 --lr 0.0002 --wd 0.0001 --vs 0
+#
+# After 110 epochs we get:
+# --- test ---------------------
+# 50000 samples (256 per mini-batch)
+# Test: [   50/  195]    Loss 1.645975    Top1 61.453125    Top5 83.437500
+# Test: [  100/  195]    Loss 1.635810    Top1 61.507812    Top5 83.445312
+# Test: [  150/  195]    Loss 1.633975    Top1 61.479167    Top5 83.419271
+# ==> Top1: 61.498    Top5: 83.454    Loss: 1.635
+#
+# So the quantized model reaches a top-1 accuracy of 61.498%, compared to 61.914% for the FP32 baseline
+
 quantizers:
   dorefa_quantizer:
     class: DorefaQuantizer
     bits_activations: 8
-    bits_weights: 3
+    bits_weights: 4
     overrides:
     # Don't quantize first and last layer
       features.0:
-- 
GitLab