diff --git a/README.md b/README.md index ae4cd42ff147e94177fb225d1ed819bd59fd0a18..1499d123f6030c15f0bedaef0dbdad2ab2998083 100755 --- a/README.md +++ b/README.md @@ -72,8 +72,9 @@ Highlighted features: - The compression schedule is expressed in a YAML file so that a single file captures the details of experiments. This [dependency injection](https://en.wikipedia.org/wiki/Dependency_injection) design decouples the Distiller scheduler and library from future extensions of algorithms. * Quantization: - Automatic mechanism to transform existing models to quantized versions, with customizable bit-width configuration for different layers. No need to re-write the model for different quantization methods. - - Support for training with quantization in the loop + - Support for [training with quantization](https://nervanasystems.github.io/distiller/quantization/index.html#training-with-quantization) in the loop - One-shot 8-bit quantization of trained full-precision models +* Training with [knowledge distillation](https://nervanasystems.github.io/distiller/knowledge_distillation/index.html), in conjunction with the other available pruning / regularization / quantization methods. * Export statistics summaries using Pandas dataframes, which makes it easy to slice, query, display and graph the data. * A set of [Jupyter notebooks](https://nervanasystems.github.io/distiller/jupyter/index.html) to plan experiments and analyze compression results. The graphs and visualizations you see on this page originate from the included Jupyter notebooks. + Take a look at [this notebook](https://github.com/NervanaSystems/distiller/blob/master/jupyter/alexnet_insights.ipynb), which compares visual aspects of dense and sparse Alexnet models. diff --git a/distiller/__init__.py b/distiller/__init__.py index 6b0c21fa4da047803752ee32893c620d0efd5380..708d6288f5bcff7548095bdd255cb3bee332f9cd 100755 --- a/distiller/__init__.py +++ b/distiller/__init__.py @@ -23,6 +23,7 @@ from .sensitivity import * from .directives import * from .policy import * from .thinning import * +from .knowledge_distillation import KnowledgeDistillationPolicy, DistillationLossWeights #del utils del dict_config diff --git a/distiller/knowledge_distillation.py b/distiller/knowledge_distillation.py new file mode 100644 index 0000000000000000000000000000000000000000..20849cc1e6defa00251bcce55e334cab833581c9 --- /dev/null +++ b/distiller/knowledge_distillation.py @@ -0,0 +1,163 @@ +# +# Copyright (c) 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import torch +import torch.nn.functional as F +from collections import namedtuple + +from .policy import ScheduledTrainingPolicy, PolicyLoss, LossComponent + +DistillationLossWeights = namedtuple('DistillationLossWeights', + ['distill', 'student', 'teacher']) + + +def add_distillation_args(argparser, arch_choices=None, enable_pretrained=False): + """ + Helper function to make it easier to add command line arguments for knowledge distillation to any script + + Arguments: + argparser (argparse.ArgumentParser): Existing parser to which to add the arguments + arch_choices: Optional list of choices to be enforced by the parser for model selection + enable_pretrained (bool): Flag to enable/disable argument for "pre-trained" models. + """ + group = argparser.add_argument_group('Knowledge Distillation Training Arguments') + group.add_argument('--kd-teacher', choices=arch_choices, metavar='ARCH', + help='Model architecture for teacher model') + if enable_pretrained: + group.add_argument('--kd-pretrained', action='store_true', help='Use pre-trained model for teacher') + group.add_argument('--kd-resume', type=str, default='', metavar='PATH', + help='Path to checkpoint from which to load teacher weights') + group.add_argument('--kd-temperature', '--kd-temp', dest='kd_temp', type=float, default=1.0, metavar='TEMP', + help='Knowledge distillation softmax temperature') + group.add_argument('--kd-distill-wt', '--kd-dw', type=float, default=0.5, metavar='WEIGHT', + help='Weight for distillation loss (student vs. teacher soft targets)') + group.add_argument('--kd-student-wt', '--kd-sw', type=float, default=0.5, metavar='WEIGHT', + help='Weight for student vs. labels loss') + group.add_argument('--kd-teacher-wt', '--kd-tw', type=float, default=0.0, metavar='WEIGHT', + help='Weight for teacher vs. labels loss') + group.add_argument('--kd-start-epoch', type=int, default=0, metavar='EPOCH_NUM', + help='Epoch from which to enable distillation') + + +class KnowledgeDistillationPolicy(ScheduledTrainingPolicy): + """ + Policy which enables knowledge distillation from a teacher model to a student model, as presented in [1]. + + Notes: + 1. In addition to the standard policy callbacks, this class also provides a 'forward' function that must + be called instead of calling the student model directly as is usually done. This is needed to facilitate + running the teacher model in addition to the student, and for caching the logits for loss calculation. + 2. [TO BE ENABLED IN THE NEAR FUTURE] Option to train the teacher model in parallel with the student model, + described as "scheme A" in [2]. This can be achieved by passing teacher loss weight > 0. + 3. [1] proposes a weighted average between the different losses. We allow arbitrary weights to be assigned + to each loss. + + Arguments: + student_model (nn.Module): The student model, that is - the main model being trained. If only initialized with + random weights, this matches "scheme B" in [2]. If it has been bootstrapped with trained FP32 weights, + this matches "scheme C". + teacher_model (nn.Module): The teacher model from which soft targets are generated for knowledge distillation. + Usually this is a pre-trained model, however in the future it will be possible to train this model as well + (see Note 1 above) + temperature (float): Temperature value used when calculating soft targets and logits (see [1]). + loss_weights (DistillationLossWeights): Named tuple with 3 loss weights + (a) 'distill' for student predictions (default: 0.5) vs. 
teacher soft-targets
+            (b) 'student' for student predictions vs. true labels (default: 0.5)
+            (c) 'teacher' for teacher predictions vs. true labels (default: 0). Currently this is just a placeholder,
+                and cannot be set to a non-zero value.
+
+    [1] Hinton et al., Distilling the Knowledge in a Neural Network (https://arxiv.org/abs/1503.02531)
+    [2] Mishra and Marr, Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy
+        (https://arxiv.org/abs/1711.05852)
+
+    """
+    def __init__(self, student_model, teacher_model, temperature=1.0,
+                 loss_weights=DistillationLossWeights(0.5, 0.5, 0)):
+        super(KnowledgeDistillationPolicy, self).__init__()
+
+        if loss_weights.teacher != 0:
+            raise NotImplementedError('Using teacher vs. labels loss is not supported yet, '
+                                      'for now teacher loss weight must be set to 0')
+
+        self.active = False
+
+        self.student = student_model
+        self.teacher = teacher_model
+        self.temperature = temperature
+        self.loss_wts = loss_weights
+
+        self.last_students_logits = None
+        self.last_teacher_logits = None
+
+    def forward(self, *inputs):
+        """
+        Performs forward propagation through both the student and teacher models and caches the logits.
+        This function MUST be used instead of calling the student model directly.
+
+        Returns:
+            The student model's returned output, to be consistent with what a script using this would expect
+        """
+        if not self.active:
+            return self.student(*inputs)
+
+        if self.loss_wts.teacher == 0:
+            with torch.no_grad():
+                self.last_teacher_logits = self.teacher(*inputs)
+        else:
+            self.last_teacher_logits = self.teacher(*inputs)
+
+        out = self.student(*inputs)
+        # Cache the student's logits as-is (still attached to the autograd graph), so that the
+        # distillation loss computed in before_backward_pass() can back-propagate into the student.
+        # (Using out.new_tensor() here would create a detached copy and cut the gradient flow.)
+        self.last_students_logits = out
+
+        return out
+
+    # Since the "forward" function isn't a policy callback, we use the epoch callbacks to toggle the
+    # activation of distillation according to the schedule defined by the user
+    def on_epoch_begin(self, model, zeros_mask_dict, meta):
+        self.active = True
+
+    def on_epoch_end(self, model, zeros_mask_dict, meta):
+        self.active = False
+
+    def before_backward_pass(self, model, epoch, minibatch_id, minibatches_per_epoch, loss, zeros_mask_dict,
+                             optimizer=None):
+        # TODO: Consider adding 'labels' as an argument to this callback, so we can support teacher vs. labels loss
+        # (Otherwise we can't do it with a sub-class of ScheduledTrainingPolicy)
+
+        if not self.active:
+            return None
+
+        if self.last_teacher_logits is None or self.last_students_logits is None:
+            raise RuntimeError("KnowledgeDistillationPolicy: Student and/or teacher logits were not cached. "
+                               "Make sure to call KnowledgeDistillationPolicy.forward() in your script instead of "
+                               "calling the model directly.")
+
+        # Calculate distillation loss
+        soft_log_probs = F.log_softmax(self.last_students_logits / self.temperature, dim=1)
+        soft_targets = F.softmax(self.last_teacher_logits / self.temperature, dim=1)
+
+        # The averaging used in PyTorch's KL-divergence implementation is wrong, so we work around it as suggested in
+        # https://pytorch.org/docs/stable/nn.html#kldivloss
+        # (Also see https://github.com/pytorch/pytorch/issues/6622, https://github.com/pytorch/pytorch/issues/2259)
+        distillation_loss = F.kl_div(soft_log_probs, soft_targets.detach(), size_average=False) / soft_targets.shape[0]
+
+        # The loss passed to the callback is the student's loss vs. the true labels, so we can use it directly;
+        # there is no need to calculate it again
+
+        overall_loss = self.loss_wts.student * loss + self.loss_wts.distill * distillation_loss
+        return PolicyLoss(overall_loss,
+                          [LossComponent('Distill Loss', distillation_loss)])
diff --git a/docs-src/docs/imgs/knowledge_distillation.png b/docs-src/docs/imgs/knowledge_distillation.png
new file mode 100644
index 0000000000000000000000000000000000000000..eb844185c9c511fba92fa48b51f9d6260a0db8cc
Binary files /dev/null and b/docs-src/docs/imgs/knowledge_distillation.png differ
diff --git a/docs-src/docs/knowledge_distillation.md b/docs-src/docs/knowledge_distillation.md
new file mode 100644
index 0000000000000000000000000000000000000000..783801d00383788880adcda047432719ac92b341
--- /dev/null
+++ b/docs-src/docs/knowledge_distillation.md
@@ -0,0 +1,61 @@
+# Knowledge Distillation
+
+(For details on how to train a model with knowledge distillation in Distiller, see [here](schedule.md#knowledge-distillation))
+
+Knowledge distillation is a model compression method in which a small model is trained to mimic a pre-trained, larger model (or ensemble of models). This training setting is sometimes referred to as "teacher-student", where the large model is the teacher and the small model is the student (we'll be using these terms interchangeably).
+
+The method was first proposed by [Bucila et al., 2006](#bucila-et-al-2006) and generalized by [Hinton et al., 2015](#hinton-et-al-2015). The implementation in Distiller is based on the latter publication. Here we'll provide a summary of the method. For more information, the reader may refer to the paper (a [video lecture](https://www.youtube.com/watch?v=EK61htlw8hY) with [slides](http://www.ttic.edu/dl/dark14.pdf) is also available).
+
+In distillation, knowledge is transferred from the teacher model to the student by minimizing a loss function in which the target is the distribution of class probabilities predicted by the teacher model. That is - the output of a softmax function on the teacher model's logits. However, in many cases, this probability distribution has the correct class at a very high probability, with all other class probabilities very close to 0. As such, it doesn't provide much information beyond the ground truth labels already provided in the dataset. To tackle this issue, [Hinton et al., 2015](#hinton-et-al-2015) introduced the concept of "softmax temperature". The probability \(p_i\) of class \(i\) is calculated from the logits \(z\) as:
+
+\[p_i = \frac{\exp\left(\frac{z_i}{T}\right)}{\sum_{j} \exp\left(\frac{z_j}{T}\right)}\]
+
+where \(T\) is the temperature parameter. When \(T=1\) we get the standard softmax function. As \(T\) grows, the probability distribution generated by the softmax function becomes softer, providing more information as to which classes the teacher found more similar to the predicted class. Hinton calls this the "dark knowledge" embedded in the teacher model, and it is this dark knowledge that we are transferring to the student model in the distillation process. When computing the loss function vs. the teacher's soft targets, we use the same value of \(T\) to compute the softmax on the student's logits. We call this loss the "distillation loss".
+
+[Hinton et al., 2015](#hinton-et-al-2015) found that it is also beneficial to train the distilled model to produce the correct labels (based on the ground truth) in addition to the teacher's soft-labels. Hence, we also calculate the "standard" loss between the student's predicted class probabilities and the ground-truth labels (also called "hard labels/targets"). We dub this loss the "student loss". When calculating the class probabilities for the student loss we use \(T = 1\).
+
+The overall loss function, incorporating both distillation and student losses, is calculated as:
+
+\[\mathcal{L}(x;W) = \alpha * \mathcal{H}(y, \sigma(z_s; T=1)) + \beta * \mathcal{H}(\sigma(z_t; T=\tau), \sigma(z_s; T=\tau))\]
+
+where \(x\) is the input, \(W\) are the student model parameters, \(y\) is the ground truth label, \(\mathcal{H}\) is the cross-entropy loss function, \(\sigma\) is the softmax function parameterized by the temperature \(T\), and \(\alpha\) and \(\beta\) are coefficients. \(z_s\) and \(z_t\) are the logits of the student and teacher respectively.
+
+![Knowledge Distillation](imgs/knowledge_distillation.png)
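+To make the loss formula above concrete, below is a minimal PyTorch sketch of how the combined loss could be computed for a single mini-batch. The function name and the example values of \(T\), \(\alpha\) and \(\beta\) are purely illustrative; Distiller's actual implementation lives in `distiller/knowledge_distillation.py`.
+
+```
+import torch.nn.functional as F
+
+def overall_kd_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.3, beta=0.7):
+    # Student loss: standard cross-entropy vs. the hard labels, computed at T = 1
+    student_loss = F.cross_entropy(student_logits, labels)
+
+    # Distillation loss: KL divergence between the softened student and teacher distributions
+    soft_log_probs = F.log_softmax(student_logits / T, dim=1)
+    soft_targets = F.softmax(teacher_logits / T, dim=1)
+    distillation_loss = F.kl_div(soft_log_probs, soft_targets.detach(), size_average=False) / soft_targets.shape[0]
+
+    return alpha * student_loss + beta * distillation_loss
+```
+
+As in Distiller's implementation, the KL-divergence result is normalized by the batch size only, to work around the averaging behavior of PyTorch's `KLDivLoss` (see the comments in `distiller/knowledge_distillation.py`).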
+
+## New Hyper-Parameters
+
+In general \(\tau\), \(\alpha\) and \(\beta\) are hyper-parameters.
+
+In their experiments, [Hinton et al., 2015](#hinton-et-al-2015) use temperature values ranging from 1 to 20. They note that empirically, when the student model is very small compared to the teacher model, lower temperatures work better. This makes sense if we consider that as we raise the temperature, the resulting soft-labels distribution becomes richer in information, and a very small model might not be able to capture all of this information. However, there's no clear way to predict up front what kind of capacity for information the student model will have.
+
+With regard to \(\alpha\) and \(\beta\), [Hinton et al., 2015](#hinton-et-al-2015) use a weighted average between the distillation loss and the student loss. That is, \(\beta = 1 - \alpha\). They note that in general, they obtained the best results when setting \(\alpha\) to be much smaller than \(\beta\) (although in one of their experiments they use \(\alpha = \beta = 0.5\)). Other works which utilize knowledge distillation don't use a weighted average. Some set \(\alpha = 1\) while leaving \(\beta\) tunable, while others don't set any constraints.
+
+
+## <a name="combining"></a>Combining with Other Model Compression Techniques
+
+In the "basic" scenario, the smaller (student) model is a pre-defined architecture which simply has fewer parameters than the teacher model. For example, we could train ResNet-18 by distilling knowledge from ResNet-34. But a model with smaller capacity can also be obtained with other model compression techniques - sparsification and/or quantization. So, for example, we could train a 4-bit ResNet-18 model using some quantization-aware training method, and use a distillation loss function as described above. In that case, the teacher model can even be a FP32 ResNet-18 model. The same goes for pruning and regularization.
+
+[Tann et al., 2017](#tann-et-al-2017), [Mishra and Marr, 2018](#mishra-and-marr-2018) and [Polino et al., 2018](#polino-et-al-2018) are some works that combine knowledge distillation with **quantization**. [Theis et al., 2018](#theis-et-al-2018) and [Ashok et al., 2018](#ashok-et-al-2018) combine distillation with **pruning**.
+
+
+## References
+<div id="bucila-et-al-2006"></div>
+**Cristian Bucila, Rich Caruana, and Alexandru Niculescu-Mizil**. Model Compression. [KDD, 2006](https://www.cs.cornell.edu/~caruana/compression.kdd06.pdf)
+
+<div id="hinton-et-al-2015"></div>
+**Geoffrey Hinton, Oriol Vinyals and Jeff Dean**. Distilling the Knowledge in a Neural Network.
[arxiv:1503.02531](https://arxiv.org/abs/1503.02531) + +<div id="tann-et-al-2017"></div> +**Hokchhay Tann, Soheil Hashemi, Iris Bahar and Sherief Reda**. Hardware-Software Codesign of Accurate, Multiplier-free Deep Neural Networks. [DAC, 2017](https://arxiv.org/abs/1705.04288) + +<div id="mishra-and-marr-2018"></div> +**Asit Mishra and Debbie Marr**. Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy. [ICLR, 2018](https://openreview.net/forum?id=B1ae1lZRb) + +<div id="polino-et-al-2018"></div> +**Antonio Polino, Razvan Pascanu and Dan Alistarh**. Model compression via distillation and quantization. [ICLR, 2018](https://openreview.net/forum?id=S1XolQbRW) + +<div id="ashok-et-al-2018"></div> +**Anubhav Ashok, Nicholas Rhinehart, Fares Beainy and Kris M. Kitani**. N2N learning: Network to Network Compression via Policy Gradient Reinforcement Learning. [ICLR, 2018](https://openreview.net/forum?id=B1hcZZ-AW) + +<div id="theis-et-al-2018"></div> +**Lucas Theis, Iryna Korshunova, Alykhan Tejani and Ferenc Huszár**. Faster gaze prediction with dense networks and Fisher pruning. [arxiv:1801.05787](https://arxiv.org/abs/1801.05787) diff --git a/docs-src/docs/quantization.md b/docs-src/docs/quantization.md index 366d12cac5eb00059a13b04a364d291d6331e7c9..6db573c25ebe715a810049123e04bd8717f3a879 100644 --- a/docs-src/docs/quantization.md +++ b/docs-src/docs/quantization.md @@ -49,7 +49,7 @@ Another possible optimization point is **scale-factor scope**. The most common w Naively quantizing a FP32 model to INT4 and lower usually incurs significant accuracy degradation. Many works have tried to mitigate this effect. They usually employ one or more of the following concepts in order to improve model accuracy: - **Training / Re-Training**: For INT4 and lower, training is required in order to obtain reasonable accuracy. The training loop is modified to take quantization into account. See details in the [next section](#training-with-quantization). -[Zhou S et al., 2016](#zhou-et-al-2016) have shown that bootstrapping the quantized model with trained FP32 weights leads to higher accuracy, as opposed to training from scratch. Other methods *require* a trained FP32 model, either as a starting point ([Zhou A et al., 2017](#zhou-et-al-2017)), or as a teacher network in a knowledge distillation training setup ([Mishra and Marr, 2018](#mishra-and-marr-2018)). +[Zhou S et al., 2016](#zhou-et-al-2016) have shown that bootstrapping the quantized model with trained FP32 weights leads to higher accuracy, as opposed to training from scratch. Other methods *require* a trained FP32 model, either as a starting point ([Zhou A et al., 2017](#zhou-et-al-2017)), or as a teacher network in a knowledge distillation training setup (see [here](knowledge_distillation.md#combining)). - **Replacing the activation function**: The most common activation function in vision models is ReLU, which is unbounded. That is - its dynamic range is not limited for positive inputs. This is very problematic for INT4 and below due to the very limited range and resolution. Therefore, most methods replace ReLU with another function which is bounded. In some cases a clipping function with hard coded values is used ([Zhou S et al., 2016](#zhou-et-al-2016), [Mishra et al., 2018](#mishra-et-al-2018)). Another method learns the clipping value per layer, with better results ([Choi et al., 2018](#choi-et-al-2018)). 
Once the clipping value is set, the scale factor used for quantization is also set, and no further calibration steps are required (as opposed to INT8 methods described above). - **Modifying network structure**: [Mishra et al., 2018](#mishra-et-al-2018) try to compensate for the loss of information due to quantization by using wider layers (more channels). [Lin et al., 2017](#lin-et-al-2017) proposed a binary quantization method in which a single FP32 convolution is replaced with multiple binary convolutions, each scaled to represent a different "base", covering a larger dynamic range overall. - **First and last layer**: Many methods do not quantize the first and last layer of the model. It has been observed by [Han et al., 2015](#han-et-al-2015) that the first convolutional layer is more sensitive to weights pruning, and some quantization works cite the same reason and show it empirically ([Zhou S et al., 2016](#zhou-et-al-2016), [Choi et al., 2018](#choi-et-al-2018)). Some works also note that these layers usually constitute a very small portion of the overall computation within the model, further reducing the motivation to quantize them ([Rastegari et al., 2016](#rastegari-et-al-2016)). Most methods keep the first and last layers at FP32. However, [Choi et al., 2018](#choi-et-al-2018) showed that "conservative" quantization of these layers, e.g. to INT8, does not reduce accuracy. @@ -90,9 +90,6 @@ An important question in this context is how to back-propagate through the quant <div id="zhou-et-al-2017"></div> **Aojun Zhou, Anbang Yao, Yiwen Guo, Lin Xu and Yurong Chen**. Incremental Network Quantization: Towards Lossless CNNs with Low-precision Weights. [ICLR, 2017](https://arxiv.org/abs/1702.03044) -<div id="mishra-and-marr-2018"></div> -**Asit Mishra and Debbie Marr**. Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy. [ICLR, 2018](https://openreview.net/forum?id=B1ae1lZRb) - <div id="mishra-et-al-2018"></div> **Asit Mishra, Eriko Nurvitadhi, Jeffrey J Cook and Debbie Marr**. WRPN: Wide Reduced-Precision Networks. [ICLR, 2018](https://openreview.net/forum?id=B1ZvaaeAZ) diff --git a/docs-src/docs/schedule.md b/docs-src/docs/schedule.md index 54d5704382b270e4f5cb4f3531c9909c44e09e50..f82e136c2a131a2aab4237c0ca95326d3b53a7c8 100755 --- a/docs-src/docs/schedule.md +++ b/docs-src/docs/schedule.md @@ -10,7 +10,7 @@ Let's briefly discuss the main mechanisms and abstractions: A schedule specifica These define the **what** part of the schedule. The Policies define the **when** part of the schedule: at which epoch to start applying the Pruner/Regularizer/Quantizer/LR-decay, the epoch to end, and how often to invoke the policy (frequency of application). A policy also defines the instance of Pruner/Regularizer/Quantizer/LR-decay it is managing. -The CompressionScheduler is configured from a YAML file or from a dictionary, but you can also manually create Policies, Pruners, Regularizers and Quantizers from code. +The `CompressionScheduler` is configured from a YAML file or from a dictionary, but you can also manually create Policies, Pruners, Regularizers and Quantizers from code. ## Syntax through example We'll use ```alexnet.schedule_agp.yaml``` to explain some of the YAML syntax for configuring Sensitivity Pruning of Alexnet. 
@@ -311,3 +311,74 @@ policies: ``` **Important Note**: As mentioned [here](design.md#training-with-quantization), since the quantizer modifies the model's parameters (assuming training with quantization in the loop is used), the call to `prepare_model()` must be performed before an optimizer is called. Therefore, currently, the starting epoch for a quantization policy must be 0, otherwise the quantization process will not work as expected. If one wishes to do a "warm-startup" (or "boot-strapping"), training for a few epochs with full precision and only then starting to quantize, the only way to do this right now is to execute a separate run to generate the boot-strapped weights, and execute a second which will resume the checkpoint with the boot-strapped weights. + +## Knowledge Distillation + +Knowledge distillation (see [here](knowledge_distillation.md)) is also implemented as a `Policy`, which should be added to the scheduler. However, with the current implementation, it cannot be defined within the YAML file like the rest of the policies described above. + +To make the integration of this method into applications a bit easier, a helper function can be used that will add a set of command-line arguments related to knowledge distillation: + +``` +import argparse +import distiller + +parser = argparse.ArgumentParser() +distiller.knowledge_distillation.add_distillation_args(parser) +``` + +(The `add_distillation_args` function accepts some optional arguments, see its implementation at `distiller/knowledge_distillation.py` for details) + +These are the command line arguments exposed by this function: + +``` +Knowledge Distillation Training Arguments: + --kd-teacher ARCH Model architecture for teacher model + --kd-pretrained Use pre-trained model for teacher + --kd-resume PATH Path to checkpoint from which to load teacher weights + --kd-temperature TEMP, --kd-temp TEMP + Knowledge distillation softmax temperature + --kd-distill-wt WEIGHT, --kd-dw WEIGHT + Weight for distillation loss (student vs. teacher soft + targets) + --kd-student-wt WEIGHT, --kd-sw WEIGHT + Weight for student vs. labels loss + --kd-teacher-wt WEIGHT, --kd-tw WEIGHT + Weight for teacher vs. labels loss + --kd-start-epoch EPOCH_NUM + Epoch from which to enable distillation + +``` + +Once arguments have been parsed, some initialization code is required, similar to the following: + +``` +# Assuming: +# "args" variable holds command line arguments +# "model" variable holds the model we're going to train, that is - the student model +# "compression_scheduler" variable holds a CompressionScheduler instance + +args.kd_policy = None +if args.kd_teacher: + # Create teacher model - replace this with your model creation code + teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus) + if args.kd_resume: + teacher, _, _ = apputils.load_checkpoint(teacher, chkpt_file=args.kd_resume) + + # Create policy and add to scheduler + dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt) + args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw) + compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch, ending_epoch=args.epochs, + frequency=1) +``` + +Finally, during the training loop, we need to perform forward propagation through the teacher model as well. 
The `KnowledgeDistillationPolicy` class keeps a reference to both the student and teacher models, and exposes a `forward` function that performs forward propagation on both of them. Since this is not one of the standard policy callbacks, we need to call this function manually from our training loop, as follows: + +``` +if args.kd_policy is None: + # Revert to a "normal" forward-prop call if no knowledge distillation policy is present + output = model(input_var) +else: + output = args.kd_policy.forward(input_var) +``` + +To see this integration in action, take a look at the image classification sample at `examples/classifier_compression/compress_classifier.py`. diff --git a/docs-src/mkdocs.yml b/docs-src/mkdocs.yml index a8a27a9cb9ce6f58b3a4cb0e84e687bd8917906b..13c369994ebbe3b7bb38a84e798d442584d56c88 100755 --- a/docs-src/mkdocs.yml +++ b/docs-src/mkdocs.yml @@ -21,6 +21,7 @@ pages: - 'Pruning': 'pruning.md' - 'Regularization': 'regularization.md' - 'Quantization': 'quantization.md' + - 'Knowledge Distillation': 'knowledge_distillation.md' - Algorithms: - Pruning: algo_pruning.md - Quantization: algo_quantization.md diff --git a/docs/404.html b/docs/404.html index 9ef461b5d22477d095deaf968a8c4b344af273b5..7ad708f6021fa413f2c7a43c515bbe9d0671e86e 100644 --- a/docs/404.html +++ b/docs/404.html @@ -77,6 +77,10 @@ <a class="" href="/quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="/knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/docs/algo_pruning/index.html b/docs/algo_pruning/index.html index 4560e4db90bf06598d82b3c98dc7597370a00c23..acfcc9bf74a132ff92246d34f0150c55a3021331 100644 --- a/docs/algo_pruning/index.html +++ b/docs/algo_pruning/index.html @@ -84,6 +84,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> @@ -296,7 +300,7 @@ abundant and gradually reduce the number of weights being pruned each time as th <a href="../algo_quantization/index.html" class="btn btn-neutral float-right" title="Quantization">Next <span class="icon icon-circle-arrow-right"></span></a> - <a href="../quantization/index.html" class="btn btn-neutral" title="Quantization"><span class="icon icon-circle-arrow-left"></span> Previous</a> + <a href="../knowledge_distillation/index.html" class="btn btn-neutral" title="Knowledge Distillation"><span class="icon icon-circle-arrow-left"></span> Previous</a> </div> @@ -322,7 +326,7 @@ abundant and gradually reduce the number of weights being pruned each time as th <span class="rst-current-version" data-toggle="rst-current-version"> - <span><a href="../quantization/index.html" style="color: #fcfcfc;">« Previous</a></span> + <span><a href="../knowledge_distillation/index.html" style="color: #fcfcfc;">« Previous</a></span> <span style="margin-left: 15px"><a href="../algo_quantization/index.html" style="color: #fcfcfc">Next »</a></span> diff --git a/docs/algo_quantization/index.html b/docs/algo_quantization/index.html index 2225104c6fc0dc4706e2513bf8edddd58c440de3..5b13861658bb99da2d0ac73123bb429097d22a15 100644 --- a/docs/algo_quantization/index.html +++ b/docs/algo_quantization/index.html @@ -84,6 +84,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/docs/design/index.html 
b/docs/design/index.html index 47497971ffcd83660ef52b1d5d23625f20823539..ee6e7135fceae4fb3ff4156b1a0b9330d1ee0fce 100644 --- a/docs/design/index.html +++ b/docs/design/index.html @@ -84,6 +84,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/docs/imgs/decision_boundary.png b/docs/imgs/decision_boundary.png new file mode 100644 index 0000000000000000000000000000000000000000..a22c4c42c20cd31df791354bbc012655359d74d9 Binary files /dev/null and b/docs/imgs/decision_boundary.png differ diff --git a/docs/imgs/knowledge_distillation.png b/docs/imgs/knowledge_distillation.png new file mode 100644 index 0000000000000000000000000000000000000000..eb844185c9c511fba92fa48b51f9d6260a0db8cc Binary files /dev/null and b/docs/imgs/knowledge_distillation.png differ diff --git a/docs/index.html b/docs/index.html index 3fdb6e30888ec15897c32d55d6a14158dfe89b15..c97634d7f2d3771371e0a6bd29c8a6d2a80332ca 100644 --- a/docs/index.html +++ b/docs/index.html @@ -98,6 +98,10 @@ <a class="" href="quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> @@ -246,5 +250,5 @@ And of course, if we used a sparse or compressed representation, then we are red <!-- MkDocs version : 0.17.2 -Build Date UTC : 2018-07-22 11:48:56 +Build Date UTC : 2018-09-03 21:12:52 --> diff --git a/docs/install/index.html b/docs/install/index.html index 53328acfa61b00b34a781ce0181726c359ce4f39..494d64262f201c34487f91e28ed747a1d51fb2e1 100644 --- a/docs/install/index.html +++ b/docs/install/index.html @@ -100,6 +100,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/docs/jupyter/index.html b/docs/jupyter/index.html index 76374b08b89f04e95a080fbaa380c0cb29c8332a..56da7227e4704d202fbe3f28afb151c47eec7bef 100644 --- a/docs/jupyter/index.html +++ b/docs/jupyter/index.html @@ -84,6 +84,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/docs/knowledge_distillation/index.html b/docs/knowledge_distillation/index.html new file mode 100644 index 0000000000000000000000000000000000000000..b140d8cdc4ff0a64430c9adcd7fed0d4eaf81231 --- /dev/null +++ b/docs/knowledge_distillation/index.html @@ -0,0 +1,268 @@ +<!DOCTYPE html> +<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]--> +<head> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + + + <link rel="shortcut icon" href="../img/favicon.ico"> + <title>Knowledge Distillation - Neural Network Distiller</title> + <link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'> + + <link rel="stylesheet" href="../css/theme.css" type="text/css" /> + <link rel="stylesheet" href="../css/theme_extra.css" type="text/css" /> + <link rel="stylesheet" href="../css/highlight.css"> + <link href="../extra.css" rel="stylesheet"> + + <script> + // Current page data + var mkdocs_page_name = 
"Knowledge Distillation"; + var mkdocs_page_input_path = "knowledge_distillation.md"; + var mkdocs_page_url = "/knowledge_distillation/index.html"; + </script> + + <script src="../js/jquery-2.1.1.min.js"></script> + <script src="../js/modernizr-2.8.3.min.js"></script> + <script type="text/javascript" src="../js/highlight.pack.js"></script> + +</head> + +<body class="wy-body-for-nav" role="document"> + + <div class="wy-grid-for-nav"> + + + <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav"> + <div class="wy-side-nav-search"> + <a href="../index.html" class="icon icon-home"> Neural Network Distiller</a> + <div role="search"> + <form id ="rtd-search-form" class="wy-form" action="../search.html" method="get"> + <input type="text" name="q" placeholder="Search docs" /> + </form> +</div> + </div> + + <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation"> + <ul class="current"> + + + <li class="toctree-l1"> + + <a class="" href="../index.html">Home</a> + </li> + + <li class="toctree-l1"> + + <a class="" href="../install/index.html">Installation</a> + </li> + + <li class="toctree-l1"> + + <a class="" href="../usage/index.html">Usage</a> + </li> + + <li class="toctree-l1"> + + <a class="" href="../schedule/index.html">Compression scheduling</a> + </li> + + <li class="toctree-l1"> + + <span class="caption-text">Compressing models</span> + <ul class="subnav"> + <li class=""> + + <a class="" href="../pruning/index.html">Pruning</a> + </li> + <li class=""> + + <a class="" href="../regularization/index.html">Regularization</a> + </li> + <li class=""> + + <a class="" href="../quantization/index.html">Quantization</a> + </li> + <li class=" current"> + + <a class="current" href="index.html">Knowledge Distillation</a> + <ul class="subnav"> + + <li class="toctree-l3"><a href="#knowledge-distillation">Knowledge Distillation</a></li> + + <ul> + + <li><a class="toctree-l4" href="#new-hyper-parameters">New Hyper-Parameters</a></li> + + <li><a class="toctree-l4" href="#references">References</a></li> + + </ul> + + + </ul> + </li> + </ul> + </li> + + <li class="toctree-l1"> + + <span class="caption-text">Algorithms</span> + <ul class="subnav"> + <li class=""> + + <a class="" href="../algo_pruning/index.html">Pruning</a> + </li> + <li class=""> + + <a class="" href="../algo_quantization/index.html">Quantization</a> + </li> + </ul> + </li> + + <li class="toctree-l1"> + + <a class="" href="../model_zoo/index.html">Model Zoo</a> + </li> + + <li class="toctree-l1"> + + <a class="" href="../jupyter/index.html">Jupyter notebooks</a> + </li> + + <li class="toctree-l1"> + + <a class="" href="../design/index.html">Design</a> + </li> + + </ul> + </div> + + </nav> + + <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"> + + + <nav class="wy-nav-top" role="navigation" aria-label="top navigation"> + <i data-toggle="wy-nav-top" class="fa fa-bars"></i> + <a href="../index.html">Neural Network Distiller</a> + </nav> + + + <div class="wy-nav-content"> + <div class="rst-content"> + <div role="navigation" aria-label="breadcrumbs navigation"> + <ul class="wy-breadcrumbs"> + <li><a href="../index.html">Docs</a> »</li> + + + + <li>Compressing models »</li> + + + + <li>Knowledge Distillation</li> + <li class="wy-breadcrumbs-aside"> + + </li> + </ul> + <hr/> +</div> + <div role="main"> + <div class="section"> + + <h1 id="knowledge-distillation">Knowledge Distillation</h1> +<p>(For details on how to train a model with knowledge distillation in Distiller, see <a 
href="../schedule/index.html#knowledge-distillation">here</a>)</p> +<p>Knowledge distillation is model compression method in which a small model is trained to mimic a pre-trained, larger model (or ensemble of models). This training setting is sometimes referred to as "teacher-student", where the large model is the teacher and the small model is the student (we'll be using these terms interchangeably).</p> +<p>The method was first proposed by <a href="#bucila-et-al-2006">Bucila et al., 2006</a> and generalized by <a href="#hinton-et-al-2015">Hinton et al., 2015</a>. The implementation in Distiller is based on the latter publication. Here we'll provide a summary of the method. For more information the reader may refer to the paper (a <a href="https://www.youtube.com/watch?v=EK61htlw8hY">video lecture</a> with <a href="http://www.ttic.edu/dl/dark14.pdf">slides</a> is also available).</p> +<p>In distillation, knowledge is transferred from the teacher model to the student by minimizing a loss function in which the target is the distribution of class probabilities predicted by the teacher model. That is - the output of a softmax function on the teacher model's logits. However, in many cases, this probability distribution has the correct class at a very high probability, with all other class probabilities very close to 0. As such, it doesn't provide much information beyond the ground truth labels already provided in the dataset. To tackle this issue, <a href="#hinton-et-al-2015">Hinton et al., 2015</a> introduced the concept of "softmax temperature". The probability <script type="math/tex">p_i</script> of class <script type="math/tex">i</script> is calculated from the logits <script type="math/tex">z</script> as:</p> +<p> +<script type="math/tex; mode=display">p_i = \frac{exp\left(\frac{z_i}{T}\right)}{\sum_{j} \exp\left(\frac{z_j}{T}\right)}</script> +</p> +<p>where <script type="math/tex">T</script> is the temperature parameter. When <script type="math/tex">T=1</script> we get the standard softmax function. As <script type="math/tex">T</script> grows, the probability distribution generated by the softmax function becomes softer, providing more information as to which classes the teacher found more similar to the predicted class. Hinton calls this the "dark knowledge" embedded in the teacher model, and it is this dark knowledge that we are transferring to the student model in the distillation process. When computing the loss function vs. the teacher's soft targets, we use the same value of <script type="math/tex">T</script> to compute the softmax on the student's logits. We call this loss the "distillation loss".</p> +<p><a href="#hinton-et-al-2015">Hinton et al., 2015</a> found that it is also beneficial to train the distilled model to produce the correct labels (based on the ground truth) in addition to the teacher's soft-labels. Hence, we also calculate the "standard" loss between the student's predicted class probabilities and the ground-truth labels (also called "hard labels/targets"). We dub this loss the "student loss". When calculating the class probabilities for the student loss we use <script type="math/tex">T = 1</script>. 
</p> +<p>The overall loss function, incorporating both distillation and student losses, is calculated as:</p> +<p> +<script type="math/tex; mode=display">\mathcal{L}(x;W) = \alpha * \mathcal{H}(y, \sigma(z_s; T=1)) + \beta * \mathcal{H}(\sigma(z_t; T=\tau), \sigma(z_s, T=\tau))</script> +</p> +<p>where <script type="math/tex">x</script> is the input, <script type="math/tex">W</script> are the student model parameters, <script type="math/tex">y</script> is the ground truth label, <script type="math/tex">\mathcal{H}</script> is the cross-entropy loss function, <script type="math/tex">\sigma</script> is the softmax function parameterized by the temperature <script type="math/tex">T</script>, and <script type="math/tex">\alpha</script> and <script type="math/tex">\beta</script> are coefficients. <script type="math/tex">z_s</script> and <script type="math/tex">z_t</script> are the logits of the student and teacher respectively.</p> +<p><img alt="Knowledge Distillation" src="../imgs/knowledge_distillation.png" /></p> +<h2 id="new-hyper-parameters">New Hyper-Parameters</h2> +<p>In general <script type="math/tex">\tau</script>, <script type="math/tex">\alpha</script> and <script type="math/tex">\beta</script> are hyper parameters.</p> +<p>In their experiments, <a href="#hinton-et-al-2015">Hinton et al., 2015</a> use temperature values ranging from 1 to 20. They note that empirically, when the student model is very small compared to the teacher model, lower temperatures work better. This makes sense if we consider that as we raise the temperature, the resulting soft-labels distribution becomes richer in information, and a very small model might not be able to capture all of this information. However, there's no clear way to predict up front what kind of capacity for information the student model will have.</p> +<p>With regards to <script type="math/tex">\alpha</script> and <script type="math/tex">\beta</script>, <a href="#hinton-et-al-2015">Hinton et al., 2015</a> use a weighted average between the distillation loss and the student loss. That is, <script type="math/tex">\beta = 1 - \alpha</script>. They note that in general, they obtained the best results when setting <script type="math/tex">\alpha</script> to be much smaller than <script type="math/tex">\beta</script> (although in one of their experiments they use <script type="math/tex">\alpha = \beta = 0.5</script>). Other works which utilize knowledge distillation don't use a weighted average. Some set <script type="math/tex">\alpha = 1</script> while leaving <script type="math/tex">\beta</script> tunable, while others don't set any constraints.</p> +<h2 id="combining-with-other-model-compression-techniques"><a name="combining"></a>Combining with Other Model Compression Techniques</h2> +<p>In the "basic" scenario, the smaller (student) model is a pre-defined architecture which just has a smaller number of parameters compared to the teacher model. For example, we could train ResNet-18 by distilling knowledge from ResNet-34. But, a model with smaller capacity can also be obtained by other model compression techniques - sparsification and/or quantization. So, for example, we could train a 4-bit ResNet-18 model with some method using quantization-aware training, and use a distillation loss function as described above. In that case, the teacher model can even be a FP32 ResNet-18 model. 
Same goes for pruning and regularization.</p> +<p><a href="#tann-et-al-2017">Tann et al., 2017</a>, <a href="#mishra-and-marr-2018">Mishra and Marr, 2018</a> and <a href="#polino-et-al-2018">Polino et al., 2018</a> are some works that combine knowledge distillation with <strong>quantization</strong>. <a href="#theis-et-al-2018">Theis et al., 2018</a> and <a href="#ashok-et-al-2018">Ashok et al., 2018</a> combine distillation with <strong>pruning</strong>.</p> +<h2 id="references">References</h2> +<p><div id="bucila-et-al-2006"></div> +<strong>Cristian Bucila, Rich Caruana, and Alexandru Niculescu-Mizil</strong>. Model Compression. <a href="https://www.cs.cornell.edu/~caruana/compression.kdd06.pdf">KDD, 2006</a></p> +<div id="hinton-et-al-2015"></div> + +<p><strong>Geoffrey Hinton, Oriol Vinyals and Jeff Dean</strong>. Distilling the Knowledge in a Neural Network. <a href="https://arxiv.org/abs/1503.02531">arxiv:1503.02531</a></p> +<div id="tann-et-al-2017"></div> + +<p><strong>Hokchhay Tann, Soheil Hashemi, Iris Bahar and Sherief Reda</strong>. Hardware-Software Codesign of Accurate, Multiplier-free Deep Neural Networks. <a href="https://arxiv.org/abs/1705.04288">DAC, 2017</a></p> +<div id="mishra-and-marr-2018"></div> + +<p><strong>Asit Mishra and Debbie Marr</strong>. Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy. <a href="https://openreview.net/forum?id=B1ae1lZRb">ICLR, 2018</a></p> +<div id="polino-et-al-2018"></div> + +<p><strong>Antonio Polino, Razvan Pascanu and Dan Alistarh</strong>. Model compression via distillation and quantization. <a href="https://openreview.net/forum?id=S1XolQbRW">ICLR, 2018</a></p> +<div id="ashok-et-al-2018"></div> + +<p><strong>Anubhav Ashok, Nicholas Rhinehart, Fares Beainy and Kris M. Kitani</strong>. N2N learning: Network to Network Compression via Policy Gradient Reinforcement Learning. <a href="https://openreview.net/forum?id=B1hcZZ-AW">ICLR, 2018</a></p> +<div id="theis-et-al-2018"></div> + +<p><strong>Lucas Theis, Iryna Korshunova, Alykhan Tejani and Ferenc Huszár</strong>. Faster gaze prediction with dense networks and Fisher pruning. <a href="https://arxiv.org/abs/1801.05787">arxiv:1801.05787</a></p> + + </div> + </div> + <footer> + + <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation"> + + <a href="../algo_pruning/index.html" class="btn btn-neutral float-right" title="Pruning">Next <span class="icon icon-circle-arrow-right"></span></a> + + + <a href="../quantization/index.html" class="btn btn-neutral" title="Quantization"><span class="icon icon-circle-arrow-left"></span> Previous</a> + + </div> + + + <hr/> + + <div role="contentinfo"> + <!-- Copyright etc --> + + </div> + + Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 
+</footer> + + </div> + </div> + + </section> + + </div> + + <div class="rst-versions" role="note" style="cursor: pointer"> + <span class="rst-current-version" data-toggle="rst-current-version"> + + + <span><a href="../quantization/index.html" style="color: #fcfcfc;">« Previous</a></span> + + + <span style="margin-left: 15px"><a href="../algo_pruning/index.html" style="color: #fcfcfc">Next »</a></span> + + </span> +</div> + <script>var base_url = '..';</script> + <script src="../js/theme.js"></script> + <script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script> + <script src="../search/require.js"></script> + <script src="../search/search.js"></script> + +</body> +</html> diff --git a/docs/model_zoo/index.html b/docs/model_zoo/index.html index 209447902298d8bbeaf24637470ad8da7bc83f6b..a2f518b50201c850db068c5baa8eb871e8537fa1 100644 --- a/docs/model_zoo/index.html +++ b/docs/model_zoo/index.html @@ -84,6 +84,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/docs/pruning/index.html b/docs/pruning/index.html index 78f014302082a154ca8daaf4de80458825403f8a..00052fb0db3645c64ed779cbfd6a038b91477773 100644 --- a/docs/pruning/index.html +++ b/docs/pruning/index.html @@ -106,6 +106,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/docs/quantization/index.html b/docs/quantization/index.html index 472633992779a549e1db2400767d8d8acc77167c..99b66221b3c1392fbb5c8bbe5553914c5ee1f157 100644 --- a/docs/quantization/index.html +++ b/docs/quantization/index.html @@ -106,6 +106,10 @@ </ul> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> @@ -227,7 +231,7 @@ As mentioned above, a scale factor is used to adapt the dynamic range of the ten <p>Naively quantizing a FP32 model to INT4 and lower usually incurs significant accuracy degradation. Many works have tried to mitigate this effect. They usually employ one or more of the following concepts in order to improve model accuracy:</p> <ul> <li><strong>Training / Re-Training</strong>: For INT4 and lower, training is required in order to obtain reasonable accuracy. The training loop is modified to take quantization into account. See details in the <a href="#training-with-quantization">next section</a>.<br /> -<a href="#zhou-et-al-2016">Zhou S et al., 2016</a> have shown that bootstrapping the quantized model with trained FP32 weights leads to higher accuracy, as opposed to training from scratch. Other methods <em>require</em> a trained FP32 model, either as a starting point (<a href="#zhou-et-al-2017">Zhou A et al., 2017</a>), or as a teacher network in a knowledge distillation training setup (<a href="#mishra-and-marr-2018">Mishra and Marr, 2018</a>).</li> +<a href="#zhou-et-al-2016">Zhou S et al., 2016</a> have shown that bootstrapping the quantized model with trained FP32 weights leads to higher accuracy, as opposed to training from scratch. 
Other methods <em>require</em> a trained FP32 model, either as a starting point (<a href="#zhou-et-al-2017">Zhou A et al., 2017</a>), or as a teacher network in a knowledge distillation training setup (see <a href="../knowledge_distillation/index.html#combining">here</a>).</li> <li><strong>Replacing the activation function</strong>: The most common activation function in vision models is ReLU, which is unbounded. That is - its dynamic range is not limited for positive inputs. This is very problematic for INT4 and below due to the very limited range and resolution. Therefore, most methods replace ReLU with another function which is bounded. In some cases a clipping function with hard coded values is used (<a href="#zhou-et-al-2016">Zhou S et al., 2016</a>, <a href="#mishra-et-al-2018">Mishra et al., 2018</a>). Another method learns the clipping value per layer, with better results (<a href="#choi-et-al-2018">Choi et al., 2018</a>). Once the clipping value is set, the scale factor used for quantization is also set, and no further calibration steps are required (as opposed to INT8 methods described above).</li> <li><strong>Modifying network structure</strong>: <a href="#mishra-et-al-2018">Mishra et al., 2018</a> try to compensate for the loss of information due to quantization by using wider layers (more channels). <a href="#lin-et-al-2017">Lin et al., 2017</a> proposed a binary quantization method in which a single FP32 convolution is replaced with multiple binary convolutions, each scaled to represent a different "base", covering a larger dynamic range overall.</li> <li><strong>First and last layer</strong>: Many methods do not quantize the first and last layer of the model. It has been observed by <a href="#han-et-al-2015">Han et al., 2015</a> that the first convolutional layer is more sensitive to weights pruning, and some quantization works cite the same reason and show it empirically (<a href="#zhou-et-al-2016">Zhou S et al., 2016</a>, <a href="#choi-et-al-2018">Choi et al., 2018</a>). Some works also note that these layers usually constitute a very small portion of the overall computation within the model, further reducing the motivation to quantize them (<a href="#rastegari-et-al-2016">Rastegari et al., 2016</a>). Most methods keep the first and last layers at FP32. However, <a href="#choi-et-al-2018">Choi et al., 2018</a> showed that "conservative" quantization of these layers, e.g. to INT8, does not reduce accuracy.</li> @@ -262,9 +266,6 @@ In the diagram we show "layer N" as the conv + batch-norm + activation combinati <div id="zhou-et-al-2017"></div> <p><strong>Aojun Zhou, Anbang Yao, Yiwen Guo, Lin Xu and Yurong Chen</strong>. Incremental Network Quantization: Towards Lossless CNNs with Low-precision Weights. <a href="https://arxiv.org/abs/1702.03044">ICLR, 2017</a></p> -<div id="mishra-and-marr-2018"></div> - -<p><strong>Asit Mishra and Debbie Marr</strong>. Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy. <a href="https://openreview.net/forum?id=B1ae1lZRb">ICLR, 2018</a></p> <div id="mishra-et-al-2018"></div> <p><strong>Asit Mishra, Eriko Nurvitadhi, Jeffrey J Cook and Debbie Marr</strong>. WRPN: Wide Reduced-Precision Networks. 
<a href="https://openreview.net/forum?id=B1ZvaaeAZ">ICLR, 2018</a></p> @@ -296,7 +297,7 @@ In the diagram we show "layer N" as the conv + batch-norm + activation combinati <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation"> - <a href="../algo_pruning/index.html" class="btn btn-neutral float-right" title="Pruning">Next <span class="icon icon-circle-arrow-right"></span></a> + <a href="../knowledge_distillation/index.html" class="btn btn-neutral float-right" title="Knowledge Distillation">Next <span class="icon icon-circle-arrow-right"></span></a> <a href="../regularization/index.html" class="btn btn-neutral" title="Regularization"><span class="icon icon-circle-arrow-left"></span> Previous</a> @@ -328,7 +329,7 @@ In the diagram we show "layer N" as the conv + batch-norm + activation combinati <span><a href="../regularization/index.html" style="color: #fcfcfc;">« Previous</a></span> - <span style="margin-left: 15px"><a href="../algo_pruning/index.html" style="color: #fcfcfc">Next »</a></span> + <span style="margin-left: 15px"><a href="../knowledge_distillation/index.html" style="color: #fcfcfc">Next »</a></span> </span> </div> diff --git a/docs/regularization/index.html b/docs/regularization/index.html index c170f54c9055de9a0680a2cab5d97a4d53fb264a..e6bdf1b9a85987f9553e1e69c1febfcb9bd1f342 100644 --- a/docs/regularization/index.html +++ b/docs/regularization/index.html @@ -100,6 +100,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/docs/schedule/index.html b/docs/schedule/index.html index 4f15baab41b3fcb39c9237f91d66c0eee19c9184..8634d178c72cde3cf3a4de07d07f4dac704bb32d 100644 --- a/docs/schedule/index.html +++ b/docs/schedule/index.html @@ -82,6 +82,8 @@ <li><a class="toctree-l3" href="#quantization">Quantization</a></li> + <li><a class="toctree-l3" href="#knowledge-distillation">Knowledge Distillation</a></li> + </ul> @@ -104,6 +106,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> @@ -179,7 +185,7 @@ </ul> <p>These define the <strong>what</strong> part of the schedule. </p> <p>The Policies define the <strong>when</strong> part of the schedule: at which epoch to start applying the Pruner/Regularizer/Quantizer/LR-decay, the epoch to end, and how often to invoke the policy (frequency of application). 
A policy also defines the instance of Pruner/Regularizer/Quantizer/LR-decay it is managing.<br /> -The CompressionScheduler is configured from a YAML file or from a dictionary, but you can also manually create Policies, Pruners, Regularizers and Quantizers from code.</p> +The <code>CompressionScheduler</code> is configured from a YAML file or from a dictionary, but you can also manually create Policies, Pruners, Regularizers and Quantizers from code.</p> <h2 id="syntax-through-example">Syntax through example</h2> <p>We'll use <code>alexnet.schedule_agp.yaml</code> to explain some of the YAML syntax for configuring Sensitivity Pruning of Alexnet.</p> <pre><code>version: 1 @@ -465,6 +471,65 @@ Let's see an example:</p> </code></pre> <p><strong>Important Note</strong>: As mentioned <a href="../design/index.html#training-with-quantization">here</a>, since the quantizer modifies the model's parameters (assuming training with quantization in the loop is used), the call to <code>prepare_model()</code> must be performed before an optimizer is called. Therefore, currently, the starting epoch for a quantization policy must be 0, otherwise the quantization process will not work as expected. If one wishes to do a "warm-startup" (or "boot-strapping"), training for a few epochs with full precision and only then starting to quantize, the only way to do this right now is to execute a separate run to generate the boot-strapped weights, and execute a second which will resume the checkpoint with the boot-strapped weights.</p> +<h2 id="knowledge-distillation">Knowledge Distillation</h2> +<p>Knowledge distillation (see <a href="../knowledge_distillation/index.html">here</a>) is also implemented as a <code>Policy</code>, which should be added to the scheduler. However, with the current implementation, it cannot be defined within the YAML file like the rest of the policies described above.</p> +<p>To make the integration of this method into applications a bit easier, a helper function can be used that will add a set of command-line arguments related to knowledge distillation:</p> +<pre><code>import argparse +import distiller + +parser = argparse.ArgumentParser() +distiller.knowledge_distillation.add_distillation_args(parser) +</code></pre> + +<p>(The <code>add_distillation_args</code> function accepts some optional arguments, see its implementation at <code>distiller/knowledge_distillation.py</code> for details)</p> +<p>These are the command line arguments exposed by this function:</p> +<pre><code>Knowledge Distillation Training Arguments: + --kd-teacher ARCH Model architecture for teacher model + --kd-pretrained Use pre-trained model for teacher + --kd-resume PATH Path to checkpoint from which to load teacher weights + --kd-temperature TEMP, --kd-temp TEMP + Knowledge distillation softmax temperature + --kd-distill-wt WEIGHT, --kd-dw WEIGHT + Weight for distillation loss (student vs. teacher soft + targets) + --kd-student-wt WEIGHT, --kd-sw WEIGHT + Weight for student vs. labels loss + --kd-teacher-wt WEIGHT, --kd-tw WEIGHT + Weight for teacher vs. 
labels loss + --kd-start-epoch EPOCH_NUM + Epoch from which to enable distillation + +</code></pre> + +<p>Once arguments have been parsed, some initialization code is required, similar to the following:</p> +<pre><code># Assuming: +# "args" variable holds command line arguments +# "model" variable holds the model we're going to train, that is - the student model +# "compression_scheduler" variable holds a CompressionScheduler instance + +args.kd_policy = None +if args.kd_teacher: + # Create teacher model - replace this with your model creation code + teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus) + if args.kd_resume: + teacher, _, _ = apputils.load_checkpoint(teacher, chkpt_file=args.kd_resume) + + # Create policy and add to scheduler + dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt) + args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw) + compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch, ending_epoch=args.epochs, + frequency=1) +</code></pre> + +<p>Finally, during the training loop, we need to perform forward propagation through the teacher model as well. The <code>KnowledgeDistillationPolicy</code> class keeps a reference to both the student and teacher models, and exposes a <code>forward</code> function that performs forward propagation on both of them. Since this is not one of the standard policy callbacks, we need to call this function manually from our training loop, as follows:</p> +<pre><code>if args.kd_policy is None: + # Revert to a "normal" forward-prop call if no knowledge distillation policy is present + output = model(input_var) +else: + output = args.kd_policy.forward(input_var) +</code></pre> + +<p>To see this integration in action, take a look at the image classification sample at <code>examples/classifier_compression/compress_classifier.py</code>.</p> </div> </div> diff --git a/docs/search.html b/docs/search.html index 7e081ab8457aec6b0eac6a48589a2ebcdb0ce7f0..60998fdbfa0cddbf439d40e91f33db197c0e656e 100644 --- a/docs/search.html +++ b/docs/search.html @@ -77,6 +77,10 @@ <a class="" href="quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/docs/search/search_index.json b/docs/search/search_index.json index be04e4340483834cb48cc927d45ce9357279ebc3..b46c281d86cea76f813870eb379ab0b98fe55567 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -137,7 +137,7 @@ }, { "location": "/schedule/index.html", - "text": "Compression scheduler\n\n\nIn iterative pruning, we create some kind of pruning regimen that specifies how to prune, and what to prune at every stage of the pruning and training stages. This motivated the design of \nCompressionScheduler\n: it needed to be part of the training loop, and to be able to make and implement pruning, regularization and quantization decisions. We wanted to be able to change the particulars of the compression schedule, w/o touching the code, and settled on using YAML as a container for this specification. We found that when we make many experiments on the same code base, it is easier to maintain all of these experiments if we decouple the differences from the code-base. 
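Since the scheduler "needed to be part of the training loop", it helps to see roughly where its callbacks sit around a standard PyTorch training step. The sketch below is an illustration under stated assumptions: the callback names and argument order follow the scheduler interface as used by the bundled sample application as far as I can tell, but the exact signatures should be checked against `distiller/scheduler.py`; the model, data and optimizer are toy placeholders, and the scheduler here carries no policies.

```python
import torch
import torch.nn as nn
import distiller

# Toy placeholders -- in a real application these come from your own code.
model = nn.Linear(10, 2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
train_loader = [(torch.randn(4, 10), torch.randint(0, 2, (4,))) for _ in range(8)]

# A scheduler would normally be built from a YAML schedule (see distiller/config.py)
# or assembled manually with add_policy(); here it is simply empty.
scheduler = distiller.CompressionScheduler(model)

for epoch in range(2):
    scheduler.on_epoch_begin(epoch)                      # e.g. pruners recompute masks
    for step, (inputs, labels) in enumerate(train_loader):
        scheduler.on_minibatch_begin(epoch, step, len(train_loader))
        output = model(inputs)                           # or kd_policy.forward(inputs) with distillation
        loss = criterion(output, labels)
        # Policies may adjust the loss (e.g. add a regularization term); depending on
        # the version this callback returns the possibly-modified loss.
        new_loss = scheduler.before_backward_pass(epoch, step, len(train_loader), loss)
        loss = new_loss if new_loss is not None else loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.on_minibatch_end(epoch, step, len(train_loader))
    scheduler.on_epoch_end(epoch)                        # e.g. quantizers re-quantize weights
```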
Therefore, we added to the scheduler support for learning-rate decay scheduling because, again, we wanted the freedom to change the LR-decay policy without changing code. \n\n\nHigh level overview\n\n\nLet's briefly discuss the main mechanisms and abstractions: A schedule specification is composed of a list of sections defining instances of Pruners, Regularizers, Quantizers, LR-scheduler and Policies.\n\n\n\n\nPruners, Regularizers and Quantizers are very similar: They implement either a Pruning/Regularization/Quantization algorithm, respectively. \n\n\nAn LR-scheduler specifies the LR-decay algorithm. \n\n\n\n\nThese define the \nwhat\n part of the schedule. \n\n\nThe Policies define the \nwhen\n part of the schedule: at which epoch to start applying the Pruner/Regularizer/Quantizer/LR-decay, the epoch to end, and how often to invoke the policy (frequency of application). A policy also defines the instance of Pruner/Regularizer/Quantizer/LR-decay it is managing.\n\nThe CompressionScheduler is configured from a YAML file or from a dictionary, but you can also manually create Policies, Pruners, Regularizers and Quantizers from code.\n\n\nSyntax through example\n\n\nWe'll use \nalexnet.schedule_agp.yaml\n to explain some of the YAML syntax for configuring Sensitivity Pruning of Alexnet.\n\n\nversion: 1\npruners:\n my_pruner:\n class: 'SensitivityPruner'\n sensitivities:\n 'features.module.0.weight': 0.25\n 'features.module.3.weight': 0.35\n 'features.module.6.weight': 0.40\n 'features.module.8.weight': 0.45\n 'features.module.10.weight': 0.55\n 'classifier.1.weight': 0.875\n 'classifier.4.weight': 0.875\n 'classifier.6.weight': 0.625\n\nlr_schedulers:\n pruning_lr:\n class: ExponentialLR\n gamma: 0.9\n\npolicies:\n - pruner:\n instance_name : 'my_pruner'\n starting_epoch: 0\n ending_epoch: 38\n frequency: 2\n\n - lr_scheduler:\n instance_name: pruning_lr\n starting_epoch: 24\n ending_epoch: 200\n frequency: 1\n\n\n\n\nThere is only one version of the YAML syntax, and the version number is not verified at the moment. However, to be future-proof it is probably better to let the YAML parser know that you are using version-1 syntax, in case there is ever a version 2.\n\n\nversion: 1\n\n\n\n\nIn the \npruners\n section, we define the instances of pruners we want the scheduler to instantiate and use.\n\nWe define a single pruner instance, named \nmy_pruner\n, of algorithm \nSensitivityPruner\n. We will refer to this instance in the \nPolicies\n section.\n\nThen we list the sensitivity multipliers, \\(s\\), of each of the weight tensors.\n\nYou may list as many Pruners as you want in this section, as long as each has a unique name. You can several types of pruners in one schedule.\n\n\npruners:\n my_pruner:\n class: 'SensitivityPruner'\n sensitivities:\n 'features.module.0.weight': 0.25\n 'features.module.3.weight': 0.35\n 'features.module.6.weight': 0.40\n 'features.module.8.weight': 0.45\n 'features.module.10.weight': 0.55\n 'classifier.1.weight': 0.875\n 'classifier.4.weight': 0.875\n 'classifier.6.weight': 0.6\n\n\n\n\nNext, we want to specify the learning-rate decay scheduling in the \nlr_schedulers\n section. We assign a name to this instance: \npruning_lr\n. As in the \npruners\n section, you may use any name, as long as all LR-schedulers have a unique name. At the moment, only one instance of LR-scheduler is allowed. The LR-scheduler must be a subclass of PyTorch's \n_LRScheduler\n. You can use any of the schedulers defined in \ntorch.optim.lr_scheduler\n (see \nhere\n). 
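As a plain-PyTorch aside (independent of Distiller), the `ExponentialLR` entry above maps directly onto the standard scheduler class: `gamma: 0.9` is simply the keyword argument forwarded to its constructor, and each `step()` then multiplies the learning rate by 0.9. The model and optimizer below are placeholders.

```python
import torch

model = torch.nn.Linear(10, 2)                    # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

# 'class: ExponentialLR' with 'gamma: 0.9' corresponds to:
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

for epoch in range(3):
    # ... one epoch of training would go here ...
    optimizer.step()
    lr_scheduler.step()                           # lr: 0.05 -> 0.045 -> 0.0405 -> ...
    print(epoch, optimizer.param_groups[0]['lr'])
```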
In addition, we've implemented some additional schedulers in Distiller (see \nhere\n). The keyword arguments (kwargs) are passed directly to the LR-scheduler's constructor, so that as new LR-schedulers are added to \ntorch.optim.lr_scheduler\n, they can be used without changing the application code.\n\n\nlr_schedulers:\n pruning_lr:\n class: ExponentialLR\n gamma: 0.9\n\n\n\n\nFinally, we define the \npolicies\n section which defines the actual scheduling. A \nPolicy\n manages an instance of a \nPruner\n, \nRegularizer\n, \nQuantizer\n, or \nLRScheduler\n, by naming the instance. In the example below, a \nPruningPolicy\n uses the pruner instance named \nmy_pruner\n: it activates it at a frequency of 2 epochs (i.e. every other epoch), starting at epoch 0, and ending at epoch 38. \n\n\npolicies:\n - pruner:\n instance_name : 'my_pruner'\n starting_epoch: 0\n ending_epoch: 38\n frequency: 2\n\n - lr_scheduler:\n instance_name: pruning_lr\n starting_epoch: 24\n ending_epoch: 200\n frequency: 1\n\n\n\n\nThis is \niterative pruning\n:\n\n\n\n\n\n\nTrain Connectivity\n\n\n\n\n\n\nPrune Connections\n\n\n\n\n\n\nRetrain Weights\n\n\n\n\n\n\nGoto 2\n\n\n\n\n\n\nIt is described in \nLearning both Weights and Connections for Efficient Neural Networks\n:\n\n\n\n\n\"Our method prunes redundant connections using a three-step method. First, we train the network to learn which connections are important. Next, we prune the unimportant connections. Finally, we retrain the network to fine tune the weights of the remaining connections...After an initial training phase, we remove all connections whose weight is lower than a threshold. This pruning converts a dense, fully-connected layer to a sparse layer. This first phase learns the topology of the networks \u2014 learning which connections are important and removing the unimportant connections. We then retrain the sparse network so the remaining connections can compensate for the connections that have been removed. 
The phases of pruning and retraining may be repeated iteratively to further reduce network complexity.\"\n\n\n\n\nRegularization\n\n\nYou can also define and schedule regularization.\n\n\nL1 regularization\n\n\nFormat (this is an informal specification, not a valid \nABNF\n specification):\n\n\nregularizers:\n \nREGULARIZER_NAME_STR\n:\n class: L1Regularizer\n reg_regims:\n \nPYTORCH_PARAM_NAME_STR\n: \nSTRENGTH_FLOAT\n\n ...\n \nPYTORCH_PARAM_NAME_STR\n: \nSTRENGTH_FLOAT\n\n threshold_criteria: [Mean_Abs | Max]\n\n\n\n\nFor example:\n\n\nversion: 1\n\nregularizers:\n my_L1_reg:\n class: L1Regularizer\n reg_regims:\n 'module.layer3.1.conv1.weight': 0.000002\n 'module.layer3.1.conv2.weight': 0.000002\n 'module.layer3.1.conv3.weight': 0.000002\n 'module.layer3.2.conv1.weight': 0.000002\n threshold_criteria: Mean_Abs\n\npolicies:\n - regularizer:\n instance_name: my_L1_reg\n starting_epoch: 0\n ending_epoch: 60\n frequency: 1\n\n\n\n\nGroup regularization\n\n\nFormat (informal specification):\n\n\nFormat:\n regularizers:\n \nREGULARIZER_NAME_STR\n:\n class: L1Regularizer\n reg_regims:\n \nPYTORCH_PARAM_NAME_STR\n: [\nSTRENGTH_FLOAT\n, \n'2D' | '3D' | '4D' | 'Channels' | 'Cols' | 'Rows'\n]\n \nPYTORCH_PARAM_NAME_STR\n: [\nSTRENGTH_FLOAT\n, \n'2D' | '3D' | '4D' | 'Channels' | 'Cols' | 'Rows'\n]\n threshold_criteria: [Mean_Abs | Max]\n\n\n\n\nFor example:\n\n\nversion: 1\n\nregularizers:\n my_filter_regularizer:\n class: GroupLassoRegularizer\n reg_regims:\n 'module.layer3.1.conv1.weight': [0.00005, '3D']\n 'module.layer3.1.conv2.weight': [0.00005, '3D']\n 'module.layer3.1.conv3.weight': [0.00005, '3D']\n 'module.layer3.2.conv1.weight': [0.00005, '3D']\n threshold_criteria: Mean_Abs\n\npolicies:\n - regularizer:\n instance_name: my_filter_regularizer\n starting_epoch: 0\n ending_epoch: 60\n frequency: 1\n\n\n\n\nMixing it up\n\n\nYou can mix pruning and regularization.\n\n\nversion: 1\npruners:\n my_pruner:\n class: 'SensitivityPruner'\n sensitivities:\n 'features.module.0.weight': 0.25\n 'features.module.3.weight': 0.35\n 'features.module.6.weight': 0.40\n 'features.module.8.weight': 0.45\n 'features.module.10.weight': 0.55\n 'classifier.1.weight': 0.875\n 'classifier.4.weight': 0.875\n 'classifier.6.weight': 0.625\n\nregularizers:\n 2d_groups_regularizer:\n class: GroupLassoRegularizer\n reg_regims:\n 'features.module.0.weight': [0.000012, '2D']\n 'features.module.3.weight': [0.000012, '2D']\n 'features.module.6.weight': [0.000012, '2D']\n 'features.module.8.weight': [0.000012, '2D']\n 'features.module.10.weight': [0.000012, '2D']\n\n\nlr_schedulers:\n # Learning rate decay scheduler\n pruning_lr:\n class: ExponentialLR\n gamma: 0.9\n\npolicies:\n - pruner:\n instance_name : 'my_pruner'\n starting_epoch: 0\n ending_epoch: 38\n frequency: 2\n\n - regularizer:\n instance_name: '2d_groups_regularizer'\n starting_epoch: 0\n ending_epoch: 38\n frequency: 1\n\n - lr_scheduler:\n instance_name: pruning_lr\n starting_epoch: 24\n ending_epoch: 200\n frequency: 1\n\n\n\n\n\nQuantization\n\n\nSimilarly to pruners and regularizers, specifying a quantizer in the scheduler YAML follows the constructor arguments of the \nQuantizer\n class (see details \nhere\n).\n\n\nNotes\n: Only a single quantizer instance may be defined.\n\nLet's see an example:\n\n\nquantizers:\n dorefa_quantizer:\n class: DorefaQuantizer\n bits_activations: 8\n bits_weights: 4\n bits_overrides:\n conv1:\n wts: null\n acts: null\n relu1:\n wts: null\n acts: null\n final_relu:\n wts: null\n acts: null\n fc:\n wts: null\n acts: 
null\n\n\n\n\n\n\nThe specific quantization method we're instantiating here is \nDorefaQuantizer\n.\n\n\nThen we define the default bit-widths for activations and weights, in this case 8 and 4-bits, respectively. \n\n\nThen, we define the \nbits_overrides\n mapping. In the example above, we choose not to quantize the first and last layer of the model. In the case of \nDorefaQuantizer\n, the weights are quantized as part of the convolution / FC layers, but the activations are quantized in separate layers, which replace the ReLU layers in the original model (remember - even though we replaced the ReLU modules with our own quantization modules, the name of the modules isn't changed). So, in all, we need to reference the first layer with parameters \nconv1\n, the first activation layer \nrelu1\n, the last activation layer \nfinal_relu\n and the last layer with parameters \nfc\n.\n\n\nSpecifying \nnull\n means \"do not quantize\".\n\n\nNote that for quantizers, we reference names of modules, not names of parameters as we do for pruners and regularizers.\n\n\n\n\nDefining overrides for \ngroups of layers\n using regular expressions\n\n\nSuppose we have a sub-module in our model named \nblock1\n, which contains multiple convolution layers which we would like to quantize to, say, 2-bits. The convolution layers are named \nconv1\n, \nconv2\n and so on. In that case we would define the following:\n\n\nbits_overrides:\n 'block1\\.conv*':\n wts: 2\n acts: null\n\n\n\n\n\n\nRegEx Note\n: Remember that the dot (\n.\n) is a meta-character (i.e. a reserved character) in regular expressions. So, to match the actual dot characters which separate sub-modules in PyTorch module names, we need to escape it: \n\\.\n\n\n\n\nOverlapping patterns\n are also possible, which allows to define some override for a groups of layers and also \"single-out\" specific layers for different overrides. For example, let's take the last example and configure a different override for \nblock1.conv1\n:\n\n\nbits_overrides:\n 'block1\\.conv1':\n wts: 4\n acts: null\n 'block1\\.conv*':\n wts: 2\n acts: null\n\n\n\n\n\n\nImportant Note\n: The patterns are evaluated eagerly - first match wins. So, to properly quantize a model using \"broad\" patterns and more \"specific\" patterns as just shown, make sure the specific pattern is listed \nbefore\n the broad one.\n\n\n\n\nThe \nQuantizationPolicy\n, which controls the quantization procedure during training, is actually quite simplistic. All it does is call the \nprepare_model()\n function of the \nQuantizer\n when it's initialized, followed by the first call to \nquantize_params()\n. Then, at the end of each epoch, after the float copy of the weights has been updated, it calls the \nquantize_params()\n function again. \n\n\npolicies:\n - quantizer:\n instance_name: dorefa_quantizer\n starting_epoch: 0\n ending_epoch: 200\n frequency: 1\n\n\n\n\nImportant Note\n: As mentioned \nhere\n, since the quantizer modifies the model's parameters (assuming training with quantization in the loop is used), the call to \nprepare_model()\n must be performed before an optimizer is called. Therefore, currently, the starting epoch for a quantization policy must be 0, otherwise the quantization process will not work as expected. 
If one wishes to do a \"warm-startup\" (or \"boot-strapping\"), training for a few epochs with full precision and only then starting to quantize, the only way to do this right now is to execute a separate run to generate the boot-strapped weights, and execute a second which will resume the checkpoint with the boot-strapped weights.", + "text": "Compression scheduler\n\n\nIn iterative pruning, we create some kind of pruning regimen that specifies how to prune, and what to prune at every stage of the pruning and training stages. This motivated the design of \nCompressionScheduler\n: it needed to be part of the training loop, and to be able to make and implement pruning, regularization and quantization decisions. We wanted to be able to change the particulars of the compression schedule, w/o touching the code, and settled on using YAML as a container for this specification. We found that when we make many experiments on the same code base, it is easier to maintain all of these experiments if we decouple the differences from the code-base. Therefore, we added to the scheduler support for learning-rate decay scheduling because, again, we wanted the freedom to change the LR-decay policy without changing code. \n\n\nHigh level overview\n\n\nLet's briefly discuss the main mechanisms and abstractions: A schedule specification is composed of a list of sections defining instances of Pruners, Regularizers, Quantizers, LR-scheduler and Policies.\n\n\n\n\nPruners, Regularizers and Quantizers are very similar: They implement either a Pruning/Regularization/Quantization algorithm, respectively. \n\n\nAn LR-scheduler specifies the LR-decay algorithm. \n\n\n\n\nThese define the \nwhat\n part of the schedule. \n\n\nThe Policies define the \nwhen\n part of the schedule: at which epoch to start applying the Pruner/Regularizer/Quantizer/LR-decay, the epoch to end, and how often to invoke the policy (frequency of application). A policy also defines the instance of Pruner/Regularizer/Quantizer/LR-decay it is managing.\n\nThe \nCompressionScheduler\n is configured from a YAML file or from a dictionary, but you can also manually create Policies, Pruners, Regularizers and Quantizers from code.\n\n\nSyntax through example\n\n\nWe'll use \nalexnet.schedule_agp.yaml\n to explain some of the YAML syntax for configuring Sensitivity Pruning of Alexnet.\n\n\nversion: 1\npruners:\n my_pruner:\n class: 'SensitivityPruner'\n sensitivities:\n 'features.module.0.weight': 0.25\n 'features.module.3.weight': 0.35\n 'features.module.6.weight': 0.40\n 'features.module.8.weight': 0.45\n 'features.module.10.weight': 0.55\n 'classifier.1.weight': 0.875\n 'classifier.4.weight': 0.875\n 'classifier.6.weight': 0.625\n\nlr_schedulers:\n pruning_lr:\n class: ExponentialLR\n gamma: 0.9\n\npolicies:\n - pruner:\n instance_name : 'my_pruner'\n starting_epoch: 0\n ending_epoch: 38\n frequency: 2\n\n - lr_scheduler:\n instance_name: pruning_lr\n starting_epoch: 24\n ending_epoch: 200\n frequency: 1\n\n\n\n\nThere is only one version of the YAML syntax, and the version number is not verified at the moment. However, to be future-proof it is probably better to let the YAML parser know that you are using version-1 syntax, in case there is ever a version 2.\n\n\nversion: 1\n\n\n\n\nIn the \npruners\n section, we define the instances of pruners we want the scheduler to instantiate and use.\n\nWe define a single pruner instance, named \nmy_pruner\n, of algorithm \nSensitivityPruner\n. 
We will refer to this instance in the \nPolicies\n section.\n\nThen we list the sensitivity multipliers, \\(s\\), of each of the weight tensors.\n\nYou may list as many Pruners as you want in this section, as long as each has a unique name. You can several types of pruners in one schedule.\n\n\npruners:\n my_pruner:\n class: 'SensitivityPruner'\n sensitivities:\n 'features.module.0.weight': 0.25\n 'features.module.3.weight': 0.35\n 'features.module.6.weight': 0.40\n 'features.module.8.weight': 0.45\n 'features.module.10.weight': 0.55\n 'classifier.1.weight': 0.875\n 'classifier.4.weight': 0.875\n 'classifier.6.weight': 0.6\n\n\n\n\nNext, we want to specify the learning-rate decay scheduling in the \nlr_schedulers\n section. We assign a name to this instance: \npruning_lr\n. As in the \npruners\n section, you may use any name, as long as all LR-schedulers have a unique name. At the moment, only one instance of LR-scheduler is allowed. The LR-scheduler must be a subclass of PyTorch's \n_LRScheduler\n. You can use any of the schedulers defined in \ntorch.optim.lr_scheduler\n (see \nhere\n). In addition, we've implemented some additional schedulers in Distiller (see \nhere\n). The keyword arguments (kwargs) are passed directly to the LR-scheduler's constructor, so that as new LR-schedulers are added to \ntorch.optim.lr_scheduler\n, they can be used without changing the application code.\n\n\nlr_schedulers:\n pruning_lr:\n class: ExponentialLR\n gamma: 0.9\n\n\n\n\nFinally, we define the \npolicies\n section which defines the actual scheduling. A \nPolicy\n manages an instance of a \nPruner\n, \nRegularizer\n, \nQuantizer\n, or \nLRScheduler\n, by naming the instance. In the example below, a \nPruningPolicy\n uses the pruner instance named \nmy_pruner\n: it activates it at a frequency of 2 epochs (i.e. every other epoch), starting at epoch 0, and ending at epoch 38. \n\n\npolicies:\n - pruner:\n instance_name : 'my_pruner'\n starting_epoch: 0\n ending_epoch: 38\n frequency: 2\n\n - lr_scheduler:\n instance_name: pruning_lr\n starting_epoch: 24\n ending_epoch: 200\n frequency: 1\n\n\n\n\nThis is \niterative pruning\n:\n\n\n\n\n\n\nTrain Connectivity\n\n\n\n\n\n\nPrune Connections\n\n\n\n\n\n\nRetrain Weights\n\n\n\n\n\n\nGoto 2\n\n\n\n\n\n\nIt is described in \nLearning both Weights and Connections for Efficient Neural Networks\n:\n\n\n\n\n\"Our method prunes redundant connections using a three-step method. First, we train the network to learn which connections are important. Next, we prune the unimportant connections. Finally, we retrain the network to fine tune the weights of the remaining connections...After an initial training phase, we remove all connections whose weight is lower than a threshold. This pruning converts a dense, fully-connected layer to a sparse layer. This first phase learns the topology of the networks \u2014 learning which connections are important and removing the unimportant connections. We then retrain the sparse network so the remaining connections can compensate for the connections that have been removed. 
The phases of pruning and retraining may be repeated iteratively to further reduce network complexity.\"\n\n\n\n\nRegularization\n\n\nYou can also define and schedule regularization.\n\n\nL1 regularization\n\n\nFormat (this is an informal specification, not a valid \nABNF\n specification):\n\n\nregularizers:\n \nREGULARIZER_NAME_STR\n:\n class: L1Regularizer\n reg_regims:\n \nPYTORCH_PARAM_NAME_STR\n: \nSTRENGTH_FLOAT\n\n ...\n \nPYTORCH_PARAM_NAME_STR\n: \nSTRENGTH_FLOAT\n\n threshold_criteria: [Mean_Abs | Max]\n\n\n\n\nFor example:\n\n\nversion: 1\n\nregularizers:\n my_L1_reg:\n class: L1Regularizer\n reg_regims:\n 'module.layer3.1.conv1.weight': 0.000002\n 'module.layer3.1.conv2.weight': 0.000002\n 'module.layer3.1.conv3.weight': 0.000002\n 'module.layer3.2.conv1.weight': 0.000002\n threshold_criteria: Mean_Abs\n\npolicies:\n - regularizer:\n instance_name: my_L1_reg\n starting_epoch: 0\n ending_epoch: 60\n frequency: 1\n\n\n\n\nGroup regularization\n\n\nFormat (informal specification):\n\n\nFormat:\n regularizers:\n \nREGULARIZER_NAME_STR\n:\n class: L1Regularizer\n reg_regims:\n \nPYTORCH_PARAM_NAME_STR\n: [\nSTRENGTH_FLOAT\n, \n'2D' | '3D' | '4D' | 'Channels' | 'Cols' | 'Rows'\n]\n \nPYTORCH_PARAM_NAME_STR\n: [\nSTRENGTH_FLOAT\n, \n'2D' | '3D' | '4D' | 'Channels' | 'Cols' | 'Rows'\n]\n threshold_criteria: [Mean_Abs | Max]\n\n\n\n\nFor example:\n\n\nversion: 1\n\nregularizers:\n my_filter_regularizer:\n class: GroupLassoRegularizer\n reg_regims:\n 'module.layer3.1.conv1.weight': [0.00005, '3D']\n 'module.layer3.1.conv2.weight': [0.00005, '3D']\n 'module.layer3.1.conv3.weight': [0.00005, '3D']\n 'module.layer3.2.conv1.weight': [0.00005, '3D']\n threshold_criteria: Mean_Abs\n\npolicies:\n - regularizer:\n instance_name: my_filter_regularizer\n starting_epoch: 0\n ending_epoch: 60\n frequency: 1\n\n\n\n\nMixing it up\n\n\nYou can mix pruning and regularization.\n\n\nversion: 1\npruners:\n my_pruner:\n class: 'SensitivityPruner'\n sensitivities:\n 'features.module.0.weight': 0.25\n 'features.module.3.weight': 0.35\n 'features.module.6.weight': 0.40\n 'features.module.8.weight': 0.45\n 'features.module.10.weight': 0.55\n 'classifier.1.weight': 0.875\n 'classifier.4.weight': 0.875\n 'classifier.6.weight': 0.625\n\nregularizers:\n 2d_groups_regularizer:\n class: GroupLassoRegularizer\n reg_regims:\n 'features.module.0.weight': [0.000012, '2D']\n 'features.module.3.weight': [0.000012, '2D']\n 'features.module.6.weight': [0.000012, '2D']\n 'features.module.8.weight': [0.000012, '2D']\n 'features.module.10.weight': [0.000012, '2D']\n\n\nlr_schedulers:\n # Learning rate decay scheduler\n pruning_lr:\n class: ExponentialLR\n gamma: 0.9\n\npolicies:\n - pruner:\n instance_name : 'my_pruner'\n starting_epoch: 0\n ending_epoch: 38\n frequency: 2\n\n - regularizer:\n instance_name: '2d_groups_regularizer'\n starting_epoch: 0\n ending_epoch: 38\n frequency: 1\n\n - lr_scheduler:\n instance_name: pruning_lr\n starting_epoch: 24\n ending_epoch: 200\n frequency: 1\n\n\n\n\n\nQuantization\n\n\nSimilarly to pruners and regularizers, specifying a quantizer in the scheduler YAML follows the constructor arguments of the \nQuantizer\n class (see details \nhere\n).\n\n\nNotes\n: Only a single quantizer instance may be defined.\n\nLet's see an example:\n\n\nquantizers:\n dorefa_quantizer:\n class: DorefaQuantizer\n bits_activations: 8\n bits_weights: 4\n bits_overrides:\n conv1:\n wts: null\n acts: null\n relu1:\n wts: null\n acts: null\n final_relu:\n wts: null\n acts: null\n fc:\n wts: null\n acts: 
null\n\n\n\n\n\n\nThe specific quantization method we're instantiating here is \nDorefaQuantizer\n.\n\n\nThen we define the default bit-widths for activations and weights, in this case 8 and 4-bits, respectively. \n\n\nThen, we define the \nbits_overrides\n mapping. In the example above, we choose not to quantize the first and last layer of the model. In the case of \nDorefaQuantizer\n, the weights are quantized as part of the convolution / FC layers, but the activations are quantized in separate layers, which replace the ReLU layers in the original model (remember - even though we replaced the ReLU modules with our own quantization modules, the name of the modules isn't changed). So, in all, we need to reference the first layer with parameters \nconv1\n, the first activation layer \nrelu1\n, the last activation layer \nfinal_relu\n and the last layer with parameters \nfc\n.\n\n\nSpecifying \nnull\n means \"do not quantize\".\n\n\nNote that for quantizers, we reference names of modules, not names of parameters as we do for pruners and regularizers.\n\n\n\n\nDefining overrides for \ngroups of layers\n using regular expressions\n\n\nSuppose we have a sub-module in our model named \nblock1\n, which contains multiple convolution layers which we would like to quantize to, say, 2-bits. The convolution layers are named \nconv1\n, \nconv2\n and so on. In that case we would define the following:\n\n\nbits_overrides:\n 'block1\\.conv*':\n wts: 2\n acts: null\n\n\n\n\n\n\nRegEx Note\n: Remember that the dot (\n.\n) is a meta-character (i.e. a reserved character) in regular expressions. So, to match the actual dot characters which separate sub-modules in PyTorch module names, we need to escape it: \n\\.\n\n\n\n\nOverlapping patterns\n are also possible, which allows to define some override for a groups of layers and also \"single-out\" specific layers for different overrides. For example, let's take the last example and configure a different override for \nblock1.conv1\n:\n\n\nbits_overrides:\n 'block1\\.conv1':\n wts: 4\n acts: null\n 'block1\\.conv*':\n wts: 2\n acts: null\n\n\n\n\n\n\nImportant Note\n: The patterns are evaluated eagerly - first match wins. So, to properly quantize a model using \"broad\" patterns and more \"specific\" patterns as just shown, make sure the specific pattern is listed \nbefore\n the broad one.\n\n\n\n\nThe \nQuantizationPolicy\n, which controls the quantization procedure during training, is actually quite simplistic. All it does is call the \nprepare_model()\n function of the \nQuantizer\n when it's initialized, followed by the first call to \nquantize_params()\n. Then, at the end of each epoch, after the float copy of the weights has been updated, it calls the \nquantize_params()\n function again. \n\n\npolicies:\n - quantizer:\n instance_name: dorefa_quantizer\n starting_epoch: 0\n ending_epoch: 200\n frequency: 1\n\n\n\n\nImportant Note\n: As mentioned \nhere\n, since the quantizer modifies the model's parameters (assuming training with quantization in the loop is used), the call to \nprepare_model()\n must be performed before an optimizer is called. Therefore, currently, the starting epoch for a quantization policy must be 0, otherwise the quantization process will not work as expected. 
If one wishes to do a \"warm-startup\" (or \"boot-strapping\"), training for a few epochs with full precision and only then starting to quantize, the only way to do this right now is to execute a separate run to generate the boot-strapped weights, and execute a second which will resume the checkpoint with the boot-strapped weights.\n\n\nKnowledge Distillation\n\n\nKnowledge distillation (see \nhere\n) is also implemented as a \nPolicy\n, which should be added to the scheduler. However, with the current implementation, it cannot be defined within the YAML file like the rest of the policies described above.\n\n\nTo make the integration of this method into applications a bit easier, a helper function can be used that will add a set of command-line arguments related to knowledge distillation:\n\n\nimport argparse\nimport distiller\n\nparser = argparse.ArgumentParser()\ndistiller.knowledge_distillation.add_distillation_args(parser)\n\n\n\n\n(The \nadd_distillation_args\n function accepts some optional arguments, see its implementation at \ndistiller/knowledge_distillation.py\n for details)\n\n\nThese are the command line arguments exposed by this function:\n\n\nKnowledge Distillation Training Arguments:\n --kd-teacher ARCH Model architecture for teacher model\n --kd-pretrained Use pre-trained model for teacher\n --kd-resume PATH Path to checkpoint from which to load teacher weights\n --kd-temperature TEMP, --kd-temp TEMP\n Knowledge distillation softmax temperature\n --kd-distill-wt WEIGHT, --kd-dw WEIGHT\n Weight for distillation loss (student vs. teacher soft\n targets)\n --kd-student-wt WEIGHT, --kd-sw WEIGHT\n Weight for student vs. labels loss\n --kd-teacher-wt WEIGHT, --kd-tw WEIGHT\n Weight for teacher vs. labels loss\n --kd-start-epoch EPOCH_NUM\n Epoch from which to enable distillation\n\n\n\n\n\nOnce arguments have been parsed, some initialization code is required, similar to the following:\n\n\n# Assuming:\n# \nargs\n variable holds command line arguments\n# \nmodel\n variable holds the model we're going to train, that is - the student model\n# \ncompression_scheduler\n variable holds a CompressionScheduler instance\n\nargs.kd_policy = None\nif args.kd_teacher:\n # Create teacher model - replace this with your model creation code\n teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus)\n if args.kd_resume:\n teacher, _, _ = apputils.load_checkpoint(teacher, chkpt_file=args.kd_resume)\n\n # Create policy and add to scheduler\n dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)\n args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw)\n compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch, ending_epoch=args.epochs,\n frequency=1)\n\n\n\n\nFinally, during the training loop, we need to perform forward propagation through the teacher model as well. The \nKnowledgeDistillationPolicy\n class keeps a reference to both the student and teacher models, and exposes a \nforward\n function that performs forward propagation on both of them. 
Since this is not one of the standard policy callbacks, we need to call this function manually from our training loop, as follows:\n\n\nif args.kd_policy is None:\n # Revert to a \nnormal\n forward-prop call if no knowledge distillation policy is present\n output = model(input_var)\nelse:\n output = args.kd_policy.forward(input_var)\n\n\n\n\nTo see this integration in action, take a look at the image classification sample at \nexamples/classifier_compression/compress_classifier.py\n.", "title": "Compression scheduling" }, { @@ -147,7 +147,7 @@ }, { "location": "/schedule/index.html#high-level-overview", - "text": "Let's briefly discuss the main mechanisms and abstractions: A schedule specification is composed of a list of sections defining instances of Pruners, Regularizers, Quantizers, LR-scheduler and Policies. Pruners, Regularizers and Quantizers are very similar: They implement either a Pruning/Regularization/Quantization algorithm, respectively. An LR-scheduler specifies the LR-decay algorithm. These define the what part of the schedule. The Policies define the when part of the schedule: at which epoch to start applying the Pruner/Regularizer/Quantizer/LR-decay, the epoch to end, and how often to invoke the policy (frequency of application). A policy also defines the instance of Pruner/Regularizer/Quantizer/LR-decay it is managing. \nThe CompressionScheduler is configured from a YAML file or from a dictionary, but you can also manually create Policies, Pruners, Regularizers and Quantizers from code.", + "text": "Let's briefly discuss the main mechanisms and abstractions: A schedule specification is composed of a list of sections defining instances of Pruners, Regularizers, Quantizers, LR-scheduler and Policies. Pruners, Regularizers and Quantizers are very similar: They implement either a Pruning/Regularization/Quantization algorithm, respectively. An LR-scheduler specifies the LR-decay algorithm. These define the what part of the schedule. The Policies define the when part of the schedule: at which epoch to start applying the Pruner/Regularizer/Quantizer/LR-decay, the epoch to end, and how often to invoke the policy (frequency of application). A policy also defines the instance of Pruner/Regularizer/Quantizer/LR-decay it is managing. \nThe CompressionScheduler is configured from a YAML file or from a dictionary, but you can also manually create Policies, Pruners, Regularizers and Quantizers from code.", "title": "High level overview" }, { @@ -185,6 +185,11 @@ "text": "Suppose we have a sub-module in our model named block1 , which contains multiple convolution layers which we would like to quantize to, say, 2-bits. The convolution layers are named conv1 , conv2 and so on. In that case we would define the following: bits_overrides:\n 'block1\\.conv*':\n wts: 2\n acts: null RegEx Note : Remember that the dot ( . ) is a meta-character (i.e. a reserved character) in regular expressions. So, to match the actual dot characters which separate sub-modules in PyTorch module names, we need to escape it: \\. Overlapping patterns are also possible, which allows to define some override for a groups of layers and also \"single-out\" specific layers for different overrides. For example, let's take the last example and configure a different override for block1.conv1 : bits_overrides:\n 'block1\\.conv1':\n wts: 4\n acts: null\n 'block1\\.conv*':\n wts: 2\n acts: null Important Note : The patterns are evaluated eagerly - first match wins. 
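For instance, the dot-escaping and ordering behaviour can be reproduced with plain Python regular expressions; the snippet below only illustrates the matching logic described in this note and does not use Distiller's actual override machinery.

```python
import re
from collections import OrderedDict

# Specific pattern listed before the broad one, as recommended.
bits_overrides = OrderedDict([
    (r'block1\.conv1', {'wts': 4, 'acts': None}),
    (r'block1\.conv*', {'wts': 2, 'acts': None}),
])

def find_override(module_name):
    # Patterns are evaluated eagerly: the first one that matches wins.
    for pattern, override in bits_overrides.items():
        if re.match(pattern, module_name):
            return override
    return None

print(find_override('block1.conv1'))   # {'wts': 4, 'acts': None}
print(find_override('block1.conv2'))   # {'wts': 2, 'acts': None}
```

Reversing the two entries would make `block1.conv1` hit the broad 2-bit pattern first, which is exactly the pitfall this note warns about.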
So, to properly quantize a model using \"broad\" patterns and more \"specific\" patterns as just shown, make sure the specific pattern is listed before the broad one. The QuantizationPolicy , which controls the quantization procedure during training, is actually quite simplistic. All it does is call the prepare_model() function of the Quantizer when it's initialized, followed by the first call to quantize_params() . Then, at the end of each epoch, after the float copy of the weights has been updated, it calls the quantize_params() function again. policies:\n - quantizer:\n instance_name: dorefa_quantizer\n starting_epoch: 0\n ending_epoch: 200\n frequency: 1 Important Note : As mentioned here , since the quantizer modifies the model's parameters (assuming training with quantization in the loop is used), the call to prepare_model() must be performed before an optimizer is called. Therefore, currently, the starting epoch for a quantization policy must be 0, otherwise the quantization process will not work as expected. If one wishes to do a \"warm-startup\" (or \"boot-strapping\"), training for a few epochs with full precision and only then starting to quantize, the only way to do this right now is to execute a separate run to generate the boot-strapped weights, and execute a second which will resume the checkpoint with the boot-strapped weights.", "title": "Defining overrides for groups of layers using regular expressions" }, + { + "location": "/schedule/index.html#knowledge-distillation", + "text": "Knowledge distillation (see here ) is also implemented as a Policy , which should be added to the scheduler. However, with the current implementation, it cannot be defined within the YAML file like the rest of the policies described above. To make the integration of this method into applications a bit easier, a helper function can be used that will add a set of command-line arguments related to knowledge distillation: import argparse\nimport distiller\n\nparser = argparse.ArgumentParser()\ndistiller.knowledge_distillation.add_distillation_args(parser) (The add_distillation_args function accepts some optional arguments, see its implementation at distiller/knowledge_distillation.py for details) These are the command line arguments exposed by this function: Knowledge Distillation Training Arguments:\n --kd-teacher ARCH Model architecture for teacher model\n --kd-pretrained Use pre-trained model for teacher\n --kd-resume PATH Path to checkpoint from which to load teacher weights\n --kd-temperature TEMP, --kd-temp TEMP\n Knowledge distillation softmax temperature\n --kd-distill-wt WEIGHT, --kd-dw WEIGHT\n Weight for distillation loss (student vs. teacher soft\n targets)\n --kd-student-wt WEIGHT, --kd-sw WEIGHT\n Weight for student vs. labels loss\n --kd-teacher-wt WEIGHT, --kd-tw WEIGHT\n Weight for teacher vs. 
labels loss\n --kd-start-epoch EPOCH_NUM\n Epoch from which to enable distillation Once arguments have been parsed, some initialization code is required, similar to the following: # Assuming:\n# args variable holds command line arguments\n# model variable holds the model we're going to train, that is - the student model\n# compression_scheduler variable holds a CompressionScheduler instance\n\nargs.kd_policy = None\nif args.kd_teacher:\n # Create teacher model - replace this with your model creation code\n teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus)\n if args.kd_resume:\n teacher, _, _ = apputils.load_checkpoint(teacher, chkpt_file=args.kd_resume)\n\n # Create policy and add to scheduler\n dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)\n args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw)\n compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch, ending_epoch=args.epochs,\n frequency=1) Finally, during the training loop, we need to perform forward propagation through the teacher model as well. The KnowledgeDistillationPolicy class keeps a reference to both the student and teacher models, and exposes a forward function that performs forward propagation on both of them. Since this is not one of the standard policy callbacks, we need to call this function manually from our training loop, as follows: if args.kd_policy is None:\n # Revert to a normal forward-prop call if no knowledge distillation policy is present\n output = model(input_var)\nelse:\n output = args.kd_policy.forward(input_var) To see this integration in action, take a look at the image classification sample at examples/classifier_compression/compress_classifier.py .", + "title": "Knowledge Distillation" + }, { "location": "/pruning/index.html", "text": "Pruning\n\n\nA common methodology for inducing sparsity in weights and activations is called \npruning\n. Pruning is the application of a binary criteria to decide which weights to prune: weights which match the pruning criteria are assigned a value of zero. Pruned elements are \"trimmed\" from the model: we zero their values and also make sure they don't take part in the back-propagation process.\n\n\nWe can prune weights, biases, and activations. Biases are few and their contribution to a layer's output is relatively large, so there is little incentive to prune them. We usually see sparse activations following a ReLU layer, because ReLU quenches negative activations to exact zero (\\(ReLU(x): max(0,x)\\)). Sparsity in weights is less common, as weights tend to be very small, but are often not exact zeros.\n\n\n\nLet's define sparsity\n\n\nSparsity is a a measure of how many elements in a tensor are exact zeros, relative to the tensor size. A tensor is considered sparse if \"most\" of its elements are zero. How much is \"most\", is not strictly defined, but when you see a sparse tensor you know it ;-)\n\nThe \n\\(l_0\\)-\"norm\" function\n measures how many zero-elements are in a tensor \nx\n:\n\\[\\lVert x \\rVert_0\\;=\\;|x_1|^0 + |x_2|^0 + ... + |x_n|^0 \\]\nIn other words, an element contributes either a value of 1 or 0 to \\(l_0\\). 
Anything but an exact zero contributes a value of 1 - that's pretty cool.\n\nSometimes it helps to think about density, the number of non-zero elements (NNZ) and sparsity's complement:\n\\[\ndensity = 1 - sparsity\n\\]\nYou can use \ndistiller.sparsity\n and \ndistiller.density\n to query a PyTorch tensor's sparsity and density.\n\n\nWhat is weights pruning?\n\n\nWeights pruning, or model pruning, is a set of methods to increase the sparsity (amount of zero-valued elements in a tensor) of a network's weights. In general, the term 'parameters' refers to both weights and bias tensors of a model. Biases are rarely, if ever, pruned because there are very few bias elements compared to weights elements, and it is just not worth the trouble.\n\n\nPruning requires a criteria for choosing which elements to prune - this is called the \npruning criteria\n. The most common pruning criteria is the absolute value of each element: the element's absolute value is compared to some threshold value, and if it is below the threshold the element is set to zero (i.e. pruned) . This is implemented by the \ndistiller.MagnitudeParameterPruner\n class. The idea behind this method, is that weights with small \\(l_1\\)-norms (absolute value) contribute little to the final result (low saliency), so they are less important and can be removed.\n\n\nA related idea motivating pruning, is that models are over-parametrized and contain redundant logic and features. Therefore, some of these redundancies can be removed by setting their weights to zero.\n\n\nAnd yet another way to think of pruning is to phrase it as a search for a set of weights with as many zeros as possible, which still produces acceptable inference accuracies compared to the dense-model (non-pruned model). Another way to look at it, is to imagine that because of the very high-dimensionality of the parameter space, the immediate space around the dense-model's solution likely contains some sparse solutions, and we want to use find these sparse solutions. \n\n\n\n\nPruning schedule\n\n\nThe most straight-forward to prune is to take a trained model and prune it once; also called \none-shot pruning\n. In \nLearning both Weights and Connections for Efficient Neural Networks\n Song Han et. al show that this is surprisingly effective, but also leaves a lot of potential sparsity untapped. The surprise is what they call the \"free lunch\" effect: \n\"reducing 2x the connections without losing accuracy even without retraining.\"\n\nHowever, they also note that when employing a pruning-followed-by-retraining regimen, they can achieve much better results (higher sparsity at no accuracy loss). This is called \niterative pruning\n, and the retraining that follows pruning is often referred to as \nfine-tuning\n. How the pruning criteria changes between iterations, how many iterations we perform and how often, and which tensors are pruned - this is collectively called the \npruning schedule\n.\n\n\nWe can think of iterative pruning as repeatedly learning which weights are important, removing the least important ones based on some importance criteria, and then retraining the model to let it \"recover\" from the pruning by adjusting the remaining weights. At each iteration, we prune more weights.\n\nThe decision of when to stop pruning is also expressed in the schedule, and it depends on the pruning algorithm. For example, if we are trying to achieve a specific sparsity level, then we stop when the pruning achieves that level. 
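As a generic illustration of the magnitude criterion and the sparsity/density bookkeeping described above, the snippet below applies a hand-picked threshold to a random tensor with plain PyTorch; it is not Distiller's `MagnitudeParameterPruner` itself, and the threshold value is arbitrary.

```python
import torch

torch.manual_seed(0)
weights = torch.randn(64, 64)

def sparsity(t):
    # Fraction of exact zeros; density is its complement.
    return float((t == 0).sum()) / t.numel()

threshold = 0.5
mask = (weights.abs() > threshold).float()   # 1 where |w| > threshold, else 0
pruned = weights * mask                      # element-wise (fine-grained) pruning

print('sparsity: %.2f  density: %.2f' % (sparsity(pruned), 1.0 - sparsity(pruned)))
# distiller.sparsity() / distiller.density() provide the same queries, as noted above.
```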
And if we are pruning weights structures in order to reduce the required compute budget, then we stop the pruning when this compute reduction is achieved.\n\n\nDistiller supports expressing the pruning schedule as a YAML file (which is then executed by an instance of a PruningScheduler).\n\n\nPruning granularity\n\n\nPruning individual weight elements is called \nelement-wise pruning\n, and it is also sometimes referred to as \nfine-grained\n pruning.\n\n\nCoarse-grained pruning\n - also referred to as \nstructured pruning\n, \ngroup pruning\n, or \nblock pruning\n - is pruning entire groups of elements which have some significance. Groups come in various shapes and sizes, but an easy to visualize group-pruning is filter-pruning, in which entire filters are removed.\n\n\nSensitivity analysis\n\n\nThe hard part about inducing sparsity via pruning is determining what threshold, or sparsity level, to use for each layer's tensors. Sensitivity analysis is a method that tries to help us rank the tensors by their sensitivity to pruning. \n\nThe idea is to set the pruning level (percentage) of a specific layer, and then to prune once, run an evaluation on the test dataset and record the accuracy score. We do this for all of the parameterized layers, and for each layer we examine several sparsity levels. This should teach us about the \"sensitivity\" of each of the layers to pruning.\n\n\nThe evaluated model should be trained to maximum accuracy before running the analysis, because we aim to understand the behavior of the trained model's performance in relation to pruning of a specific weights tensor.\n\n\nMuch as we can prune structures, we can also perform sensitivity analysis on structures. Distiller implements element-wise pruning sensitivity analysis using the \\(l_1\\)-norm of individual elements; and filter-wise pruning sensitivity analysis using the mean \\(l_1\\)-norm of filters.\n\n\n\nThe authors of \nPruning Filters for Efficient ConvNets\n describe how they do sensitivity analysis:\n\n\n\n\n\"To understand the sensitivity of each layer, we prune each layer independently and evaluate the resulting pruned network\u2019s accuracy on the validation set. Figure 2(b) shows that layers that maintain their accuracy as filters are pruned away correspond to layers with larger slopes in Figure 2(a). On the contrary, layers with relatively flat slopes are more sensitive to pruning. We empirically determine the number of filters to prune for each layer based on their sensitivity to pruning. For deep networks such as VGG-16 or ResNets, we observe that layers in the same stage (with the same feature map size) have a similar sensitivity to pruning. To avoid introducing layer-wise meta-parameters, we use the same pruning ratio for all layers in the same stage. For layers that are sensitive to pruning, we prune a smaller percentage of these layers or completely skip pruning them.\"\n\n\n\n\nThe diagram below shows the results of running an element-wise sensitivity analysis on Alexnet, using Distillers's \nperform_sensitivity_analysis\n utility function.\n\n\nAs reported by Song Han, and exhibited in the diagram, in Alexnet the feature detecting layers (convolution layers) are more sensitive to pruning, and their sensitivity drops, the deeper they are. The fully-connected layers are much less sensitive, which is great, because that's where most of the parameters are.\n\n\n\n\nReferences\n\n\n \nSong Han, Jeff Pool, John Tran, William J. 
Dally\n.\n \nLearning both Weights and Connections for Efficient Neural Networks\n,\n arXiv:1607.04381v2,\n 2015.\n\n\n\n\n\nHao Li, Asim Kadav, Igor Durdanovic, Hanan Samet, Hans Peter Graf\n.\n \nPruning Filters for Efficient ConvNets\n,\n arXiv:1608.08710v3,\n 2017.", @@ -252,7 +257,7 @@ }, { "location": "/quantization/index.html", - "text": "Quantization\n\n\nQuantization refers to the process of reducing the number of bits that represent a number. In the context of deep learning, the predominant numerical format used for research and for deployment has so far been 32-bit floating point, or FP32. However, the desire for reduced bandwidth and compute requirements of deep learning models has driven research into using lower-precision numerical formats. It has been extensively demonstrated that weights and activations can be represented using 8-bit integers (or INT8) without incurring significant loss in accuracy. The use of even lower bit-widths, such as 4/2/1-bits, is an active field of research that has also shown great progress.\n\n\nNote that this discussion is on quantization only in the context of more efficient inference. Using lower-precision numerics for more efficient training is currently out of scope.\n\n\nMotivation: Overall Efficiency\n\n\nThe more obvious benefit from quantization is \nsignificantly reduced bandwidth and storage\n. For instance, using INT8 for weights and activations consumes 4x less overall bandwidth compared to FP32.\n\nAdditionally integer compute is \nfaster\n than floating point compute. It is also much more \narea and energy efficient\n: \n\n\n\n\n\n\n\n\nINT8 Operation\n\n\nEnergy Saving vs FP32\n\n\nArea Saving vs FP32\n\n\n\n\n\n\n\n\n\n\nAdd\n\n\n30x\n\n\n116x\n\n\n\n\n\n\nMultiply\n\n\n18.5x\n\n\n27x\n\n\n\n\n\n\n\n\n(\nDally, 2015\n)\n\n\nNote that very aggressive quantization can yield even more efficiency. If weights are binary (-1, 1) or ternary (-1, 0, 1 using 2-bits), then convolution and fully-connected layers can be computed with additions and subtractions only, removing multiplications completely. If activations are binary as well, then additions can also be removed, in favor of bitwise operations (\nRastegari et al., 2016\n).\n\n\nInteger vs. FP32\n\n\nThere are two main attributes when discussing a numerical format. The first is \ndynamic range\n, which refers to the range of representable numbers. The second one is how many values can be represented within the dynamic range, which in turn determines the \nprecision / resolution\n of the format (the distance between two numbers).\n\nFor all integer formats, the dynamic range is \n[-2^{n-1} .. 2^{n-1}-1]\n, where \nn\n is the number of bits. So for INT8 the range is \n[-128 .. 127]\n, and for INT4 it is \n[-16 .. 15]\n (we're limiting ourselves to signed integers for now). The number of representable values is \n2^n\n.\nContrast that with FP32, where the dynamic range is \n\\pm 3.4\\ x\\ 10^{38}\n, and approximately \n4.2\\ x\\ 10^9\n values can be represented.\n\nWe can immediately see that FP32 is much more \nversatile\n, in that it is able to represent a wide range of distributions accurately. This is a nice property for deep learning models, where the distributions of weights and activations are usually very different (at least in dynamic range). 
In addition the dynamic range can differ between layers in the model.\n\nIn order to be able to represent these different distributions with an integer format, a \nscale factor\n is used to map the dynamic range of the tensor to the integer format range. But still we remain with the issue of having a significantly lower number of representable values, that is - much lower resolution.\n\nNote that this scale factor is, in most cases, a floating-point number. Hence, even when using integer numerics, some floating-point computations remain. \nCourbariaux et al., 2014\n scale using only shifts, eliminating the floating point operation. In \nGEMMLWOP\n, the FP32 scale factor is approximated using an integer or fixed-point multiplication followed by a shift operation. In many cases the effect of this approximation on accuracy is negligible.\n\n\nAvoiding Overflows\n\n\nConvolution and fully connected layers involve the storing of intermediate results in accumulators. Due to the limited dynamic range of integer formats, if we would use the same bit-width for the weights and activation, \nand\n for the accumulators, we would likely overflow very quickly. Therefore, accumulators are usually implemented with higher bit-widths.\n\nThe result of multiplying two \nn\n-bit integers is, at most, a \n2n\n-bit number. In convolution layers, such multiplications are accumulated \nc\\cdot k^2\n times, where \nc\n is the number of input channels and \nk\n is the kernel width (assuming a square kernel). Hence, to avoid overflowing, the accumulator should be \n2n + M\n-bits wide, where M is at least \nlog_2(c\\cdot k^2)\n. In many cases 32-bit accumulators are used, however for INT4 and lower it might be possible to use less than 32 -bits, depending on the expected use cases and layer widths.\n\n\n\"Conservative\" Quantization: INT8\n\n\nIn many cases, taking a model trained for FP32 and directly quantizing it to INT8, without any re-training, can result in a relatively low loss of accuracy (which may or may not be acceptable, depending on the use case). Some fine-tuning can further improve the accuracy (\nGysel at al., 2018\n).\n\nAs mentioned above, a scale factor is used to adapt the dynamic range of the tensor at hand to that of the integer format. This scale factor needs to be calculated per-layer per-tensor. The simplest way is to map the min/max values of the float tensor to the min/max of the integer format. For weights and biases this is easy, as they are set once training is complete. For activations, the min/max float values can be obtained \"online\" during inference, or \"offline\".\n\n\n\n\nOffline\n means gathering activations statistics before deploying the model, either during training or by running a few \"calibration\" batches on the trained FP32 model. Based on these gathered statistics, the scaled factors are calculated and are fixed once the model is deployed. This method has the risk of encountering values outside the previously observed ranges at runtime. These values will be clipped, which might lead to accuracy degradation.\n\n\nOnline\n means calculating the min/max values for each tensor dynamically during runtime. In this method clipping cannot occur, however the added computation resources required to calculate the min/max values at runtime might be prohibitive.\n\n\n\n\nIt is important to note, however, that the full float range of an activations tensor usually includes elements which are statistically outliers. 
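The scale-factor mapping just described, and the effect of clipping such outliers, can be illustrated with a few lines of plain PyTorch. This is a generic sketch of symmetric linear quantization, not Distiller's quantizer implementation; the +/- 2.0 saturation value is an arbitrary choice for the example.

```python
import torch

def quantize_symmetric(x, num_bits=8, sat_val=None):
    """Map the float range [-sat_val, sat_val] onto signed num_bits integers and back."""
    qmax = 2 ** (num_bits - 1) - 1                 # 127 for INT8, 7 for INT4
    sat_val = float(sat_val) if sat_val is not None else x.abs().max().item()
    scale = sat_val / qmax                         # per-tensor floating-point scale factor
    q = torch.clamp(torch.round(x / scale), -qmax - 1, qmax)
    return q * scale                               # de-quantized ("simulated") values

torch.manual_seed(0)
t = torch.randn(10000)

full_range = quantize_symmetric(t)                 # scale from the full min/max range
clipped = quantize_symmetric(t, sat_val=2.0)       # clip outliers beyond +/- 2.0
central = t.abs() <= 2.0

# Clipping sacrifices accuracy on the outliers but buys finer resolution for the
# bulk of the distribution:
print('mean error on |x| <= 2: full-range %.4f, clipped %.4f' %
      ((t - full_range)[central].abs().mean().item(),
       (t - clipped)[central].abs().mean().item()))
```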
These values can be discarded by using a narrower min/max range, effectively allowing some clipping to occur in favor of increasing the resolution provided to the part of the distribution containing most of the information. Statistical measures can be used to intelligently select where to clip the original range in order to preserve as much information as possible (\nMigacz, 2017\n). \n\n\nAnother possible optimization point is \nscale-factor scope\n. The most common way is use a single scale-factor per-layer, but it is also possible to calculate a scale-factor per-channel. This can be beneficial if the weight distributions vary greatly between channels.\n\n\n\"Aggressive\" Quantization: INT4 and Lower\n\n\nNaively quantizing a FP32 model to INT4 and lower usually incurs significant accuracy degradation. Many works have tried to mitigate this effect. They usually employ one or more of the following concepts in order to improve model accuracy:\n\n\n\n\nTraining / Re-Training\n: For INT4 and lower, training is required in order to obtain reasonable accuracy. The training loop is modified to take quantization into account. See details in the \nnext section\n.\n\n\nZhou S et al., 2016\n have shown that bootstrapping the quantized model with trained FP32 weights leads to higher accuracy, as opposed to training from scratch. Other methods \nrequire\n a trained FP32 model, either as a starting point (\nZhou A et al., 2017\n), or as a teacher network in a knowledge distillation training setup (\nMishra and Marr, 2018\n).\n\n\nReplacing the activation function\n: The most common activation function in vision models is ReLU, which is unbounded. That is - its dynamic range is not limited for positive inputs. This is very problematic for INT4 and below due to the very limited range and resolution. Therefore, most methods replace ReLU with another function which is bounded. In some cases a clipping function with hard coded values is used (\nZhou S et al., 2016\n, \nMishra et al., 2018\n). Another method learns the clipping value per layer, with better results (\nChoi et al., 2018\n). Once the clipping value is set, the scale factor used for quantization is also set, and no further calibration steps are required (as opposed to INT8 methods described above).\n\n\nModifying network structure\n: \nMishra et al., 2018\n try to compensate for the loss of information due to quantization by using wider layers (more channels). \nLin et al., 2017\n proposed a binary quantization method in which a single FP32 convolution is replaced with multiple binary convolutions, each scaled to represent a different \"base\", covering a larger dynamic range overall.\n\n\nFirst and last layer\n: Many methods do not quantize the first and last layer of the model. It has been observed by \nHan et al., 2015\n that the first convolutional layer is more sensitive to weights pruning, and some quantization works cite the same reason and show it empirically (\nZhou S et al., 2016\n, \nChoi et al., 2018\n). Some works also note that these layers usually constitute a very small portion of the overall computation within the model, further reducing the motivation to quantize them (\nRastegari et al., 2016\n). Most methods keep the first and last layers at FP32. However, \nChoi et al., 2018\n showed that \"conservative\" quantization of these layers, e.g. to INT8, does not reduce accuracy.\n\n\nIterative quantization\n: Most methods quantize the entire model at once. 
\nZhou A et al., 2017\n employ an iterative method, which starts with a trained FP32 baseline, and quantizes only a portion of the model at the time followed by several epochs of re-training to recover the accuracy loss from quantization.\n\n\nMixed Weights and Activations Precision\n: It has been observed that activations are more sensitive to quantization than weights (\nZhou S et al., 2016\n). Hence it is not uncommon to see experiments with activations quantized to a higher precision compared to weights. Some works have focused solely on quantizing weights, keeping the activations at FP32 (\nLi et al., 2016\n, \nZhu et al., 2016\n).\n\n\n\n\nTraining with Quantization\n\n\nAs mentioned above, in order to minimize the loss of accuracy from \"aggressive\" quantization, many methods that target INT4 and lower involve training the model in a way that considers the quantization. This means training with quantization of weights and activations \"baked\" into the training procedure. The training graph usually looks like this:\n\n\n\n\nA full precision copy of the weights is maintained throughout the training process (\"weights_fp\" in the diagram). Its purpose is to accumulate the small changes from the gradients without loss of precision (Note that the quantization of the weights is an integral part of the training graph, meaning that we back-propagate through it as well). Once the model is trained, only the quantized weights are used for inference.\n\nIn the diagram we show \"layer N\" as the conv + batch-norm + activation combination, but the same applies to fully-connected layers, element-wise operations, etc. During training, the operations within \"layer N\" can still run in full precision, with the \"quantize\" operations in the boundaries ensuring discrete-valued weights and activations. This is sometimes called \"simulated quantization\". \n\n\nStraight-Through Estimator\n\n\nAn important question in this context is how to back-propagate through the quantization functions. These functions are discrete-valued, hence their derivative is 0 almost everywhere. So, using their gradients as-is would severly hinder the learning process. An approximation commonly used to overcome this issue is the \"straight-through estimator\" (STE) (\nHinton et al., 2012\n, \nBengio, 2013\n), which simply passes the gradient through these functions as-is. \n\n\nReferences\n\n\n\n\nWilliam Dally\n. High-Performance Hardware for Machine Learning. \nTutorial, NIPS, 2015\n\n\n\n\n\nMohammad Rastegari, Vicente Ordone, Joseph Redmon and Ali Farhadi\n. XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks. \nECCV, 2016\n\n\n\n\n\nMatthieu Courbariaux, Yoshua Bengio and Jean-Pierre David\n. Training deep neural networks with low precision multiplications. \narxiv:1412.7024\n\n\n\n\n\nPhilipp Gysel, Jon Pimentel, Mohammad Motamedi and Soheil Ghiasi\n. Ristretto: A Framework for Empirical Study of Resource-Efficient Inference in Convolutional Neural Networks. \nIEEE Transactions on Neural Networks and Learning Systems, 2018\n\n\n\n\n\nSzymon Migacz\n. 8-bit Inference with TensorRT. \nGTC San Jose, 2017\n\n\n\n\n\nShuchang Zhou, Zekun Ni, Xinyu Zhou, He Wen, Yuxin Wu and Yuheng Zou\n. DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients. \narxiv:1606.06160\n\n\n\n\n\nAojun Zhou, Anbang Yao, Yiwen Guo, Lin Xu and Yurong Chen\n. Incremental Network Quantization: Towards Lossless CNNs with Low-precision Weights. 
\nICLR, 2017\n\n\n\n\n\nAsit Mishra and Debbie Marr\n. Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy. \nICLR, 2018\n\n\n\n\n\nAsit Mishra, Eriko Nurvitadhi, Jeffrey J Cook and Debbie Marr\n. WRPN: Wide Reduced-Precision Networks. \nICLR, 2018\n\n\n\n\n\nJungwook Choi, Zhuo Wang, Swagath Venkataramani, Pierce I-Jen Chuang, Vijayalakshmi Srinivasan and Kailash Gopalakrishnan\n. PACT: Parameterized Clipping Activation for Quantized Neural Networks. \n2018\n\n\n\n\n\nXiaofan Lin, Cong Zhao and Wei Pan\n. Towards Accurate Binary Convolutional Neural Network. \nNIPS, 2017\n\n\n\n\n\nSong Han, Jeff Pool, John Tran and William Dally\n. Learning both Weights and Connections for Efficient Neural Network. \nNIPS, 2015\n\n\n\n\n\nFengfu Li, Bo Zhang and Bin Liu\n. Ternary Weight Networks. \narxiv:1605.04711\n\n\n\n\n\nChenzhuo Zhu, Song Han, Huizi Mao and William J. Dally\n. Trained Ternary Quantization. \narxiv:1612.01064\n\n\n\n\n\nYoshua Bengio, Nicholas Leonard and Aaron Courville\n. Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation. \narxiv:1308.3432, 2013\n\n\n\n\n\nGeoffrey Hinton, Nitish Srivastava, Kevin Swersky, Tijmen Tieleman and Abdelrahman Mohamed\n. Neural Networks for Machine Learning. \nCoursera, video lectures, 2012", + "text": "Quantization\n\n\nQuantization refers to the process of reducing the number of bits that represent a number. In the context of deep learning, the predominant numerical format used for research and for deployment has so far been 32-bit floating point, or FP32. However, the desire for reduced bandwidth and compute requirements of deep learning models has driven research into using lower-precision numerical formats. It has been extensively demonstrated that weights and activations can be represented using 8-bit integers (or INT8) without incurring significant loss in accuracy. The use of even lower bit-widths, such as 4/2/1-bits, is an active field of research that has also shown great progress.\n\n\nNote that this discussion is on quantization only in the context of more efficient inference. Using lower-precision numerics for more efficient training is currently out of scope.\n\n\nMotivation: Overall Efficiency\n\n\nThe more obvious benefit from quantization is \nsignificantly reduced bandwidth and storage\n. For instance, using INT8 for weights and activations consumes 4x less overall bandwidth compared to FP32.\n\nAdditionally integer compute is \nfaster\n than floating point compute. It is also much more \narea and energy efficient\n: \n\n\n\n\n\n\n\n\nINT8 Operation\n\n\nEnergy Saving vs FP32\n\n\nArea Saving vs FP32\n\n\n\n\n\n\n\n\n\n\nAdd\n\n\n30x\n\n\n116x\n\n\n\n\n\n\nMultiply\n\n\n18.5x\n\n\n27x\n\n\n\n\n\n\n\n\n(\nDally, 2015\n)\n\n\nNote that very aggressive quantization can yield even more efficiency. If weights are binary (-1, 1) or ternary (-1, 0, 1 using 2-bits), then convolution and fully-connected layers can be computed with additions and subtractions only, removing multiplications completely. If activations are binary as well, then additions can also be removed, in favor of bitwise operations (\nRastegari et al., 2016\n).\n\n\nInteger vs. FP32\n\n\nThere are two main attributes when discussing a numerical format. The first is \ndynamic range\n, which refers to the range of representable numbers. 
The second one is how many values can be represented within the dynamic range, which in turn determines the \nprecision / resolution\n of the format (the distance between two numbers).\n\nFor all integer formats, the dynamic range is \n[-2^{n-1} .. 2^{n-1}-1]\n, where \nn\n is the number of bits. So for INT8 the range is \n[-128 .. 127]\n, and for INT4 it is \n[-16 .. 15]\n (we're limiting ourselves to signed integers for now). The number of representable values is \n2^n\n.\nContrast that with FP32, where the dynamic range is \n\\pm 3.4\\ x\\ 10^{38}\n, and approximately \n4.2\\ x\\ 10^9\n values can be represented.\n\nWe can immediately see that FP32 is much more \nversatile\n, in that it is able to represent a wide range of distributions accurately. This is a nice property for deep learning models, where the distributions of weights and activations are usually very different (at least in dynamic range). In addition the dynamic range can differ between layers in the model.\n\nIn order to be able to represent these different distributions with an integer format, a \nscale factor\n is used to map the dynamic range of the tensor to the integer format range. But still we remain with the issue of having a significantly lower number of representable values, that is - much lower resolution.\n\nNote that this scale factor is, in most cases, a floating-point number. Hence, even when using integer numerics, some floating-point computations remain. \nCourbariaux et al., 2014\n scale using only shifts, eliminating the floating point operation. In \nGEMMLWOP\n, the FP32 scale factor is approximated using an integer or fixed-point multiplication followed by a shift operation. In many cases the effect of this approximation on accuracy is negligible.\n\n\nAvoiding Overflows\n\n\nConvolution and fully connected layers involve the storing of intermediate results in accumulators. Due to the limited dynamic range of integer formats, if we would use the same bit-width for the weights and activation, \nand\n for the accumulators, we would likely overflow very quickly. Therefore, accumulators are usually implemented with higher bit-widths.\n\nThe result of multiplying two \nn\n-bit integers is, at most, a \n2n\n-bit number. In convolution layers, such multiplications are accumulated \nc\\cdot k^2\n times, where \nc\n is the number of input channels and \nk\n is the kernel width (assuming a square kernel). Hence, to avoid overflowing, the accumulator should be \n2n + M\n-bits wide, where M is at least \nlog_2(c\\cdot k^2)\n. In many cases 32-bit accumulators are used, however for INT4 and lower it might be possible to use less than 32 -bits, depending on the expected use cases and layer widths.\n\n\n\"Conservative\" Quantization: INT8\n\n\nIn many cases, taking a model trained for FP32 and directly quantizing it to INT8, without any re-training, can result in a relatively low loss of accuracy (which may or may not be acceptable, depending on the use case). Some fine-tuning can further improve the accuracy (\nGysel at al., 2018\n).\n\nAs mentioned above, a scale factor is used to adapt the dynamic range of the tensor at hand to that of the integer format. This scale factor needs to be calculated per-layer per-tensor. The simplest way is to map the min/max values of the float tensor to the min/max of the integer format. For weights and biases this is easy, as they are set once training is complete. 
For activations, the min/max float values can be obtained \"online\" during inference, or \"offline\".\n\n\n\n\nOffline\n means gathering activations statistics before deploying the model, either during training or by running a few \"calibration\" batches on the trained FP32 model. Based on these gathered statistics, the scaled factors are calculated and are fixed once the model is deployed. This method has the risk of encountering values outside the previously observed ranges at runtime. These values will be clipped, which might lead to accuracy degradation.\n\n\nOnline\n means calculating the min/max values for each tensor dynamically during runtime. In this method clipping cannot occur, however the added computation resources required to calculate the min/max values at runtime might be prohibitive.\n\n\n\n\nIt is important to note, however, that the full float range of an activations tensor usually includes elements which are statistically outliers. These values can be discarded by using a narrower min/max range, effectively allowing some clipping to occur in favor of increasing the resolution provided to the part of the distribution containing most of the information. Statistical measures can be used to intelligently select where to clip the original range in order to preserve as much information as possible (\nMigacz, 2017\n). \n\n\nAnother possible optimization point is \nscale-factor scope\n. The most common way is use a single scale-factor per-layer, but it is also possible to calculate a scale-factor per-channel. This can be beneficial if the weight distributions vary greatly between channels.\n\n\n\"Aggressive\" Quantization: INT4 and Lower\n\n\nNaively quantizing a FP32 model to INT4 and lower usually incurs significant accuracy degradation. Many works have tried to mitigate this effect. They usually employ one or more of the following concepts in order to improve model accuracy:\n\n\n\n\nTraining / Re-Training\n: For INT4 and lower, training is required in order to obtain reasonable accuracy. The training loop is modified to take quantization into account. See details in the \nnext section\n.\n\n\nZhou S et al., 2016\n have shown that bootstrapping the quantized model with trained FP32 weights leads to higher accuracy, as opposed to training from scratch. Other methods \nrequire\n a trained FP32 model, either as a starting point (\nZhou A et al., 2017\n), or as a teacher network in a knowledge distillation training setup (see \nhere\n).\n\n\nReplacing the activation function\n: The most common activation function in vision models is ReLU, which is unbounded. That is - its dynamic range is not limited for positive inputs. This is very problematic for INT4 and below due to the very limited range and resolution. Therefore, most methods replace ReLU with another function which is bounded. In some cases a clipping function with hard coded values is used (\nZhou S et al., 2016\n, \nMishra et al., 2018\n). Another method learns the clipping value per layer, with better results (\nChoi et al., 2018\n). Once the clipping value is set, the scale factor used for quantization is also set, and no further calibration steps are required (as opposed to INT8 methods described above).\n\n\nModifying network structure\n: \nMishra et al., 2018\n try to compensate for the loss of information due to quantization by using wider layers (more channels). 
\nLin et al., 2017\n proposed a binary quantization method in which a single FP32 convolution is replaced with multiple binary convolutions, each scaled to represent a different \"base\", covering a larger dynamic range overall.\n\n\nFirst and last layer\n: Many methods do not quantize the first and last layer of the model. It has been observed by \nHan et al., 2015\n that the first convolutional layer is more sensitive to weights pruning, and some quantization works cite the same reason and show it empirically (\nZhou S et al., 2016\n, \nChoi et al., 2018\n). Some works also note that these layers usually constitute a very small portion of the overall computation within the model, further reducing the motivation to quantize them (\nRastegari et al., 2016\n). Most methods keep the first and last layers at FP32. However, \nChoi et al., 2018\n showed that \"conservative\" quantization of these layers, e.g. to INT8, does not reduce accuracy.\n\n\nIterative quantization\n: Most methods quantize the entire model at once. \nZhou A et al., 2017\n employ an iterative method, which starts with a trained FP32 baseline, and quantizes only a portion of the model at the time followed by several epochs of re-training to recover the accuracy loss from quantization.\n\n\nMixed Weights and Activations Precision\n: It has been observed that activations are more sensitive to quantization than weights (\nZhou S et al., 2016\n). Hence it is not uncommon to see experiments with activations quantized to a higher precision compared to weights. Some works have focused solely on quantizing weights, keeping the activations at FP32 (\nLi et al., 2016\n, \nZhu et al., 2016\n).\n\n\n\n\nTraining with Quantization\n\n\nAs mentioned above, in order to minimize the loss of accuracy from \"aggressive\" quantization, many methods that target INT4 and lower involve training the model in a way that considers the quantization. This means training with quantization of weights and activations \"baked\" into the training procedure. The training graph usually looks like this:\n\n\n\n\nA full precision copy of the weights is maintained throughout the training process (\"weights_fp\" in the diagram). Its purpose is to accumulate the small changes from the gradients without loss of precision (Note that the quantization of the weights is an integral part of the training graph, meaning that we back-propagate through it as well). Once the model is trained, only the quantized weights are used for inference.\n\nIn the diagram we show \"layer N\" as the conv + batch-norm + activation combination, but the same applies to fully-connected layers, element-wise operations, etc. During training, the operations within \"layer N\" can still run in full precision, with the \"quantize\" operations in the boundaries ensuring discrete-valued weights and activations. This is sometimes called \"simulated quantization\". \n\n\nStraight-Through Estimator\n\n\nAn important question in this context is how to back-propagate through the quantization functions. These functions are discrete-valued, hence their derivative is 0 almost everywhere. So, using their gradients as-is would severly hinder the learning process. An approximation commonly used to overcome this issue is the \"straight-through estimator\" (STE) (\nHinton et al., 2012\n, \nBengio, 2013\n), which simply passes the gradient through these functions as-is. \n\n\nReferences\n\n\n\n\nWilliam Dally\n. High-Performance Hardware for Machine Learning. 
\nTutorial, NIPS, 2015\n\n\n\n\n\nMohammad Rastegari, Vicente Ordone, Joseph Redmon and Ali Farhadi\n. XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks. \nECCV, 2016\n\n\n\n\n\nMatthieu Courbariaux, Yoshua Bengio and Jean-Pierre David\n. Training deep neural networks with low precision multiplications. \narxiv:1412.7024\n\n\n\n\n\nPhilipp Gysel, Jon Pimentel, Mohammad Motamedi and Soheil Ghiasi\n. Ristretto: A Framework for Empirical Study of Resource-Efficient Inference in Convolutional Neural Networks. \nIEEE Transactions on Neural Networks and Learning Systems, 2018\n\n\n\n\n\nSzymon Migacz\n. 8-bit Inference with TensorRT. \nGTC San Jose, 2017\n\n\n\n\n\nShuchang Zhou, Zekun Ni, Xinyu Zhou, He Wen, Yuxin Wu and Yuheng Zou\n. DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients. \narxiv:1606.06160\n\n\n\n\n\nAojun Zhou, Anbang Yao, Yiwen Guo, Lin Xu and Yurong Chen\n. Incremental Network Quantization: Towards Lossless CNNs with Low-precision Weights. \nICLR, 2017\n\n\n\n\n\nAsit Mishra, Eriko Nurvitadhi, Jeffrey J Cook and Debbie Marr\n. WRPN: Wide Reduced-Precision Networks. \nICLR, 2018\n\n\n\n\n\nJungwook Choi, Zhuo Wang, Swagath Venkataramani, Pierce I-Jen Chuang, Vijayalakshmi Srinivasan and Kailash Gopalakrishnan\n. PACT: Parameterized Clipping Activation for Quantized Neural Networks. \n2018\n\n\n\n\n\nXiaofan Lin, Cong Zhao and Wei Pan\n. Towards Accurate Binary Convolutional Neural Network. \nNIPS, 2017\n\n\n\n\n\nSong Han, Jeff Pool, John Tran and William Dally\n. Learning both Weights and Connections for Efficient Neural Network. \nNIPS, 2015\n\n\n\n\n\nFengfu Li, Bo Zhang and Bin Liu\n. Ternary Weight Networks. \narxiv:1605.04711\n\n\n\n\n\nChenzhuo Zhu, Song Han, Huizi Mao and William J. Dally\n. Trained Ternary Quantization. \narxiv:1612.01064\n\n\n\n\n\nYoshua Bengio, Nicholas Leonard and Aaron Courville\n. Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation. \narxiv:1308.3432, 2013\n\n\n\n\n\nGeoffrey Hinton, Nitish Srivastava, Kevin Swersky, Tijmen Tieleman and Abdelrahman Mohamed\n. Neural Networks for Machine Learning. \nCoursera, video lectures, 2012", "title": "Quantization" }, { @@ -282,7 +287,7 @@ }, { "location": "/quantization/index.html#aggressive-quantization-int4-and-lower", - "text": "Naively quantizing a FP32 model to INT4 and lower usually incurs significant accuracy degradation. Many works have tried to mitigate this effect. They usually employ one or more of the following concepts in order to improve model accuracy: Training / Re-Training : For INT4 and lower, training is required in order to obtain reasonable accuracy. The training loop is modified to take quantization into account. See details in the next section . Zhou S et al., 2016 have shown that bootstrapping the quantized model with trained FP32 weights leads to higher accuracy, as opposed to training from scratch. Other methods require a trained FP32 model, either as a starting point ( Zhou A et al., 2017 ), or as a teacher network in a knowledge distillation training setup ( Mishra and Marr, 2018 ). Replacing the activation function : The most common activation function in vision models is ReLU, which is unbounded. That is - its dynamic range is not limited for positive inputs. This is very problematic for INT4 and below due to the very limited range and resolution. Therefore, most methods replace ReLU with another function which is bounded. 
In some cases a clipping function with hard coded values is used ( Zhou S et al., 2016 , Mishra et al., 2018 ). Another method learns the clipping value per layer, with better results ( Choi et al., 2018 ). Once the clipping value is set, the scale factor used for quantization is also set, and no further calibration steps are required (as opposed to INT8 methods described above). Modifying network structure : Mishra et al., 2018 try to compensate for the loss of information due to quantization by using wider layers (more channels). Lin et al., 2017 proposed a binary quantization method in which a single FP32 convolution is replaced with multiple binary convolutions, each scaled to represent a different \"base\", covering a larger dynamic range overall. First and last layer : Many methods do not quantize the first and last layer of the model. It has been observed by Han et al., 2015 that the first convolutional layer is more sensitive to weights pruning, and some quantization works cite the same reason and show it empirically ( Zhou S et al., 2016 , Choi et al., 2018 ). Some works also note that these layers usually constitute a very small portion of the overall computation within the model, further reducing the motivation to quantize them ( Rastegari et al., 2016 ). Most methods keep the first and last layers at FP32. However, Choi et al., 2018 showed that \"conservative\" quantization of these layers, e.g. to INT8, does not reduce accuracy. Iterative quantization : Most methods quantize the entire model at once. Zhou A et al., 2017 employ an iterative method, which starts with a trained FP32 baseline, and quantizes only a portion of the model at the time followed by several epochs of re-training to recover the accuracy loss from quantization. Mixed Weights and Activations Precision : It has been observed that activations are more sensitive to quantization than weights ( Zhou S et al., 2016 ). Hence it is not uncommon to see experiments with activations quantized to a higher precision compared to weights. Some works have focused solely on quantizing weights, keeping the activations at FP32 ( Li et al., 2016 , Zhu et al., 2016 ).", + "text": "Naively quantizing a FP32 model to INT4 and lower usually incurs significant accuracy degradation. Many works have tried to mitigate this effect. They usually employ one or more of the following concepts in order to improve model accuracy: Training / Re-Training : For INT4 and lower, training is required in order to obtain reasonable accuracy. The training loop is modified to take quantization into account. See details in the next section . Zhou S et al., 2016 have shown that bootstrapping the quantized model with trained FP32 weights leads to higher accuracy, as opposed to training from scratch. Other methods require a trained FP32 model, either as a starting point ( Zhou A et al., 2017 ), or as a teacher network in a knowledge distillation training setup (see here ). Replacing the activation function : The most common activation function in vision models is ReLU, which is unbounded. That is - its dynamic range is not limited for positive inputs. This is very problematic for INT4 and below due to the very limited range and resolution. Therefore, most methods replace ReLU with another function which is bounded. In some cases a clipping function with hard coded values is used ( Zhou S et al., 2016 , Mishra et al., 2018 ). Another method learns the clipping value per layer, with better results ( Choi et al., 2018 ). 
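As a rough illustration of the learned-clipping approach, the sketch below implements a bounded activation with a single learnable clipping value per layer, in the spirit of PACT (Choi et al., 2018). It is only a schematic example under the assumptions noted in the comments, not Distiller's (or the paper's) implementation.

```python
import torch
import torch.nn as nn

class LearnedClippedReLU(nn.Module):
    """Bounded activation clamp(x, 0, alpha) with a learnable clipping value alpha."""
    def __init__(self, init_clip_val=6.0):
        super().__init__()
        # One clipping value per layer; per-channel variants are also possible.
        self.clip_val = nn.Parameter(torch.tensor(init_clip_val))

    def forward(self, x):
        # Written as a difference of two ReLUs so that gradients reach clip_val
        # (the derivative w.r.t. clip_val is 1 wherever the input is clipped, 0 elsewhere).
        return torch.clamp(x, min=0.0) - torch.clamp(x - self.clip_val, min=0.0)
```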
Once the clipping value is set, the scale factor used for quantization is also set, and no further calibration steps are required (as opposed to INT8 methods described above). Modifying network structure : Mishra et al., 2018 try to compensate for the loss of information due to quantization by using wider layers (more channels). Lin et al., 2017 proposed a binary quantization method in which a single FP32 convolution is replaced with multiple binary convolutions, each scaled to represent a different \"base\", covering a larger dynamic range overall. First and last layer : Many methods do not quantize the first and last layer of the model. It has been observed by Han et al., 2015 that the first convolutional layer is more sensitive to weights pruning, and some quantization works cite the same reason and show it empirically ( Zhou S et al., 2016 , Choi et al., 2018 ). Some works also note that these layers usually constitute a very small portion of the overall computation within the model, further reducing the motivation to quantize them ( Rastegari et al., 2016 ). Most methods keep the first and last layers at FP32. However, Choi et al., 2018 showed that \"conservative\" quantization of these layers, e.g. to INT8, does not reduce accuracy. Iterative quantization : Most methods quantize the entire model at once. Zhou A et al., 2017 employ an iterative method, which starts with a trained FP32 baseline, and quantizes only a portion of the model at the time followed by several epochs of re-training to recover the accuracy loss from quantization. Mixed Weights and Activations Precision : It has been observed that activations are more sensitive to quantization than weights ( Zhou S et al., 2016 ). Hence it is not uncommon to see experiments with activations quantized to a higher precision compared to weights. Some works have focused solely on quantizing weights, keeping the activations at FP32 ( Li et al., 2016 , Zhu et al., 2016 ).", "title": "\"Aggressive\" Quantization: INT4 and Lower" }, { @@ -297,7 +302,27 @@ }, { "location": "/quantization/index.html#references", - "text": "William Dally . High-Performance Hardware for Machine Learning. Tutorial, NIPS, 2015 Mohammad Rastegari, Vicente Ordone, Joseph Redmon and Ali Farhadi . XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks. ECCV, 2016 Matthieu Courbariaux, Yoshua Bengio and Jean-Pierre David . Training deep neural networks with low precision multiplications. arxiv:1412.7024 Philipp Gysel, Jon Pimentel, Mohammad Motamedi and Soheil Ghiasi . Ristretto: A Framework for Empirical Study of Resource-Efficient Inference in Convolutional Neural Networks. IEEE Transactions on Neural Networks and Learning Systems, 2018 Szymon Migacz . 8-bit Inference with TensorRT. GTC San Jose, 2017 Shuchang Zhou, Zekun Ni, Xinyu Zhou, He Wen, Yuxin Wu and Yuheng Zou . DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients. arxiv:1606.06160 Aojun Zhou, Anbang Yao, Yiwen Guo, Lin Xu and Yurong Chen . Incremental Network Quantization: Towards Lossless CNNs with Low-precision Weights. ICLR, 2017 Asit Mishra and Debbie Marr . Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy. ICLR, 2018 Asit Mishra, Eriko Nurvitadhi, Jeffrey J Cook and Debbie Marr . WRPN: Wide Reduced-Precision Networks. ICLR, 2018 Jungwook Choi, Zhuo Wang, Swagath Venkataramani, Pierce I-Jen Chuang, Vijayalakshmi Srinivasan and Kailash Gopalakrishnan . 
PACT: Parameterized Clipping Activation for Quantized Neural Networks. 2018 Xiaofan Lin, Cong Zhao and Wei Pan . Towards Accurate Binary Convolutional Neural Network. NIPS, 2017 Song Han, Jeff Pool, John Tran and William Dally . Learning both Weights and Connections for Efficient Neural Network. NIPS, 2015 Fengfu Li, Bo Zhang and Bin Liu . Ternary Weight Networks. arxiv:1605.04711 Chenzhuo Zhu, Song Han, Huizi Mao and William J. Dally . Trained Ternary Quantization. arxiv:1612.01064 Yoshua Bengio, Nicholas Leonard and Aaron Courville . Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation. arxiv:1308.3432, 2013 Geoffrey Hinton, Nitish Srivastava, Kevin Swersky, Tijmen Tieleman and Abdelrahman Mohamed . Neural Networks for Machine Learning. Coursera, video lectures, 2012", + "text": "William Dally . High-Performance Hardware for Machine Learning. Tutorial, NIPS, 2015 Mohammad Rastegari, Vicente Ordone, Joseph Redmon and Ali Farhadi . XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks. ECCV, 2016 Matthieu Courbariaux, Yoshua Bengio and Jean-Pierre David . Training deep neural networks with low precision multiplications. arxiv:1412.7024 Philipp Gysel, Jon Pimentel, Mohammad Motamedi and Soheil Ghiasi . Ristretto: A Framework for Empirical Study of Resource-Efficient Inference in Convolutional Neural Networks. IEEE Transactions on Neural Networks and Learning Systems, 2018 Szymon Migacz . 8-bit Inference with TensorRT. GTC San Jose, 2017 Shuchang Zhou, Zekun Ni, Xinyu Zhou, He Wen, Yuxin Wu and Yuheng Zou . DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients. arxiv:1606.06160 Aojun Zhou, Anbang Yao, Yiwen Guo, Lin Xu and Yurong Chen . Incremental Network Quantization: Towards Lossless CNNs with Low-precision Weights. ICLR, 2017 Asit Mishra, Eriko Nurvitadhi, Jeffrey J Cook and Debbie Marr . WRPN: Wide Reduced-Precision Networks. ICLR, 2018 Jungwook Choi, Zhuo Wang, Swagath Venkataramani, Pierce I-Jen Chuang, Vijayalakshmi Srinivasan and Kailash Gopalakrishnan . PACT: Parameterized Clipping Activation for Quantized Neural Networks. 2018 Xiaofan Lin, Cong Zhao and Wei Pan . Towards Accurate Binary Convolutional Neural Network. NIPS, 2017 Song Han, Jeff Pool, John Tran and William Dally . Learning both Weights and Connections for Efficient Neural Network. NIPS, 2015 Fengfu Li, Bo Zhang and Bin Liu . Ternary Weight Networks. arxiv:1605.04711 Chenzhuo Zhu, Song Han, Huizi Mao and William J. Dally . Trained Ternary Quantization. arxiv:1612.01064 Yoshua Bengio, Nicholas Leonard and Aaron Courville . Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation. arxiv:1308.3432, 2013 Geoffrey Hinton, Nitish Srivastava, Kevin Swersky, Tijmen Tieleman and Abdelrahman Mohamed . Neural Networks for Machine Learning. Coursera, video lectures, 2012", + "title": "References" + }, + { + "location": "/knowledge_distillation/index.html", + "text": "Knowledge Distillation\n\n\n(For details on how to train a model with knowledge distillation in Distiller, see \nhere\n)\n\n\nKnowledge distillation is model compression method in which a small model is trained to mimic a pre-trained, larger model (or ensemble of models). 
This training setting is sometimes referred to as \"teacher-student\", where the large model is the teacher and the small model is the student (we'll be using these terms interchangeably).\n\n\nThe method was first proposed by \nBucila et al., 2006\n and generalized by \nHinton et al., 2015\n. The implementation in Distiller is based on the latter publication. Here we'll provide a summary of the method. For more information the reader may refer to the paper (a \nvideo lecture\n with \nslides\n is also available).\n\n\nIn distillation, knowledge is transferred from the teacher model to the student by minimizing a loss function in which the target is the distribution of class probabilities predicted by the teacher model. That is - the output of a softmax function on the teacher model's logits. However, in many cases, this probability distribution has the correct class at a very high probability, with all other class probabilities very close to 0. As such, it doesn't provide much information beyond the ground truth labels already provided in the dataset. To tackle this issue, \nHinton et al., 2015\n introduced the concept of \"softmax temperature\". The probability \np_i\n of class \ni\n is calculated from the logits \nz\n as:\n\n\n\n\np_i = \\frac{exp\\left(\\frac{z_i}{T}\\right)}{\\sum_{j} \\exp\\left(\\frac{z_j}{T}\\right)}\n\n\n\n\nwhere \nT\n is the temperature parameter. When \nT=1\n we get the standard softmax function. As \nT\n grows, the probability distribution generated by the softmax function becomes softer, providing more information as to which classes the teacher found more similar to the predicted class. Hinton calls this the \"dark knowledge\" embedded in the teacher model, and it is this dark knowledge that we are transferring to the student model in the distillation process. When computing the loss function vs. the teacher's soft targets, we use the same value of \nT\n to compute the softmax on the student's logits. We call this loss the \"distillation loss\".\n\n\nHinton et al., 2015\n found that it is also beneficial to train the distilled model to produce the correct labels (based on the ground truth) in addition to the teacher's soft-labels. Hence, we also calculate the \"standard\" loss between the student's predicted class probabilities and the ground-truth labels (also called \"hard labels/targets\"). We dub this loss the \"student loss\". When calculating the class probabilities for the student loss we use \nT = 1\n. \n\n\nThe overall loss function, incorporating both distillation and student losses, is calculated as:\n\n\n\n\n\\mathcal{L}(x;W) = \\alpha * \\mathcal{H}(y, \\sigma(z_s; T=1)) + \\beta * \\mathcal{H}(\\sigma(z_t; T=\\tau), \\sigma(z_s, T=\\tau))\n\n\n\n\nwhere \nx\n is the input, \nW\n are the student model parameters, \ny\n is the ground truth label, \n\\mathcal{H}\n is the cross-entropy loss function, \n\\sigma\n is the softmax function parameterized by the temperature \nT\n, and \n\\alpha\n and \n\\beta\n are coefficients. \nz_s\n and \nz_t\n are the logits of the student and teacher respectively.\n\n\n\n\nNew Hyper-Parameters\n\n\nIn general \n\\tau\n, \n\\alpha\n and \n\\beta\n are hyper parameters.\n\n\nIn their experiments, \nHinton et al., 2015\n use temperature values ranging from 1 to 20. They note that empirically, when the student model is very small compared to the teacher model, lower temperatures work better. 
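A minimal PyTorch sketch of the combined loss described above might look as follows. It is illustrative only and does not reproduce Distiller's KnowledgeDistillationPolicy; the default temperature and weight values are placeholders.

```python
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.1, beta=0.9):
    # "Student loss": standard cross-entropy against the hard labels (computed with T = 1).
    student_loss = F.cross_entropy(student_logits, labels)
    # "Distillation loss": match the teacher's temperature-softened distribution.
    # KL divergence is used here instead of cross-entropy; the two differ only by the
    # teacher's entropy, which is constant with respect to the student.
    log_p_student = F.log_softmax(student_logits / T, dim=1)
    p_teacher = F.softmax(teacher_logits / T, dim=1)
    distill_loss = F.kl_div(log_p_student, p_teacher, reduction='batchmean')
    # Hinton et al., 2015 also suggest scaling the soft term by T^2 so that its
    # gradient magnitude stays comparable as the temperature changes.
    return alpha * student_loss + beta * (T ** 2) * distill_loss

# Typical use with a frozen teacher:
#   with torch.no_grad():
#       teacher_logits = teacher(inputs)
#   loss = distillation_loss(student(inputs), teacher_logits, labels)
```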
This makes sense if we consider that as we raise the temperature, the resulting soft-labels distribution becomes richer in information, and a very small model might not be able to capture all of this information. However, there's no clear way to predict up front what kind of capacity for information the student model will have.\n\n\nWith regards to \n\\alpha\n and \n\\beta\n, \nHinton et al., 2015\n use a weighted average between the distillation loss and the student loss. That is, \n\\beta = 1 - \\alpha\n. They note that in general, they obtained the best results when setting \n\\alpha\n to be much smaller than \n\\beta\n (although in one of their experiments they use \n\\alpha = \\beta = 0.5\n). Other works which utilize knowledge distillation don't use a weighted average. Some set \n\\alpha = 1\n while leaving \n\\beta\n tunable, while others don't set any constraints.\n\n\nCombining with Other Model Compression Techniques\n\n\nIn the \"basic\" scenario, the smaller (student) model is a pre-defined architecture which just has a smaller number of parameters compared to the teacher model. For example, we could train ResNet-18 by distilling knowledge from ResNet-34. But, a model with smaller capacity can also be obtained by other model compression techniques - sparsification and/or quantization. So, for example, we could train a 4-bit ResNet-18 model with some method using quantization-aware training, and use a distillation loss function as described above. In that case, the teacher model can even be a FP32 ResNet-18 model. Same goes for pruning and regularization.\n\n\nTann et al., 2017\n, \nMishra and Marr, 2018\n and \nPolino et al., 2018\n are some works that combine knowledge distillation with \nquantization\n. \nTheis et al., 2018\n and \nAshok et al., 2018\n combine distillation with \npruning\n.\n\n\nReferences\n\n\n\n\nCristian Bucila, Rich Caruana, and Alexandru Niculescu-Mizil\n. Model Compression. \nKDD, 2006\n\n\n\n\n\nGeoffrey Hinton, Oriol Vinyals and Jeff Dean\n. Distilling the Knowledge in a Neural Network. \narxiv:1503.02531\n\n\n\n\n\nHokchhay Tann, Soheil Hashemi, Iris Bahar and Sherief Reda\n. Hardware-Software Codesign of Accurate, Multiplier-free Deep Neural Networks. \nDAC, 2017\n\n\n\n\n\nAsit Mishra and Debbie Marr\n. Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy. \nICLR, 2018\n\n\n\n\n\nAntonio Polino, Razvan Pascanu and Dan Alistarh\n. Model compression via distillation and quantization. \nICLR, 2018\n\n\n\n\n\nAnubhav Ashok, Nicholas Rhinehart, Fares Beainy and Kris M. Kitani\n. N2N learning: Network to Network Compression via Policy Gradient Reinforcement Learning. \nICLR, 2018\n\n\n\n\n\nLucas Theis, Iryna Korshunova, Alykhan Tejani and Ferenc Husz\u00e1r\n. Faster gaze prediction with dense networks and Fisher pruning. \narxiv:1801.05787", + "title": "Knowledge Distillation" + }, + { + "location": "/knowledge_distillation/index.html#knowledge-distillation", + "text": "(For details on how to train a model with knowledge distillation in Distiller, see here ) Knowledge distillation is model compression method in which a small model is trained to mimic a pre-trained, larger model (or ensemble of models). This training setting is sometimes referred to as \"teacher-student\", where the large model is the teacher and the small model is the student (we'll be using these terms interchangeably). The method was first proposed by Bucila et al., 2006 and generalized by Hinton et al., 2015 . 
The implementation in Distiller is based on the latter publication. Here we'll provide a summary of the method. For more information the reader may refer to the paper (a video lecture with slides is also available). In distillation, knowledge is transferred from the teacher model to the student by minimizing a loss function in which the target is the distribution of class probabilities predicted by the teacher model. That is - the output of a softmax function on the teacher model's logits. However, in many cases, this probability distribution has the correct class at a very high probability, with all other class probabilities very close to 0. As such, it doesn't provide much information beyond the ground truth labels already provided in the dataset. To tackle this issue, Hinton et al., 2015 introduced the concept of \"softmax temperature\". The probability p_i of class i is calculated from the logits z as: p_i = \\frac{exp\\left(\\frac{z_i}{T}\\right)}{\\sum_{j} \\exp\\left(\\frac{z_j}{T}\\right)} where T is the temperature parameter. When T=1 we get the standard softmax function. As T grows, the probability distribution generated by the softmax function becomes softer, providing more information as to which classes the teacher found more similar to the predicted class. Hinton calls this the \"dark knowledge\" embedded in the teacher model, and it is this dark knowledge that we are transferring to the student model in the distillation process. When computing the loss function vs. the teacher's soft targets, we use the same value of T to compute the softmax on the student's logits. We call this loss the \"distillation loss\". Hinton et al., 2015 found that it is also beneficial to train the distilled model to produce the correct labels (based on the ground truth) in addition to the teacher's soft-labels. Hence, we also calculate the \"standard\" loss between the student's predicted class probabilities and the ground-truth labels (also called \"hard labels/targets\"). We dub this loss the \"student loss\". When calculating the class probabilities for the student loss we use T = 1 . The overall loss function, incorporating both distillation and student losses, is calculated as: \\mathcal{L}(x;W) = \\alpha * \\mathcal{H}(y, \\sigma(z_s; T=1)) + \\beta * \\mathcal{H}(\\sigma(z_t; T=\\tau), \\sigma(z_s, T=\\tau)) where x is the input, W are the student model parameters, y is the ground truth label, \\mathcal{H} is the cross-entropy loss function, \\sigma is the softmax function parameterized by the temperature T , and \\alpha and \\beta are coefficients. z_s and z_t are the logits of the student and teacher respectively.", + "title": "Knowledge Distillation" + }, + { + "location": "/knowledge_distillation/index.html#new-hyper-parameters", + "text": "In general \\tau , \\alpha and \\beta are hyper parameters. In their experiments, Hinton et al., 2015 use temperature values ranging from 1 to 20. They note that empirically, when the student model is very small compared to the teacher model, lower temperatures work better. This makes sense if we consider that as we raise the temperature, the resulting soft-labels distribution becomes richer in information, and a very small model might not be able to capture all of this information. However, there's no clear way to predict up front what kind of capacity for information the student model will have. With regards to \\alpha and \\beta , Hinton et al., 2015 use a weighted average between the distillation loss and the student loss. 
That is, \\beta = 1 - \\alpha . They note that in general, they obtained the best results when setting \\alpha to be much smaller than \\beta (although in one of their experiments they use \\alpha = \\beta = 0.5 ). Other works which utilize knowledge distillation don't use a weighted average. Some set \\alpha = 1 while leaving \\beta tunable, while others don't set any constraints.", + "title": "New Hyper-Parameters" + }, + { + "location": "/knowledge_distillation/index.html#references", + "text": "Cristian Bucila, Rich Caruana, and Alexandru Niculescu-Mizil . Model Compression. KDD, 2006 Geoffrey Hinton, Oriol Vinyals and Jeff Dean . Distilling the Knowledge in a Neural Network. arxiv:1503.02531 Hokchhay Tann, Soheil Hashemi, Iris Bahar and Sherief Reda . Hardware-Software Codesign of Accurate, Multiplier-free Deep Neural Networks. DAC, 2017 Asit Mishra and Debbie Marr . Apprentice: Using Knowledge Distillation Techniques To Improve Low-Precision Network Accuracy. ICLR, 2018 Antonio Polino, Razvan Pascanu and Dan Alistarh . Model compression via distillation and quantization. ICLR, 2018 Anubhav Ashok, Nicholas Rhinehart, Fares Beainy and Kris M. Kitani . N2N learning: Network to Network Compression via Policy Gradient Reinforcement Learning. ICLR, 2018 Lucas Theis, Iryna Korshunova, Alykhan Tejani and Ferenc Husz\u00e1r . Faster gaze prediction with dense networks and Fisher pruning. arxiv:1801.05787", "title": "References" }, { diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 84165849ca515ac50dfdeb5582f468852bbf2e03..209af070d2774a7cbf4ed3ae812118e93cfd5182 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ <url> <loc>/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> @@ -12,7 +12,7 @@ <url> <loc>/install/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> @@ -20,7 +20,7 @@ <url> <loc>/usage/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> @@ -28,7 +28,7 @@ <url> <loc>/schedule/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> @@ -37,19 +37,25 @@ <url> <loc>/pruning/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> <url> <loc>/regularization/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> <url> <loc>/quantization/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> + <changefreq>daily</changefreq> + </url> + + <url> + <loc>/knowledge_distillation/index.html</loc> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> @@ -59,13 +65,13 @@ <url> <loc>/algo_pruning/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> <url> <loc>/algo_quantization/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> @@ -74,7 +80,7 @@ <url> <loc>/model_zoo/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> @@ -82,7 +88,7 @@ <url> <loc>/jupyter/index.html</loc> - <lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> @@ -90,7 +96,7 @@ <url> <loc>/design/index.html</loc> - 
<lastmod>2018-07-22</lastmod> + <lastmod>2018-09-04</lastmod> <changefreq>daily</changefreq> </url> diff --git a/docs/usage/index.html b/docs/usage/index.html index 87a0ad604b0559ed4dbecca594835db088e27e87..edb0b610a47cc584df7218073cc7028e5358a10d 100644 --- a/docs/usage/index.html +++ b/docs/usage/index.html @@ -116,6 +116,10 @@ <a class="" href="../quantization/index.html">Quantization</a> </li> + <li class=""> + + <a class="" href="../knowledge_distillation/index.html">Knowledge Distillation</a> + </li> </ul> </li> diff --git a/examples/classifier_compression/compress_classifier.py b/examples/classifier_compression/compress_classifier.py index 4ffbba3d26c38c86e5a89e25c9605f2e30eaed86..e95d92f27ecb65f7912963007c2cb76e58eb7311 100755 --- a/examples/classifier_compression/compress_classifier.py +++ b/examples/classifier_compression/compress_classifier.py @@ -151,6 +151,7 @@ parser.add_argument('--earlyexit_lossweights', type=float, nargs='*', dest='earl parser.add_argument('--earlyexit_thresholds', type=float, nargs='*', dest='earlyexit_thresholds', default=None, help='List of EarlyExit thresholds (e.g. --earlyexit 1.2 0.9)') +distiller.knowledge_distillation.add_distillation_args(parser, ALL_MODEL_NAMES, True) def check_pytorch_version(): if torch.__version__ < '0.4.0': @@ -180,6 +181,7 @@ def main(): start_epoch = 0 best_top1 = 0 + best_epoch = 0 if args.deterministic: # Experiment reproducibility is sometimes important. Pete Warden expounded about this @@ -284,6 +286,25 @@ def main(): compression_scheduler = distiller.file_config(model, optimizer, args.compress) # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer) model.cuda() + else: + compression_scheduler = distiller.CompressionScheduler(model) + + args.kd_policy = None + if args.kd_teacher: + teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus) + if args.kd_resume: + teacher, _, _ = apputils.load_checkpoint(teacher, chkpt_file=args.kd_resume) + dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt) + args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw) + compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch, ending_epoch=args.epochs, + frequency=1) + + msglogger.info('\nStudent-Teacher knowledge distillation enabled:') + msglogger.info('\tTeacher Model: %s', args.kd_teacher) + msglogger.info('\tTemperature: %s', args.kd_temp) + msglogger.info('\tLoss Weights (distillation | student | teacher): %s', + ' | '.join(['{:.2f}'.format(val) for val in dlw])) + msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch) for epoch in range(start_epoch, start_epoch + args.epochs): # This is the main training loop. 
@@ -316,7 +337,7 @@ def main(): if is_best: best_epoch = epoch best_top1 = top1 - msglogger.info('==> Best validation Top1: %.3f Epoch: %d', best_top1, best_epoch) + msglogger.info('==> Best Top1: %.3f On Epoch: %d\n', best_top1, best_epoch) apputils.save_checkpoint(epoch, args.arch, model, optimizer, compression_scheduler, best_top1, is_best, args.name, msglogger.logdir) @@ -366,7 +387,11 @@ def train(train_loader, model, criterion, optimizer, epoch, if compression_scheduler: compression_scheduler.on_minibatch_begin(epoch, train_step, steps_per_epoch, optimizer) - output = model(input_var) + if args.kd_policy is None: + output = model(input_var) + else: + output = args.kd_policy.forward(input_var) + if not args.earlyexit_lossweights: loss = criterion(output, target_var) # Measure accuracy and record loss @@ -499,9 +524,9 @@ def _validate(data_loader, model, criterion, loggers, args, epoch=-1): if steps_completed % args.print_freq == 0: if not args.earlyexit_thresholds: stats = ('', - OrderedDict([('Loss', losses['objective_loss'].mean), - ('Top1', classerr.value(1)), - ('Top5', classerr.value(5))])) + OrderedDict([('Loss', losses['objective_loss'].mean), + ('Top1', classerr.value(1)), + ('Top5', classerr.value(5))])) else: stats_dict = OrderedDict() stats_dict['Test'] = validation_step @@ -525,7 +550,7 @@ def _validate(data_loader, model, criterion, loggers, args, epoch=-1): classerr.value()[0], classerr.value()[1], losses['objective_loss'].mean) if args.display_confusion: - msglogger.info('==> Confusion:\n%s', str(confusion.value())) + msglogger.info('==> Confusion:\n%s\n', str(confusion.value())) return classerr.value(1), classerr.value(5), losses['objective_loss'].mean else: # Print some interesting summary stats for number of data points that could exit early diff --git a/models/__init__.py b/models/__init__.py index b34d7a29399cc6f08a649ead808f8a4624e79d24..91f8cec936dd8e634b881dd2650a6ab1fec4fdee 100755 --- a/models/__init__.py +++ b/models/__init__.py @@ -54,15 +54,17 @@ def create_model(pretrained, dataset, arch, parallel=True, device_ids=None): if dataset == 'imagenet': str_pretrained = 'pretrained ' if pretrained else '' msglogger.info("=> using %s%s model for ImageNet" % (str_pretrained, arch)) - assert arch in torch_models.__dict__ or arch in imagenet_extra_models.__dict__, "Model %s is not supported for dataset %s" % (arch, 'ImageNet') + assert arch in torch_models.__dict__ or arch in imagenet_extra_models.__dict__, \ + "Model %s is not supported for dataset %s" % (arch, 'ImageNet') if arch in torch_models.__dict__: model = torch_models.__dict__[arch](pretrained=pretrained) else: + assert not pretrained, "Model %s (ImageNet) does not have a pretrained model" % arch model = imagenet_extra_models.__dict__[arch]() elif dataset == 'cifar10': msglogger.info("=> creating %s model for CIFAR10" % arch) - assert not pretrained, "Model %s (CIFAR10) does not have a pretrained model" % arch assert arch in cifar10_models.__dict__, "Model %s is not supported for dataset CIFAR10" % arch + assert not pretrained, "Model %s (CIFAR10) does not have a pretrained model" % arch model = cifar10_models.__dict__[arch]() else: print("FATAL ERROR: create_model does not support models for dataset %s" % dataset) diff --git a/models/cifar10/preresnet_cifar.py b/models/cifar10/preresnet_cifar.py index f8aa50c3f53ac3191e317101246385aa5cfff313..9a8b2e9e884e70ca462e1a24e25f3d85a23ccf44 100644 --- a/models/cifar10/preresnet_cifar.py +++ b/models/cifar10/preresnet_cifar.py @@ -188,6 +188,11 @@ def 
preact_resnet110_cifar(**kwargs): return model +def preact_resnet182_cifar(**kwargs): + model = PreactResNetCifar(PreactBasicBlock, [30, 30, 30], **kwargs) + return model + + def preact_resnet20_cifar_conv_ds(**kwargs): return preact_resnet20_cifar(conv_downsample=True) @@ -206,3 +211,7 @@ def preact_resnet56_cifar_conv_ds(**kwargs): def preact_resnet110_cifar_conv_ds(**kwargs): return preact_resnet110_cifar(conv_downsample=True) + + +def preact_resnet182_cifar_conv_ds(**kwargs): + return preact_resnet182_cifar(conv_downsample=True)
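Pulling the pieces of this patch together, the following condensed sketch shows how the policy added to compress_classifier.py above is typically wired up. It is simplified from the diff: the architectures, epoch counts, checkpoint path and loss weights are placeholder values, and the import paths assume the script runs from the repository root the way compress_classifier.py does.

```python
import distiller
import apputils
from models import create_model

# Student: the model being trained/compressed. Teacher: a larger, already-trained model.
student = create_model(False, 'cifar10', 'preact_resnet20_cifar')
teacher = create_model(False, 'cifar10', 'preact_resnet182_cifar')
teacher, _, _ = apputils.load_checkpoint(teacher, chkpt_file='path/to/teacher_best.pth.tar')

# Loss weights in (distillation, student, teacher) order, then the distillation policy itself.
dlw = distiller.DistillationLossWeights(0.7, 0.3, 0.0)
kd_policy = distiller.KnowledgeDistillationPolicy(student, teacher, 4.0, dlw)

# The policy is scheduled like any other Distiller policy.
compression_scheduler = distiller.CompressionScheduler(student)
compression_scheduler.add_policy(kd_policy, starting_epoch=0, ending_epoch=180, frequency=1)

# Inside the training loop, the forward pass goes through the policy rather than
# calling the student directly, so that the teacher runs alongside it:
#     output = kd_policy.forward(input_var)
```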