diff --git a/apputils/model_summaries.py b/apputils/model_summaries.py
index 3d0cedc5293242882126fe6dbff0163d509a5cd5..6c2cda35e3f45b720eb8ffaa33e4354354a5e1b2 100755
--- a/apputils/model_summaries.py
+++ b/apputils/model_summaries.py
@@ -606,30 +606,26 @@ def dataset_dummy_input(dataset):
     return dummy_input
 
 
-def export_img_classifier_to_onnx(model, onnx_fname, dataset):
+def export_img_classifier_to_onnx(model, onnx_fname, dataset, export_params=True, add_softmax=True):
     """Export a PyTorch image classifier to ONNX.
     """
     dummy_input = dataset_dummy_input(dataset).to('cuda')
+    # PyTorch 0.4 doesn't support exporting modules wrapped in DataParallel
+    model = distiller.make_non_parallel_copy(model)
     with torch.onnx.set_training(model, False):
-        # Pytorch 0.4 doesn't support exporting modules wrapped in DataParallel
-        if isinstance(model, torch.nn.DataParallel):
-            model = model.module
-
-        # Explicitly add a softmax layer, because it is needed for the ONNX inference phase.
-        # We make a copy of the model, since we are about to change it (adding softmax).
-        model = deepcopy(model)
-        model.original_forward = model.forward
-        softmax = torch.nn.Softmax(dim=1)
-        model.forward = lambda input: softmax(model.original_forward(input))
-
-        torch.onnx.export(model, dummy_input, onnx_fname, verbose=False, export_params=True)
+        if add_softmax:
+            # Explicitly add a softmax layer, because it is needed for the ONNX inference phase.
+            model.original_forward = model.forward
+            softmax = torch.nn.Softmax(dim=1)
+            model.forward = lambda input: softmax(model.original_forward(input))
+        torch.onnx.export(model, dummy_input, onnx_fname, verbose=False, export_params=export_params)
     msglogger.info('Exported the model to ONNX format at %s' % os.path.realpath(onnx_fname))
 
 
-
 def data_node_has_parent(g, id):
     for edge in g.edges:
-        if edge.dst == id: return True
+        if edge.dst == id:
+            return True
     return False
 
 
diff --git a/distiller/utils.py b/distiller/utils.py
index 13d1f4aad0c2431339a0d2caa7370479ca48a00e..a193dfcdda35a9139f5c7a8b9d15140134debe84 100755
--- a/distiller/utils.py
+++ b/distiller/utils.py
@@ -50,6 +50,38 @@ def pretty_int(i):
     return "{:,}".format(i)
 
 
+def assign_layer_fq_names(container, name=None):
+    """Assign human-readable, fully-qualified names to the modules (layers).
+
+    Sometimes we need to access modules by their names, and we'd like to use
+    fully-qualified names for convenience.
+    """
+    is_leaf = True
+    for key, module in container._modules.items():
+        is_leaf = False
+        assign_layer_fq_names(module, ".".join([name, key]) if name is not None else key)
+    if is_leaf:
+        container.distiller_name = name
+
+
+def find_module_by_fq_name(model, fq_mod_name):
+    """Given a module's fully-qualified name, find the module in the provided model.
+
+    A fully-qualified name is assigned to modules in function assign_layer_fq_names.
+
+    Arguments:
+        model: the model to search
+        fq_mod_name: the fully-qualified name of the module we want to find
+
+    Returns:
+        The module, or None if the module was not found.
+    """
+    for module in model.modules():
+        if hasattr(module, 'distiller_name') and fq_mod_name == module.distiller_name:
+            return module
+    return None
+
+
 def normalize_module_name(layer_name):
     """Normalize a module's name.
 
@@ -94,7 +126,6 @@ def density(tensor):
     Returns:
         density (float)
     """
-    assert torch.numel(tensor) > 0
     nonzero = torch.nonzero(tensor)
     if nonzero.dim() == 0:
         return 0.0
@@ -193,34 +224,60 @@ def density_ch(tensor):
     return 1 - sparsity_ch(tensor)
 
 
-def sparsity_cols(tensor):
-    """Column-wise sparsity for 2D tensors"""
+def sparsity_matrix(tensor, dim):
+    """Generic sparsity computation for 2D matrices"""
     if tensor.dim() != 2:
         return 0
 
-    num_cols = tensor.size()[1]
-    nonzero_cols = len(torch.nonzero(tensor.abs().sum(dim=0)))
-    return 1 - nonzero_cols/num_cols
+    num_structs = tensor.size()[dim]
+    nonzero_structs = len(torch.nonzero(tensor.abs().sum(dim=1-dim)))
+    return 1 - nonzero_structs/num_structs
 
 
-def density_cols(tensor):
+def sparsity_cols(tensor, transposed=True):
+    """Column-wise sparsity for 2D tensors
+
+    PyTorch GEMM matrices are transposed before they are used in the GEMM operation.
+    In other words, the matrices are stored in memory transposed, so by default we
+    compute the sparsity of the transposed dimension.
+    """
+    if transposed:
+        return sparsity_matrix(tensor, 0)
+    return sparsity_matrix(tensor, 1)
+
+
+def density_cols(tensor, transposed=True):
     """Column-wise density for 2D tensors"""
-    return 1 - sparsity_cols(tensor)
+    return 1 - sparsity_cols(tensor, transposed)
 
 
-def sparsity_rows(tensor):
-    """Row-wise sparsity for 2D matrices"""
-    if tensor.dim() != 2:
-        return 0
+def sparsity_rows(tensor, transposed=True):
+    """Row-wise sparsity for 2D matrices
 
-    num_rows = tensor.size()[0]
-    nonzero_rows = len(torch.nonzero(tensor.abs().sum(dim=1)))
-    return 1 - nonzero_rows/num_rows
+    PyTorch GEMM matrices are transposed before they are used in the GEMM operation.
+    In other words, the matrices are stored in memory transposed, so by default we
+    compute the sparsity of the transposed dimension.
+    """
+    if transposed:
+        return sparsity_matrix(tensor, 1)
+    return sparsity_matrix(tensor, 0)
 
 
-def density_rows(tensor):
+def density_rows(tensor, transposed=True):
     """Row-wise density for 2D tensors"""
-    return 1 - sparsity_rows(tensor)
+    return 1 - sparsity_rows(tensor, transposed)
+
+
+def norm_filters(weights, p=1):
+    """Compute the p-norm of convolution filters.
+
+    Args:
+        weights - a 4D convolution weights tensor.
+                  Has shape = (#filters, #channels, k_w, k_h)
+        p - the exponent value in the norm formulation
+    """
+    assert weights.dim() == 4
+    return weights.view(weights.size(0), -1).norm(p=p, dim=1)
 
 
 def model_numel(model, param_dims=[2, 4]):
@@ -233,6 +290,70 @@ def model_numel(model, param_dims=[2, 4]):
     return total_numel
 
 
+def activation_channels_l1(activation):
+    """Calculate the L1-norms of an activation's channels.
+
+    The activation usually has the shape: (batch_size, num_channels, h, w).
+
+    When the activations are computed on a distributed GPU system, different parts of the
+    activation tensor might be computed by a different GPU. If this function is called from
+    the forward-callback of some activation module in the graph, we will only witness part
+    of the batch. For example, if the batch_size is 256 and we are using 4 GPUs, instead
+    of seeing activations with shape = (256, num_channels, h, w), we may see 4 calls with
+    shape = (64, num_channels, h, w).
+
+    Since we want to calculate the average of the L1-norm of each of the channels of the
+    activation, we need to move the partial results to the CPU, where they will be
+    added together.
+
+    Returns - for each channel: the batch-mean of its L1 magnitudes (i.e. over all of the
+    activations in the mini-batch, compute the mean of the L1 magnitude of each channel).
+    """
+    view_2d = activation.view(-1, activation.size(2) * activation.size(3))  # (batch*channel) x (h*w)
+    featuremap_norms = view_2d.norm(p=1, dim=1)
+    featuremap_norms_mat = featuremap_norms.view(activation.size(0), activation.size(1))  # batch x channel
+    # We need to move the results back to the CPU
+    return featuremap_norms_mat.mean(dim=0).cpu()
+
+
+def activation_channels_means(activation):
+    """Calculate the mean of each of an activation's channels.
+
+    The activation usually has the shape: (batch_size, num_channels, h, w).
+
+    "We first use global average pooling to convert the output of layer i, which is a
+    c x h x w tensor, into a 1 x c vector."
+
+    Returns - for each channel: the batch-mean of its mean values (i.e. over all of the
+    activations in the mini-batch, compute the mean of the mean value of each channel).
+    """
+    view_2d = activation.view(-1, activation.size(2) * activation.size(3))  # (batch*channel) x (h*w)
+    featuremap_means = view_2d.mean(dim=1)  # global average pooling of each (batch, channel) feature-map
+    featuremap_means_mat = featuremap_means.view(activation.size(0), activation.size(1))  # batch x channel
+    # We need to move the results back to the CPU
+    return featuremap_means_mat.mean(dim=0).cpu()
+
+
+def activation_channels_apoz(activation):
+    """Calculate the APoZ of each of an activation's channels.
+
+    APoZ is the Average Percentage of Zeros (or simply: average sparsity) and is defined in:
+    "Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures".
+
+    The activation usually has the shape: (batch_size, num_channels, h, w).
+
+    "We first use global average pooling to convert the output of layer i, which is a
+    c x h x w tensor, into a 1 x c vector."
+
+    Returns - for each channel: the batch-mean of its sparsity.
+    """
+    view_2d = activation.view(-1, activation.size(2) * activation.size(3))  # (batch*channel) x (h*w)
+    featuremap_apoz = view_2d.eq(0).float().mean(dim=1)  # fraction of zeros in each (batch, channel) feature-map
+    featuremap_apoz_mat = featuremap_apoz.view(activation.size(0), activation.size(1))  # batch x channel
+    # We need to move the results back to the CPU
+    return featuremap_apoz_mat.mean(dim=0).cpu()
+
+
 def log_training_progress(stats_dict, params_dict, epoch, steps_completed, total_steps, log_freq, loggers):
     """Log information about the training progress, and the distribution of the weight tensors.
 
@@ -258,10 +379,12 @@ def log_training_progress(stats_dict, params_dict, epoch, steps_completed, total
         logger.log_weights_distribution(params_dict, steps_completed)
 
 
-def log_activation_sparsity(epoch, loggers, collector):
+def log_activation_statsitics(epoch, phase, loggers, collector):
     """Log information about the sparsity of the activations"""
+    if collector is None:
+        return
     for logger in loggers:
-        logger.log_activation_sparsity(collector.value(), epoch)
+        logger.log_activation_statsitic(phase, collector.stat_name, collector.value(), epoch)
 
 
 def log_weights_sparsity(model, epoch, loggers):
@@ -298,24 +421,19 @@ class DoNothingModuleWrapper(nn.Module):
 def make_non_parallel_copy(model):
     """Make a non-data-parallel copy of the provided model.
 
-    nn.DataParallel instances are replaced by DoNothingModuleWrapper
-    instances.
+    torch.nn.DataParallel instances are removed.
""" - def replace_data_parallel(container, prefix=''): + def replace_data_parallel(container): for name, module in container.named_children(): - full_name = prefix + name if isinstance(module, nn.DataParallel): - # msglogger.debug('Replacing module {}'.format(full_name)) - setattr(container, name, DoNothingModuleWrapper(module.module)) + setattr(container, name, module.module) if has_children(module): - # For a container we call recursively - replace_data_parallel(module, full_name + '.') + replace_data_parallel(module) # Make a copy of the model, because we're going to change it new_model = deepcopy(model) if isinstance(new_model, nn.DataParallel): - # new_model = new_model.module # - new_model = DoNothingModuleWrapper(new_model.module) - + new_model = new_model.module replace_data_parallel(new_model) + return new_model