Reposted from: https://blog.csdn.net/u014114990/article/details/90665141
1. Why merge the BN layer
When training a deep network, a BN (Batch Normalization) layer speeds up convergence and helps control overfitting; it is usually placed right after a convolution layer. By normalizing the data, BN also helps mitigate vanishing and exploding gradients. However, while BN is useful during training, it adds extra layer computations to the forward pass at inference time, hurting performance and consuming additional memory or GPU memory. Since many modern architectures (ResNet, MobileNet, Xception, ShuffleNet, etc.) use BN, it is worthwhile to fold the BN parameters into the preceding convolution layer to speed up forward inference.
2. The math behind merging a BN layer into a convolution layer
In the convolution layer, each output channel is an affine function of its input, y = Wx + b. At inference time the BatchNorm layer (together with the Scale layer that Caffe pairs with it) applies another per-channel affine transform, so the two can be folded into a single convolution with adjusted weights and bias, as sketched below.
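A short sketch of the derivation in the usual notation: μ and σ² denote the BN running mean and variance, ε the small stabilizing constant, and γ, β the weight and bias of the Scale layer.

```latex
\[
\begin{aligned}
\text{Convolution: } & y = Wx + b \\
\text{BN + Scale (inference): } & \hat{y} = \gamma\,\frac{y-\mu}{\sqrt{\sigma^{2}+\varepsilon}} + \beta \\
\text{Merged convolution: } & \hat{y} = W'x + b', \qquad
   W' = \frac{\gamma}{\sqrt{\sigma^{2}+\varepsilon}}\,W, \qquad
   b' = \frac{\gamma}{\sqrt{\sigma^{2}+\varepsilon}}\,\bigl(b-\mu\bigr) + \beta
\end{aligned}
\]
```

This is the computation performed by `merge_conv_and_bn` in the script of section 4: `alpha` there corresponds to γ/√(σ²+ε), and because Caffe's BatchNorm layer stores the accumulated mean, accumulated variance, and a normalization factor in its three parameter blobs, the mean and variance are first divided by that factor (`num_bn_samples`) before being used.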
3. Experimental results
Machine: GTX 1080Ti GPU, i7 CPU
The experiment compares a ResNet-50 model before and after merging its BN layers: classification accuracy is unchanged, while forward speed improves noticeably.
| Model | CPU forward time | GPU forward time |
| --- | --- | --- |
| ResNet-50 (before merging) | 176.17 ms | 11.03 ms |
| ResNet-50 (after merging) | 161.69 ms | 7.3 ms |
| Improvement | 10% | 51% |
4. Python script for the merge
The script requires Caffe's Python interface (pycaffe). Run it with `--model` pointing at the deploy prototxt and `--weights` at the caffemodel; by default it writes the merged network to `*_inference.prototxt` and `*_inference.caffemodel` next to the inputs.
```python
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import numpy as np
import sys
import os
import os.path as osp
import google.protobuf as pb
import google.protobuf.text_format
from argparse import ArgumentParser
import caffe

caffe.set_mode_cpu()


def load_and_fill_biases(src_model, src_weights, dst_model, dst_weights):
    with open(src_model) as f:
        model = caffe.proto.caffe_pb2.NetParameter()
        pb.text_format.Merge(f.read(), model)

    for i, layer in enumerate(model.layer):
        if layer.type == 'Convolution':  # or layer.type == 'Scale':
            # Add bias layer if needed
            if layer.convolution_param.bias_term == False:
                layer.convolution_param.bias_term = True
                layer.convolution_param.bias_filler.type = 'constant'
                layer.convolution_param.bias_filler.value = 0.0

    with open(dst_model, 'w') as f:
        f.write(pb.text_format.MessageToString(model))

    caffe.set_mode_cpu()
    net_src = caffe.Net(src_model, src_weights, caffe.TEST)
    net_dst = caffe.Net(dst_model, caffe.TEST)
    for key in net_src.params.keys():
        for i in range(len(net_src.params[key])):
            net_dst.params[key][i].data[:] = net_src.params[key][i].data[:]

    if dst_weights is not None:
        # Store params
        pass

    return net_dst


def merge_conv_and_bn(net, i_conv, i_bn, i_scale):
    # This is based on Kyeheyon's work
    assert(i_conv != None)
    assert(i_bn != None)

    def copy_double(data):
        return np.array(data, copy=True, dtype=np.double)

    key_conv = net._layer_names[i_conv]
    key_bn = net._layer_names[i_bn]
    key_scale = net._layer_names[i_scale] if i_scale else None

    # Copy
    bn_mean = copy_double(net.params[key_bn][0].data)
    bn_variance = copy_double(net.params[key_bn][1].data)
    num_bn_samples = copy_double(net.params[key_bn][2].data)

    # and Invalidate the BN layer
    net.params[key_bn][0].data[:] = 0
    net.params[key_bn][1].data[:] = 1
    net.params[key_bn][2].data[:] = 1
    if num_bn_samples[0] == 0:
        num_bn_samples[0] = 1

    if net.params.has_key(key_scale):
        print 'Combine {:s} + {:s} + {:s}'.format(key_conv, key_bn, key_scale)
        scale_weight = copy_double(net.params[key_scale][0].data)
        scale_bias = copy_double(net.params[key_scale][1].data)
        net.params[key_scale][0].data[:] = 1
        net.params[key_scale][1].data[:] = 0
    else:
        print 'Combine {:s} + {:s}'.format(key_conv, key_bn)
        scale_weight = 1
        scale_bias = 0

    weight = copy_double(net.params[key_conv][0].data)
    bias = copy_double(net.params[key_conv][1].data)
    alpha = scale_weight / np.sqrt(bn_variance / num_bn_samples[0] + 1e-5)
    net.params[key_conv][1].data[:] = bias * alpha + (scale_bias - (bn_mean / num_bn_samples[0]) * alpha)
    for i in range(len(alpha)):
        net.params[key_conv][0].data[i] = weight[i] * alpha[i]


def merge_batchnorms_in_net(net):
    # for each BN
    for i, layer in enumerate(net.layers):
        if layer.type != 'BatchNorm':
            continue

        l_name = net._layer_names[i]

        l_bottom = net.bottom_names[l_name]
        assert(len(l_bottom) == 1)
        l_bottom = l_bottom[0]

        l_top = net.top_names[l_name]
        assert(len(l_top) == 1)
        l_top = l_top[0]

        can_be_absorbed = True

        # Search all (bottom) layers
        for j in xrange(i - 1, -1, -1):
            tops_of_j = net.top_names[net._layer_names[j]]
            if l_bottom in tops_of_j:
                if net.layers[j].type not in ['Convolution', 'InnerProduct']:
                    can_be_absorbed = False
                else:
                    # There must be only one layer
                    conv_ind = j
                    break

        if not can_be_absorbed:
            continue

        # find the following Scale
        scale_ind = None
        for j in xrange(i + 1, len(net.layers)):
            bottoms_of_j = net.bottom_names[net._layer_names[j]]
            if l_top in bottoms_of_j:
                if scale_ind:
                    # Followed by two or more layers
                    scale_ind = None
                    break

                if net.layers[j].type in ['Scale']:
                    scale_ind = j

                    top_of_j = net.top_names[net._layer_names[j]][0]
                    if top_of_j == bottoms_of_j[0]:
                        # On-the-fly => Can be merged
                        break
                else:
                    # Followed by a layer which is not 'Scale'
                    scale_ind = None
                    break

        merge_conv_and_bn(net, conv_ind, i, scale_ind)

    return net


def process_model(net, src_model, dst_model, func_loop, func_finally):
    with open(src_model) as f:
        model = caffe.proto.caffe_pb2.NetParameter()
        pb.text_format.Merge(f.read(), model)

    for i, layer in enumerate(model.layer):
        map(lambda x: x(layer, net, model, i), func_loop)

    map(lambda x: x(net, model), func_finally)

    with open(dst_model, 'w') as f:
        f.write(pb.text_format.MessageToString(model))


# Functions to remove (redundant) BN and Scale layers
to_delete_empty = []


def pick_empty_layers(layer, net, model, i):
    if layer.type not in ['BatchNorm', 'Scale']:
        return

    bottom = layer.bottom[0]
    top = layer.top[0]

    if (bottom != top):
        # Not supported yet
        return

    if layer.type == 'BatchNorm':
        zero_mean = np.all(net.params[layer.name][0].data == 0)
        one_var = np.all(net.params[layer.name][1].data == 1)

        if zero_mean and one_var:
            print 'Delete layer: {}'.format(layer.name)
            to_delete_empty.append(layer)

    if layer.type == 'Scale':
        no_scaling = np.all(net.params[layer.name][0].data == 1)
        zero_bias = np.all(net.params[layer.name][1].data == 0)

        if no_scaling and zero_bias:
            print 'Delete layer: {}'.format(layer.name)
            to_delete_empty.append(layer)


def remove_empty_layers(net, model):
    map(model.layer.remove, to_delete_empty)


# A function to add 'engine: CAFFE' param into 1x1 convolutions
def set_engine_caffe(layer, net, model, i):
    if layer.type == 'Convolution':
        if layer.convolution_param.kernel_size == 1\
                or (layer.convolution_param.kernel_h == layer.convolution_param.kernel_w == 1):
            layer.convolution_param.engine = dict(layer.convolution_param.Engine.items())['CAFFE']


def main():
    # Set default output file names
    if args.output_model is None:
        file_name = osp.splitext(args.model)[0]
        args.output_model = file_name + '_inference.prototxt'
    if args.output_weights is None:
        file_name = osp.splitext(args.weights)[0]
        args.output_weights = file_name + '_inference.caffemodel'

    net = load_and_fill_biases(args.model, args.weights, args.model + '.temp.pt', None)
    net = merge_batchnorms_in_net(net)

    process_model(net, args.model + '.temp.pt', args.output_model,
                  [pick_empty_layers, set_engine_caffe],
                  [remove_empty_layers])

    # Store params
    net.save(args.output_weights)


if __name__ == '__main__':
    parser = ArgumentParser(
        description="Generate Batch Normalized model for inference")
    parser.add_argument('--model', default="MobileNetSSD_deploy.prototxt", help="The net definition prototxt")
    parser.add_argument('--weights', default="MobileNetSSD_deploy.caffemodel", help="The weights caffemodel")
    parser.add_argument('--output_model')
    parser.add_argument('--output_weights')
    args = parser.parse_args()
    main()
```
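As a quick correctness check, you can run the same input through the original and the merged model and compare their outputs. The snippet below is a minimal sketch (Python 2, matching the script above); the file names and the input blob name `'data'` are assumptions taken from the script's defaults, so adjust them to your own deploy files.

```python
import numpy as np
import caffe

caffe.set_mode_cpu()

# Original model and the merged model produced by the script above
net_src = caffe.Net('MobileNetSSD_deploy.prototxt',
                    'MobileNetSSD_deploy.caffemodel', caffe.TEST)
net_dst = caffe.Net('MobileNetSSD_deploy_inference.prototxt',
                    'MobileNetSSD_deploy_inference.caffemodel', caffe.TEST)

# Feed the same random input to both networks
x = np.random.rand(*net_src.blobs['data'].data.shape).astype(np.float32)
net_src.blobs['data'].data[...] = x
net_dst.blobs['data'].data[...] = x

out_src = net_src.forward()
out_dst = net_dst.forward()

# Report the maximum absolute difference for each output blob
for name in out_src:
    print name, np.abs(out_src[name] - out_dst[name]).max()
```

If the merge is correct, the differences should be at the level of floating-point rounding noise; a large difference usually means a BN layer was not absorbed (for example, one that does not directly follow a Convolution or InnerProduct layer).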
Script download:
https://download.csdn.net/download/kangdi7547/10578152
Reposted from: https://blog.csdn.net/haima1998/article/details/101386218