Hands-On Project: Source Code Analysis of Google's Three Semantic Segmentation Algorithms, DeepLab V1, V2, and V3
- December 9, 2019
- Notes
Preface
Algorithm and engineering skills are both indispensable for an algorithm engineer. I have previously introduced DeepLab V1, V2, and V3, but something always felt missing: reading only the papers without the source code is little more than armchair theorizing. So today I will analyze these three algorithms carefully alongside their source code; once they are clear, I may cover DeepLab V3+ when I get the chance. Since I have recently been reading the PyTorch edition of Dive into Deep Learning, it seems natural to use a PyTorch implementation for the analysis. All the source code analyzed here comes from this PyTorch project: https://github.com/kazuto1011/deeplab-pytorch/tree/master/libs/models
DeepLab V1 Source Code Analysis
For the principles behind DeepLab V1, see my earlier post: https://mp.weixin.qq.com/s/rvP8-Y-CRuq4HFzR0qJWcg. The DeepLab models analyzed here are built from ResNet residual (bottleneck) blocks combined with atrous (dilated) convolution. In this DeepLab V1 implementation, the first layer is an ordinary convolution with stride = 2, immediately followed by a stride = 2 max-pooling; then comes an ordinary bottleneck layer, a stride = 2 bottleneck layer, and finally bottleneck layers with dilation = 2 and dilation = 4.
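Before the full listing, here is a quick sketch (my own, not from the repository) of why the dilated bottlenecks enlarge the receptive field without shrinking the feature map: a 3×3 convolution whose padding equals its dilation rate preserves the spatial size, so only the stride-2 stages reduce the resolution.

```python
import torch
import torch.nn as nn

# A 3x3 convolution with padding == dilation keeps the spatial size unchanged,
# regardless of the dilation rate; only stride > 1 reduces the resolution.
x = torch.randn(1, 64, 65, 65)
for d in (1, 2, 4):
    conv = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=d, dilation=d)
    print(f"dilation={d}:", conv(x).shape)  # torch.Size([1, 64, 65, 65]) every time
```

With that in mind, here is the full DeepLab V1 implementation: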
from __future__ import absolute_import, print_function

import torch
import torch.nn as nn
import torch.nn.functional as F


# Definition of the DeepLab V1 network structure
class DeepLabV1(nn.Sequential):
    """
    DeepLab v1: Dilated ResNet + 1x1 Conv
    Note that this is just a container for loading the pretrained COCO model
    and not mentioned as "v1" in papers.
    """

    def __init__(self, n_classes, n_blocks):
        super(DeepLabV1, self).__init__()
        ch = [64 * 2 ** p for p in range(6)]
        self.add_module("layer1", _Stem(ch[0]))
        self.add_module("layer2", _ResLayer(n_blocks[0], ch[0], ch[2], 1, 1))
        self.add_module("layer3", _ResLayer(n_blocks[1], ch[2], ch[3], 2, 1))
        self.add_module("layer4", _ResLayer(n_blocks[2], ch[3], ch[4], 1, 2))
        self.add_module("layer5", _ResLayer(n_blocks[3], ch[4], ch[5], 1, 4))
        self.add_module("fc", nn.Conv2d(2048, n_classes, 1))


# Decide whether to use the SyncBatchNorm defined in the `encoding` package
# or the BatchNorm from torch.nn
try:
    from encoding.nn import SyncBatchNorm

    _BATCH_NORM = SyncBatchNorm
except:
    _BATCH_NORM = nn.BatchNorm2d

_BOTTLENECK_EXPANSION = 4


# Convolution + BN + ReLU building block
class _ConvBnReLU(nn.Sequential):
    """
    Cascade of 2D convolution, batch norm, and ReLU.
    """

    BATCH_NORM = _BATCH_NORM

    def __init__(
        self, in_ch, out_ch, kernel_size, stride, padding, dilation, relu=True
    ):
        super(_ConvBnReLU, self).__init__()
        self.add_module(
            "conv",
            nn.Conv2d(
                in_ch, out_ch, kernel_size, stride, padding, dilation, bias=False
            ),
        )
        self.add_module("bn", _BATCH_NORM(out_ch, eps=1e-5, momentum=0.999))

        if relu:
            self.add_module("relu", nn.ReLU())


# Bottleneck block: a 1x1 convolution reduces the channels, a 3x3 (dilated) convolution
# follows, a 1x1 convolution restores the channels, and a shortcut connection is added.
# The reduction factor is _BOTTLENECK_EXPANSION; this is the standard ResNet bottleneck.
class _Bottleneck(nn.Module):
    """
    Bottleneck block of MSRA ResNet.
    """

    def __init__(self, in_ch, out_ch, stride, dilation, downsample):
        super(_Bottleneck, self).__init__()
        mid_ch = out_ch // _BOTTLENECK_EXPANSION
        self.reduce = _ConvBnReLU(in_ch, mid_ch, 1, stride, 0, 1, True)
        self.conv3x3 = _ConvBnReLU(mid_ch, mid_ch, 3, 1, dilation, dilation, True)
        self.increase = _ConvBnReLU(mid_ch, out_ch, 1, 1, 0, 1, False)
        self.shortcut = (
            _ConvBnReLU(in_ch, out_ch, 1, stride, 0, 1, False)
            if downsample
            else lambda x: x  # identity
        )

    def forward(self, x):
        h = self.reduce(x)
        h = self.conv3x3(h)
        h = self.increase(h)
        h += self.shortcut(x)
        return F.relu(h)


# Residual layer. DeepLab V1 is built by stacking _ResLayer modules;
# downsampling only happens in the first bottleneck of each layer.
class _ResLayer(nn.Sequential):
    """
    Residual layer with multi grids
    """

    def __init__(self, n_layers, in_ch, out_ch, stride, dilation, multi_grids=None):
        super(_ResLayer, self).__init__()

        if multi_grids is None:
            multi_grids = [1 for _ in range(n_layers)]
        else:
            assert n_layers == len(multi_grids)

        # Downsampling is only in the first block
        for i in range(n_layers):
            self.add_module(
                "block{}".format(i + 1),
                _Bottleneck(
                    in_ch=(in_ch if i == 0 else out_ch),
                    out_ch=out_ch,
                    stride=(stride if i == 0 else 1),
                    dilation=dilation * multi_grids[i],
                    downsample=(True if i == 0 else False),
                ),
            )


# Before the residual layers, a 7x7 convolution with stride 2 slides over the image to
# enlarge the receptive field, followed by a 3x3 max pooling with stride 2; together
# these two stages reduce the feature map resolution by a factor of 4.
class _Stem(nn.Sequential):
    """
    The 1st conv layer.
    Note that the max pooling is different from both MSRA and FAIR ResNet.
    """

    def __init__(self, out_ch):
        super(_Stem, self).__init__()
        self.add_module("conv1", _ConvBnReLU(3, out_ch, 7, 2, 3, 1))
        self.add_module("pool", nn.MaxPool2d(3, 2, 1, ceil_mode=True))


# Equivalent to a reshape; not actually used by the network
class _Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)


# Entry point: build DeepLab V1, print the model structure, and show the
# input image resolution and the output feature map resolution
if __name__ == "__main__":
    model = DeepLabV1(n_classes=21, n_blocks=[3, 4, 23, 3])
    # model.eval()
    image = torch.randn(1, 3, 513, 513)

    print(model)
    print("input:", image.shape)
    print("output:", model(image).shape)
Let's look at the network's input and output feature map sizes:
input: torch.Size([1, 3, 513, 513])
output: torch.Size([1, 21, 65, 65])
The network structure should be quite clear by now; you can run the Python script to print it out, or follow my comments in the source code. Note that during training, the ground truth must be resized to the same size as the model's output feature map. A minimal sketch of this step is given below.
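Here is a minimal sketch of that resizing step, assuming `labels` is an (N, H, W) tensor of class indices and 255 marks ignored pixels (the tensors and the ignore value below are illustrative assumptions, not taken from the repository's training script):

```python
import torch
import torch.nn.functional as F

# Hypothetical tensors for illustration: model logits and full-resolution labels.
logits = torch.randn(1, 21, 65, 65)             # model output (N, C, h, w)
labels = torch.randint(0, 21, (1, 513, 513))    # ground truth (N, H, W), long

# Downsample the labels to the logits' spatial size with nearest-neighbour
# interpolation so that class indices are not blended together.
labels_small = F.interpolate(
    labels.unsqueeze(1).float(), size=logits.shape[2:], mode="nearest"
).squeeze(1).long()

loss = F.cross_entropy(logits, labels_small, ignore_index=255)
```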
DeepLab V2 Source Code Analysis
For an interpretation of the DeepLab V2 paper, see my earlier article: https://mp.weixin.qq.com/s/ylv3QfOe_BOuVuxQTd_m_g. In short, DeepLab V2 adds an ASPP module on top of DeepLab V1. ASPP is an Inception-like structure containing parallel atrous convolutions with different dilation rates, which strengthens the model's ability to recognize the same object at multiple scales. As before, I will only analyze the source code here. For easier understanding, here is the ASPP diagram from the previous article:

[Figure: schematic of the ASPP module]
from __future__ import absolute_import, print_function

import torch
import torch.nn as nn
import torch.nn.functional as F


# The ASPP module: this is the main difference between DeepLab V2 and V1.
# All of the remaining code is identical to the V1 code.
class _ASPP(nn.Module):
    """
    Atrous spatial pyramid pooling (ASPP)
    """

    def __init__(self, in_ch, out_ch, rates):
        super(_ASPP, self).__init__()
        for i, rate in enumerate(rates):
            self.add_module(
                "c{}".format(i),
                nn.Conv2d(in_ch, out_ch, 3, 1, padding=rate, dilation=rate, bias=True),
            )

        for m in self.children():
            nn.init.normal_(m.weight, mean=0, std=0.01)
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        return sum([stage(x) for stage in self.children()])


class DeepLabV2(nn.Sequential):
    """
    DeepLab v2: Dilated ResNet + ASPP
    Output stride is fixed at 8
    """

    def __init__(self, n_classes, n_blocks, atrous_rates):
        super(DeepLabV2, self).__init__()
        ch = [64 * 2 ** p for p in range(6)]
        self.add_module("layer1", _Stem(ch[0]))
        self.add_module("layer2", _ResLayer(n_blocks[0], ch[0], ch[2], 1, 1))
        self.add_module("layer3", _ResLayer(n_blocks[1], ch[2], ch[3], 2, 1))
        self.add_module("layer4", _ResLayer(n_blocks[2], ch[3], ch[4], 1, 2))
        self.add_module("layer5", _ResLayer(n_blocks[3], ch[4], ch[5], 1, 4))
        self.add_module("aspp", _ASPP(ch[5], n_classes, atrous_rates))

    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, _ConvBnReLU.BATCH_NORM):
                m.eval()


try:
    from encoding.nn import SyncBatchNorm

    _BATCH_NORM = SyncBatchNorm
except:
    _BATCH_NORM = nn.BatchNorm2d

_BOTTLENECK_EXPANSION = 4


class _ConvBnReLU(nn.Sequential):
    """
    Cascade of 2D convolution, batch norm, and ReLU.
    """

    BATCH_NORM = _BATCH_NORM

    def __init__(
        self, in_ch, out_ch, kernel_size, stride, padding, dilation, relu=True
    ):
        super(_ConvBnReLU, self).__init__()
        self.add_module(
            "conv",
            nn.Conv2d(
                in_ch, out_ch, kernel_size, stride, padding, dilation, bias=False
            ),
        )
        self.add_module("bn", _BATCH_NORM(out_ch, eps=1e-5, momentum=0.999))

        if relu:
            self.add_module("relu", nn.ReLU())


class _Bottleneck(nn.Module):
    """
    Bottleneck block of MSRA ResNet.
    """

    def __init__(self, in_ch, out_ch, stride, dilation, downsample):
        super(_Bottleneck, self).__init__()
        mid_ch = out_ch // _BOTTLENECK_EXPANSION
        self.reduce = _ConvBnReLU(in_ch, mid_ch, 1, stride, 0, 1, True)
        self.conv3x3 = _ConvBnReLU(mid_ch, mid_ch, 3, 1, dilation, dilation, True)
        self.increase = _ConvBnReLU(mid_ch, out_ch, 1, 1, 0, 1, False)
        self.shortcut = (
            _ConvBnReLU(in_ch, out_ch, 1, stride, 0, 1, False)
            if downsample
            else lambda x: x  # identity
        )

    def forward(self, x):
        h = self.reduce(x)
        h = self.conv3x3(h)
        h = self.increase(h)
        h += self.shortcut(x)
        return F.relu(h)


class _ResLayer(nn.Sequential):
    """
    Residual layer with multi grids
    """

    def __init__(self, n_layers, in_ch, out_ch, stride, dilation, multi_grids=None):
        super(_ResLayer, self).__init__()

        if multi_grids is None:
            multi_grids = [1 for _ in range(n_layers)]
        else:
            assert n_layers == len(multi_grids)

        # Downsampling is only in the first block
        for i in range(n_layers):
            self.add_module(
                "block{}".format(i + 1),
                _Bottleneck(
                    in_ch=(in_ch if i == 0 else out_ch),
                    out_ch=out_ch,
                    stride=(stride if i == 0 else 1),
                    dilation=dilation * multi_grids[i],
                    downsample=(True if i == 0 else False),
                ),
            )


class _Stem(nn.Sequential):
    """
    The 1st conv layer.
    Note that the max pooling is different from both MSRA and FAIR ResNet.
    """

    def __init__(self, out_ch):
        super(_Stem, self).__init__()
        self.add_module("conv1", _ConvBnReLU(3, out_ch, 7, 2, 3, 1))
        self.add_module("pool", nn.MaxPool2d(3, 2, 1, ceil_mode=True))


if __name__ == "__main__":
    model = DeepLabV2(
        n_classes=21, n_blocks=[3, 4, 23, 3], atrous_rates=[6, 12, 18, 24]
    )
    model.eval()
    image = torch.randn(1, 3, 513, 513)

    print(model)
    print("input:", image.shape)
    print("output:", model(image).shape)
As you can see, apart from the ASPP module, the DeepLab V2 code is identical to V1, so there is not much more to explain. One point worth noting is that during training, DeepLab V2 uses the "poly" learning rate policy, $lr = lr_{base} \times (1 - \frac{iter}{iter_{max}})^{power}$; with $power = 0.9$, the model scores 1.17% higher (mean IoU) than the conventional step learning rate policy. The author also implemented this in his code, as shown below:
# Quoted via Uno Whoiam's Zhihu post: https://zhuanlan.zhihu.com/p/68531147
from torch.optim.lr_scheduler import _LRScheduler


class PolynomialLR(_LRScheduler):
    def __init__(self, optimizer, step_size, iter_max, power, last_epoch=-1):
        self.step_size = step_size
        self.iter_max = iter_max
        self.power = power
        super(PolynomialLR, self).__init__(optimizer, last_epoch)

    def polynomial_decay(self, lr):
        return lr * (1 - float(self.last_epoch) / self.iter_max) ** self.power

    def get_lr(self):
        if (
            (self.last_epoch == 0)
            or (self.last_epoch % self.step_size != 0)
            or (self.last_epoch > self.iter_max)
        ):
            return [group["lr"] for group in self.optimizer.param_groups]
        return [self.polynomial_decay(lr) for lr in self.base_lrs]
As you can see, this class directly inherits from PyTorch's scheduler base class _LRScheduler, so the learning rate can be conveniently updated as training progresses, simply by calling step().
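Here is a minimal usage sketch of this scheduler (the SGD hyperparameters and iteration count are illustrative choices, not the repository's actual configuration):

```python
import torch

# Assumes DeepLabV2 and PolynomialLR are defined as above.
model = DeepLabV2(n_classes=21, n_blocks=[3, 4, 23, 3], atrous_rates=[6, 12, 18, 24])
optimizer = torch.optim.SGD(model.parameters(), lr=2.5e-4, momentum=0.9, weight_decay=5e-4)
scheduler = PolynomialLR(optimizer, step_size=10, iter_max=20000, power=0.9)

for iteration in range(20000):
    # ... forward pass, loss.backward(), optimizer.step(), optimizer.zero_grad() ...
    # Every `step_size` calls, the lr is reset to base_lr * (1 - iter / iter_max) ** power
    scheduler.step()
```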
Finally, the network's input and output resolutions are the same as for DeepLab V1. For the details of training and dataset preparation, see the author's GitHub project: https://github.com/kazuto1011/deeplab-pytorch/tree/master/libs/models.
DeepLab V3 Source Code Analysis
For the principles of the DeepLab V3 paper, see my earlier post: https://mp.weixin.qq.com/s/D9OX89mklaU4tv74OZMqNg. Here is a brief recap of the key tricks used in DeepLab V3.
- Batch normalization layers are added to the ASPP module.
- A Multi-Grid strategy is used, i.e., the last residual blocks of the model apply atrous convolutions with several different rates.
- ASPP with different atrous rates effectively captures multi-scale information. However, the paper finds that as the sampling rate increases, the number of valid filter weights (i.e., weights applied to the valid feature region rather than to the zero-padded region) becomes smaller; in the extreme case where the rate of the atrous convolution approaches the size of the feature map, the 3×3 convolution degenerates into a 1×1 convolution. To address this problem and incorporate global context into the model, image-level features are used: global average pooling is applied to the model's feature map, the resulting image-level feature is fed into a 1×1 convolution with 256 filters (with batch normalization), and the output is bilinearly upsampled back to the required spatial size (a standalone sketch of this branch follows this list).
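To make that last point concrete, here is a standalone sketch of just the image-level branch (my own illustration; in the listing below the repository implements it as the _ImagePool module, with the 1×1 conv, BN, and ReLU wrapped in _ConvBnReLU):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ImageLevelBranch(nn.Module):
    """Global average pooling -> 1x1 conv (256 filters) + BN -> bilinear upsampling."""

    def __init__(self, in_ch, out_ch=256):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)            # (N, C, H, W) -> (N, C, 1, 1)
        self.conv = nn.Conv2d(in_ch, out_ch, 1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)

    def forward(self, x):
        h, w = x.shape[2:]
        y = F.relu(self.bn(self.conv(self.pool(x))))
        # Upsample the 1x1 image-level feature back to the input feature map size
        return F.interpolate(y, size=(h, w), mode="bilinear", align_corners=False)


# e.g. a 2048-channel, 65x65 backbone feature map; eval() so BN uses running
# statistics on this 1x1 map instead of failing on a single value per channel
branch = ImageLevelBranch(2048).eval()
feat = torch.randn(1, 2048, 65, 65)
print(branch(feat).shape)  # torch.Size([1, 256, 65, 65])
```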
The DeepLab V3 source code is as follows:
from __future__ import absolute_import, print_function

from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F


# Image-level features: global average pooling, then a 1x1 convolution with 256 channels,
# and finally bilinear upsampling back to the spatial size of the feature map that was
# fed into this module.
class _ImagePool(nn.Module):
    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.conv = _ConvBnReLU(in_ch, out_ch, 1, 1, 0, 1)

    def forward(self, x):
        _, _, H, W = x.shape
        h = self.pool(x)
        h = self.conv(h)
        h = F.interpolate(h, size=(H, W), mode="bilinear", align_corners=False)
        return h


# The ASPP module as improved in DeepLab V3: a 1x1 convolution branch and an
# image-level pooling branch are added, and the branch outputs are concatenated.
class _ASPP(nn.Module):
    """
    Atrous spatial pyramid pooling with image-level feature
    """

    def __init__(self, in_ch, out_ch, rates):
        super(_ASPP, self).__init__()
        self.stages = nn.Module()
        self.stages.add_module("c0", _ConvBnReLU(in_ch, out_ch, 1, 1, 0, 1))
        for i, rate in enumerate(rates):
            self.stages.add_module(
                "c{}".format(i + 1),
                _ConvBnReLU(in_ch, out_ch, 3, 1, padding=rate, dilation=rate),
            )
        self.stages.add_module("imagepool", _ImagePool(in_ch, out_ch))

    def forward(self, x):
        return torch.cat([stage(x) for stage in self.stages.children()], dim=1)


# The complete DeepLab V3: dilated ResNet + the Multi-Grid strategy + the improved ASPP
class DeepLabV3(nn.Sequential):
    """
    DeepLab v3: Dilated ResNet with multi-grid + improved ASPP
    """

    def __init__(self, n_classes, n_blocks, atrous_rates, multi_grids, output_stride):
        super(DeepLabV3, self).__init__()

        # Stride and dilation
        if output_stride == 8:
            s = [1, 2, 1, 1]
            d = [1, 1, 2, 4]
        elif output_stride == 16:
            s = [1, 2, 2, 1]
            d = [1, 1, 1, 2]

        ch = [64 * 2 ** p for p in range(6)]
        self.add_module("layer1", _Stem(ch[0]))
        self.add_module("layer2", _ResLayer(n_blocks[0], ch[0], ch[2], s[0], d[0]))
        self.add_module("layer3", _ResLayer(n_blocks[1], ch[2], ch[3], s[1], d[1]))
        self.add_module("layer4", _ResLayer(n_blocks[2], ch[3], ch[4], s[2], d[2]))
        self.add_module(
            "layer5", _ResLayer(n_blocks[3], ch[4], ch[5], s[3], d[3], multi_grids)
        )
        self.add_module("aspp", _ASPP(ch[5], 256, atrous_rates))
        # Concatenate the features of all branches, feed them into a 1x1 convolution
        # with 256 channels (plus BN), and then into a final 1x1 convolution that
        # produces the logits.
        concat_ch = 256 * (len(atrous_rates) + 2)
        self.add_module("fc1", _ConvBnReLU(concat_ch, 256, 1, 1, 0, 1))
        self.add_module("fc2", nn.Conv2d(256, n_classes, kernel_size=1))


try:
    from encoding.nn import SyncBatchNorm

    _BATCH_NORM = SyncBatchNorm
except:
    _BATCH_NORM = nn.BatchNorm2d

_BOTTLENECK_EXPANSION = 4


# Same definition as in DeepLab V1
class _ConvBnReLU(nn.Sequential):
    """
    Cascade of 2D convolution, batch norm, and ReLU.
    """

    BATCH_NORM = _BATCH_NORM

    def __init__(
        self, in_ch, out_ch, kernel_size, stride, padding, dilation, relu=True
    ):
        super(_ConvBnReLU, self).__init__()
        self.add_module(
            "conv",
            nn.Conv2d(
                in_ch, out_ch, kernel_size, stride, padding, dilation, bias=False
            ),
        )
        self.add_module("bn", _BATCH_NORM(out_ch, eps=1e-5, momentum=0.999))

        if relu:
            self.add_module("relu", nn.ReLU())


class _Bottleneck(nn.Module):
    """
    Bottleneck block of MSRA ResNet.
    """

    def __init__(self, in_ch, out_ch, stride, dilation, downsample):
        super(_Bottleneck, self).__init__()
        mid_ch = out_ch // _BOTTLENECK_EXPANSION
        self.reduce = _ConvBnReLU(in_ch, mid_ch, 1, stride, 0, 1, True)
        self.conv3x3 = _ConvBnReLU(mid_ch, mid_ch, 3, 1, dilation, dilation, True)
        self.increase = _ConvBnReLU(mid_ch, out_ch, 1, 1, 0, 1, False)
        self.shortcut = (
            _ConvBnReLU(in_ch, out_ch, 1, stride, 0, 1, False)
            if downsample
            else lambda x: x  # identity
        )

    def forward(self, x):
        h = self.reduce(x)
        h = self.conv3x3(h)
        h = self.increase(h)
        h += self.shortcut(x)
        return F.relu(h)


class _ResLayer(nn.Sequential):
    """
    Residual layer with multi grids
    """

    def __init__(self, n_layers, in_ch, out_ch, stride, dilation, multi_grids=None):
        super(_ResLayer, self).__init__()

        if multi_grids is None:
            multi_grids = [1 for _ in range(n_layers)]
        else:
            assert n_layers == len(multi_grids)

        # Downsampling is only in the first block
        for i in range(n_layers):
            self.add_module(
                "block{}".format(i + 1),
                _Bottleneck(
                    in_ch=(in_ch if i == 0 else out_ch),
                    out_ch=out_ch,
                    stride=(stride if i == 0 else 1),
                    dilation=dilation * multi_grids[i],
                    downsample=(True if i == 0 else False),
                ),
            )


class _Stem(nn.Sequential):
    """
    The 1st conv layer.
    Note that the max pooling is different from both MSRA and FAIR ResNet.
    """

    def __init__(self, out_ch):
        super(_Stem, self).__init__()
        self.add_module("conv1", _ConvBnReLU(3, out_ch, 7, 2, 3, 1))
        self.add_module("pool", nn.MaxPool2d(3, 2, 1, ceil_mode=True))


if __name__ == "__main__":
    model = DeepLabV3(
        n_classes=21,
        n_blocks=[3, 4, 23, 3],
        atrous_rates=[6, 12, 18],
        multi_grids=[1, 2, 4],
        output_stride=8,
    )
    model.eval()
    image = torch.randn(1, 3, 513, 513)

    print(model)
    print("input:", image.shape)
    print("output:", model(image).shape)
The differences from V1 and V2 are annotated in detail in the source code above. With output_stride=8, the output DeepLab V3 produces has the same shape as the outputs of V1/V2, and the training labels are prepared in the same way.
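At inference time, for example, the coarse logits can be bilinearly upsampled back to the input resolution and converted into a per-pixel label map; a minimal sketch, assuming the `model` built in the `__main__` block above:

```python
import torch
import torch.nn.functional as F

model.eval()
image = torch.randn(1, 3, 513, 513)
with torch.no_grad():
    logits = model(image)                      # (1, 21, 65, 65) with output_stride=8
    logits = F.interpolate(
        logits, size=image.shape[2:], mode="bilinear", align_corners=False
    )
    pred = logits.argmax(dim=1)                # (1, 513, 513) predicted class per pixel
```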
Conclusion
Through this source code walkthrough, you should now have a clear understanding of the principles of DeepLab V1, V2, and V3, how the feature map dimensions change, and how they are trained, so I will stop here for now. When I have time, I will follow up with a paper interpretation and source code analysis of DeepLab V3+, and with that the semantic segmentation series will be wrapped up for the time being. After that I plan to analyze object detection and classification networks, so stay tuned.
Code Link
https://github.com/kazuto1011/deeplab-pytorch/tree/master/libs/models