Swin-Unet is a new segmentation network built on the Swin Transformer backbone (see the earlier Swin Transformer introduction), combined with the design ideas of U-Net (see the U-Net section in "Tensorflow Deep Learning Algorithms, Part 3").
It differs from Swin Transformer in the following ways. The encoder still has the same 4 stages as Swin Transformer, but with the defaults used in this implementation the number of Swin Transformer Blocks per stage is [2, 2, 2, 2] (and [1, 2, 2, 2] on the decoder side) rather than Swin-T's [2, 2, 6, 2]. On the decoder side, because the feature maps are upsampled, Patch Embedding and Patch Merging are no longer used; instead a Patch Expanding layer is used, which is the inverse of Patch Merging.
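To make the resolution and channel changes concrete, here is a minimal sketch (not part of the original code) of the shape flow, assuming the default hyper-parameters of the implementation below (img_size=224, patch_size=4, embed_dim=96, 4 stages):
img_size, patch_size, embed_dim, num_layers = 224, 4, 96, 4
res = img_size // patch_size  # 56: feature map resolution after patch embedding
for i in range(num_layers):  # encoder: Patch Merging halves H and W and doubles C between stages
    print("encoder stage", i + 1, "input:", (res // 2 ** i, res // 2 ** i, embed_dim * 2 ** i))
for i in range(num_layers):  # decoder: Patch Expanding doubles H and W and halves C between stages
    r = res // 2 ** (num_layers - 1 - i)
    print("decoder stage", i + 1, "input:", (r, r, embed_dim * 2 ** (num_layers - 1 - i)))
# the final 4x Patch Expanding restores the 224x224 resolution before the 1x1 segmentation head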
Let's look at the implementation of Patch Expanding.
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from einops import rearrange
from timm.models.layers import trunc_normal_
class PatchExpand(nn.Module):
"""
块状扩充,尺寸翻倍,通道数减半
"""
def __init__(self, input_resolution, dim, dim_scale=2, norm_layer=nn.LayerNorm):
"""
Args:
input_resolution: 解码过程的feature map的宽高
dim: frature map通道数
dim_scale: 通道数扩充的倍数
norm_layer: 通道方向归一化
"""
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
# double the channel count with a linear (fully connected) layer
self.expand = nn.Linear(dim, 2 * dim, bias=False) if dim_scale == 2 else nn.Identity()
self.norm = norm_layer(dim // dim_scale)
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
# first double the channel count
x = self.expand(x)
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
x = x.view(B, H, W, C)
# split the expanded channels into 2x2 spatial blocks and rearrange them,
# which doubles the height and width of the feature map
x = rearrange(x, 'b h w (p1 p2 c)-> b (h p1) (w p2) c', p1=2, p2=2, c=C // 4)
# the channels were doubled and are now divided by 4, so the channel count is effectively halved
x = x.view(B, -1, C // 4)
x = self.norm(x)
return x
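As a quick sanity check (a sketch with made-up shapes, not from the original repo), feeding a (B, H*W, C) tensor through PatchExpand quadruples the number of tokens (H and W each double) while halving the channel count:
up = PatchExpand(input_resolution=(7, 7), dim=768)  # e.g. the bottleneck output resolution of Swin-Tiny
x = torch.randn(2, 7 * 7, 768)  # B=2, 49 tokens, 768 channels
print(up(x).shape)  # torch.Size([2, 196, 384]): 14*14 tokens, 384 channels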
The encoder side is essentially the same as in Swin Transformer, so we focus on the decoder. The BasicLayer_up class is what pairs the SwinTransformerBlocks with Patch Expanding.
class BasicLayer_up(nn.Module):
""" A basic Swin Transformer layer for one stage.
一个BasicLayer_up包含偶数个SwinTransformerBlock和一个upsamele层(即Patch Expanding层)
"""
def __init__(self, dim, input_resolution, depth, num_heads, window_size,
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., norm_layer=nn.LayerNorm, upsample=None, use_checkpoint=False):
"""
Args:
dim: feature map通道数
input_resolution: feature map的宽高
depth: 各个Stage中,Swin Transformer Block的数量
num_heads: 多头注意力各个Stage中的头数
window_size: 窗口自注意力机制的窗口中的patch数
mlp_ratio: 层感知机模块中第一个全连接层输出的通道倍数
qkv_bias: 如果是True的话,对自注意力公式中的Q、K、V增加一个可学习的偏置
qk_scale: 窗口自注意力公式常数
drop: dropout rate,默认为0
attn_drop: 用于自注意力机制中的dropout rate,默认为0
drop_path: 在Swin Transformer Block中,有一定概率丢弃整个直连分支,包括
LN、W-MSA或者SW-MSA,只保留直连的连接,是一种网络深度的随机性,默认为0
norm_layer: 通道方向归一化
upsample: 使用Patch Expanding来升采样
use_checkpoint: 是否使用Pytorch中间数据保存机制
"""
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.use_checkpoint = use_checkpoint
# build SwinTransformerBlock
self.blocks = nn.ModuleList([
SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
num_heads=num_heads, window_size=window_size,
# distinguishes W-MSA from SW-MSA: shift_size 0 means W-MSA, window_size // 2 means SW-MSA
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop, attn_drop=attn_drop,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer)
for i in range(depth)])
# patch expanding (upsampling) layer
# None for the last decoder stage
if upsample is not None:
self.upsample = PatchExpand(input_resolution, dim=dim, dim_scale=2, norm_layer=norm_layer)
else:
self.upsample = None
def forward(self, x):
# pass through each SwinTransformerBlock
for blk in self.blocks:
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x)
else:
x = blk(x)
# upsample with Patch Expanding
if self.upsample is not None:
x = self.upsample(x)
return x
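For a rough standalone check (a sketch, not part of the original code), note that SwinTransformerBlock preserves the (B, H*W, C) token shape, so we can temporarily substitute nn.Identity for it (nn.Identity accepts and ignores constructor arguments) just to trace the shapes through BasicLayer_up; in the real model, SwinTransformerBlock comes from the Swin Transformer implementation:
SwinTransformerBlock = nn.Identity  # stand-in for this shape check only; it ignores all constructor arguments
layer_up = BasicLayer_up(dim=384, input_resolution=(14, 14), depth=2,
                         num_heads=12, window_size=7, upsample=PatchExpand)
x = torch.randn(2, 14 * 14, 384)
print(layer_up(x).shape)  # torch.Size([2, 784, 192]): the blocks keep the shape, PatchExpand upsamples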
SwinTransformerBlock is identical to the one in Swin Transformer, so its code is not repeated here.
There are also skip connections from the encoder to the decoder. For those we need to look at the main Swin-Unet class.
class SwinTransformerSys(nn.Module):
""" Swin-UNet网络模型
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
embed_dim=96, depths=[2, 2, 2, 2], depths_decoder=[1, 2, 2, 2], num_heads=[3, 6, 12, 24],
window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
use_checkpoint=False, final_upsample="expand_first", **kwargs):
"""
Args:
img_size: 原始图像尺寸
patch_size: 一个patch中的像素点数
in_chans: 进入网络的图片通道数
num_classes: 分类数量
embed_dim: feature map通道数
depths: 编码器各个Stage中,Swin Transformer Block的数量
depths_decoder: 解码器各个Stage中,Swin Transformer Block的数量
num_heads: 多头注意力各个Stage中的头数
window_size: 窗口自注意力机制的窗口中的patch数
mlp_ratio: 多层感知机模块中第一个全连接层输出的通道倍数
qkv_bias: 如果是True的话,对自注意力公式中的Q、K、V增加一个可学习的偏置
qk_scale: 自注意力公式中的常量
drop_rate: dropout rate,默认为0
attn_drop_rate: 用于自注意力机制中的dropout rate,默认为0
drop_path_rate: 在Swin Transformer Block中,有一定概率丢弃整个直连分支,包括
LN、W-MSA或者SW-MSA,只保留直连的连接,是一种网络深度的随机性,默认为0.1
norm_layer: 通道方向归一化
ape: 是否进行绝对位置嵌入,默认False
patch_norm: 如果是True的话,在patch embedding之后加上归一化
use_checkpoint: 是否使用Pytorch中间数据保存机制
final_upsample: 解码器stage4后的Patch Expanding
**kwargs:
"""
super().__init__()
print("SwinTransformerSys expand initial----depths:{};depths_decoder:{};drop_path_rate:{};num_classes:{}".format(depths,
depths_decoder, drop_path_rate, num_classes))
self.num_classes = num_classes
# number of stages
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
# channel count of the encoder stage-4 output (768 for Swin-Tiny)
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
# decoder-side feature channel count (192, i.e. embed_dim * 2)
self.num_features_up = int(embed_dim * 2)
self.mlp_ratio = mlp_ratio
self.final_upsample = final_upsample
# split the image into non-overlapping patches
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)
num_patches = self.patch_embed.num_patches
# get the height and width (in patches) of the feature map
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution
# absolute position embedding
if self.ape:
self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
trunc_normal_(self.absolute_pos_embed, std=.02)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth: the drop probability differs per block, increasing linearly from 0 up to drop_path_rate (0.1)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
# build the encoder layers
self.layers = nn.ModuleList()
for i_layer in range(self.num_layers):  # each layer corresponds to one stage
layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
input_resolution=(patches_resolution[0] // (2 ** i_layer),
patches_resolution[1] // (2 ** i_layer)),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
# only the first 3 stages have PatchMerging; the last one does not
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
use_checkpoint=use_checkpoint)
self.layers.append(layer)
# build the decoder layers
self.layers_up = nn.ModuleList()
self.concat_back_dim = nn.ModuleList()
for i_layer in range(self.num_layers):  # each layer corresponds to one stage
# linear layer that halves the channel count after the skip-connection features are concatenated
concat_linear = nn.Linear(2 * int(embed_dim * 2**(self.num_layers - 1 - i_layer)),
int(embed_dim * 2**(self.num_layers - 1 - i_layer))) if i_layer > 0 else nn.Identity()
if i_layer == 0:  # the first decoder stage only performs upsampling
layer_up = PatchExpand(input_resolution=(patches_resolution[0] // (2 ** (self.num_layers - 1 - i_layer)),
patches_resolution[1] // (2 ** (self.num_layers-1-i_layer))), dim=int(embed_dim * 2 ** (self.num_layers-1-i_layer)), dim_scale=2, norm_layer=norm_layer)
else:
layer_up = BasicLayer_up(dim=int(embed_dim * 2 ** (self.num_layers-1-i_layer)),
input_resolution=(patches_resolution[0] // (2 ** (self.num_layers-1-i_layer)),
patches_resolution[1] // (2 ** (self.num_layers-1-i_layer))),
depth=depths[(self.num_layers-1-i_layer)],
num_heads=num_heads[(self.num_layers-1-i_layer)],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:(self.num_layers-1-i_layer)]):sum(depths[:(self.num_layers - 1 - i_layer) + 1])],
norm_layer=norm_layer,
# only the first 3 stages have PatchExpand; the last one does not
upsample=PatchExpand if (i_layer < self.num_layers - 1) else None,
use_checkpoint=use_checkpoint)
self.layers_up.append(layer_up)
self.concat_back_dim.append(concat_linear)
self.norm = norm_layer(self.num_features)
self.norm_up = norm_layer(self.embed_dim)
# after the last decoder stage, apply FinalPatchExpand_X4
if self.final_upsample == "expand_first":
print("---final upsample expand_first---")
self.up = FinalPatchExpand_X4(input_resolution=(img_size // patch_size, img_size // patch_size), dim_scale=4, dim=embed_dim)
self.output = nn.Conv2d(in_channels=embed_dim, out_channels=self.num_classes, kernel_size=1, bias=False)
self.apply(self._init_weights)
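To make the stochastic-depth schedule concrete, here is a small sketch (not part of the original code) of the dpr list built above and the per-stage slices taken from it:
depths, drop_path_rate = [2, 2, 2, 2], 0.1
dpr = [round(x.item(), 4) for x in torch.linspace(0, drop_path_rate, sum(depths))]
print(dpr)  # [0.0, 0.0143, 0.0286, 0.0429, 0.0571, 0.0714, 0.0857, 0.1]
for i in range(len(depths)):  # each stage receives its own slice of drop rates
    print("stage", i + 1, dpr[sum(depths[:i]):sum(depths[:i + 1])])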
The constructor above uses a FinalPatchExpand_X4 module; let's look at its implementation.
class FinalPatchExpand_X4(nn.Module):
"""
stage4之后的PatchExpand
尺寸翻倍,通道数不变
"""
def __init__(self, input_resolution, dim, dim_scale=4, norm_layer=nn.LayerNorm):
"""
Args:
input_resolution: feature map的宽高
dim: feature map通道数
dim_scale: 通道数扩充的倍数
norm_layer: 通道方向归一化
"""
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.dim_scale = dim_scale
# expand the channel count (by 16x) with a linear layer
self.expand = nn.Linear(dim, 16 * dim, bias=False)
self.output_dim = dim
self.norm = norm_layer(self.output_dim)
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
# first expand the channel count by a factor of 16 (dim_scale ** 2)
x = self.expand(x)
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
x = x.view(B, H, W, C)
# split the expanded channels into dim_scale x dim_scale spatial blocks and rearrange them,
# which enlarges the height and width of the feature map by dim_scale
x = rearrange(x, 'b h w (p1 p2 c)-> b (h p1) (w p2) c', p1=self.dim_scale, p2=self.dim_scale, c=C//(self.dim_scale**2))
# flatten back to (B, L, C) with the original channel count
x = x.view(B, -1, self.output_dim)
x = self.norm(x)
return x
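A quick shape check (a sketch with made-up sizes, not from the original repo): for a 224x224 input with patch_size=4 and embed_dim=96, the last decoder stage outputs 56x56 tokens with 96 channels, and FinalPatchExpand_X4 brings them back up to 224x224 with the channel count unchanged:
final_up = FinalPatchExpand_X4(input_resolution=(56, 56), dim=96, dim_scale=4)
x = torch.randn(1, 56 * 56, 96)
print(final_up(x).shape)  # torch.Size([1, 50176, 96]): 224*224 tokens, 96 channels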
Back to the SwinTransformerSys code:
def _init_weights(self, m):
"""
对全连接层或者通道归一化进行权重以及偏置的初始化
"""
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
return {'absolute_pos_embed'}
@torch.jit.ignore
def no_weight_decay_keywords(self):
return {'relative_position_bias_table'}
# Encoder and bottleneck
def forward_features(self, x):
"""
编码器过程
"""
# split the image into patches and embed them
x = self.patch_embed(x)
# absolute position embedding
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
# features saved for the skip connections
x_downsample = []
# pass through each encoder stage
for layer in self.layers:
x_downsample.append(x)
x = layer(x)
x = self.norm(x) # B L C
return x, x_downsample
# Decoder and skip connections
def forward_up_features(self, x, x_downsample):
"""
解码器过程,包含了跳连拼接
"""
# pass through each decoder stage
for inx, layer_up in enumerate(self.layers_up):
if inx == 0:
x = layer_up(x)
else:
# concatenate the matching encoder feature (skip connection) before the Swin Transformer Blocks
x = torch.cat([x, x_downsample[3-inx]], -1)
x = self.concat_back_dim[inx](x)
x = layer_up(x)
x = self.norm_up(x) # B L C
return x
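With the defaults above, x_downsample holds the inputs of the four encoder stages, i.e. features of shape (B, 56*56, 96), (B, 28*28, 192), (B, 14*14, 384) and (B, 7*7, 768); decoder stage inx (for inx > 0) concatenates its upsampled feature with x_downsample[3-inx], which doubles the channel count, and concat_back_dim[inx] projects it back down. A small sketch (not from the original repo) of that concatenation step with made-up tensors:
x_up = torch.randn(1, 14 * 14, 384)  # decoder feature after the first PatchExpand
skip = torch.randn(1, 14 * 14, 384)  # the matching encoder feature, x_downsample[2]
cat = torch.cat([x_up, skip], -1)  # (1, 196, 768): channels doubled by the skip concatenation
back = nn.Linear(2 * 384, 384)  # plays the role of concat_back_dim[1]
print(back(cat).shape)  # torch.Size([1, 196, 384])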
def up_x4(self, x):
"""
完成解码器的最后一个stage后进入
"""
H, W = self.patches_resolution
B, L, C = x.shape
assert L == H * W, "input features has wrong size"
if self.final_upsample == "expand_first":
x = self.up(x)
x = x.view(B, 4 * H, 4 * W, -1)
x = x.permute(0, 3, 1, 2) #B,C,H,W
x = self.output(x)
return x
def forward(self, x):
"""
前向运算
"""
x, x_downsample = self.forward_features(x)
x = self.forward_up_features(x, x_downsample)
x = self.up_x4(x)
return x
def flops(self):
flops = 0
flops += self.patch_embed.flops()
for i, layer in enumerate(self.layers):
flops += layer.flops()
flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
flops += self.num_features * self.num_classes
return flops
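Finally, a usage sketch (assuming PatchEmbed, BasicLayer, PatchMerging and SwinTransformerBlock are available from the Swin Transformer code, and using num_classes=9 purely as an example value): the network maps an image of shape (B, 3, 224, 224) to a per-pixel score map of shape (B, num_classes, 224, 224).
model = SwinTransformerSys(img_size=224, patch_size=4, in_chans=3, num_classes=9)
logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 9, 224, 224])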