Swin-Unet is a new segmentation network built on the Swin Transformer backbone (see the earlier Swin Transformer introduction), combined with the design ideas of U-Net (see the U-Net section in "Tensorflow Deep Learning Algorithms, Part 3").
It differs from Swin Transformer in the following ways. The encoder still has the same 4 stages as Swin Transformer, but with the defaults used in this implementation the number of Swin Transformer Blocks per stage is [2, 2, 2, 2] (and [1, 2, 2, 2] on the decoder side) rather than Swin-T's [2, 2, 6, 2]. On the decoder side, because the feature maps are upsampled, Patch Embedding and Patch Merging are no longer used; instead a Patch Expanding layer is used, which is the inverse of Patch Merging.
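To make the resolution and channel changes concrete, here is a minimal sketch (not part of the original code) of the shape flow, assuming the default hyper-parameters of the implementation below (img_size=224, patch_size=4, embed_dim=96, 4 stages):
img_size, patch_size, embed_dim, num_layers = 224, 4, 96, 4
res = img_size // patch_size  # 56: feature map resolution after patch embedding
for i in range(num_layers):  # encoder: Patch Merging halves H and W and doubles C between stages
    print("encoder stage", i + 1, "input:", (res // 2 ** i, res // 2 ** i, embed_dim * 2 ** i))
for i in range(num_layers):  # decoder: Patch Expanding doubles H and W and halves C between stages
    r = res // 2 ** (num_layers - 1 - i)
    print("decoder stage", i + 1, "input:", (r, r, embed_dim * 2 ** (num_layers - 1 - i)))
# the final 4x Patch Expanding restores the 224x224 resolution before the 1x1 segmentation head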
Let's look at the implementation of Patch Expanding.
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from einops import rearrange
from timm.models.layers import trunc_normal_
class PatchExpand(nn.Module):
"""
块状扩充,尺寸翻倍,通道数减半
"""
def __init__(self, input_resolution, dim, dim_scale=2, norm_layer=nn.LayerNorm):
"""
Args:
input_resolution: 解码过程的feature map的宽高
dim: frature map通道数
dim_scale: 通道数扩充的倍数
norm_layer: 通道方向归一化
"""
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
# double the channel count with a linear (fully connected) layer
self.expand = nn.Linear(dim, 2 * dim, bias=False) if dim_scale == 2 else nn.Identity()
self.norm = norm_layer(dim // dim_scale)
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
# first double the channel count
x = self.expand(x)
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
x = x.view(B, H, W, C)
# split the expanded channels into 2x2 spatial blocks and rearrange them,
# which doubles the height and width of the feature map
x = rearrange(x, 'b h w (p1 p2 c)-> b (h p1) (w p2) c', p1=2, p2=2, c=C // 4)
# the channels were doubled and are now divided by 4, so the channel count is effectively halved
x = x.view(B, -1, C // 4)
x = self.norm(x)
return x
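As a quick sanity check (a sketch with made-up shapes, not from the original repo), feeding a (B, H*W, C) tensor through PatchExpand quadruples the number of tokens (H and W each double) while halving the channel count:
up = PatchExpand(input_resolution=(7, 7), dim=768)  # e.g. the bottleneck output resolution of Swin-Tiny
x = torch.randn(2, 7 * 7, 768)  # B=2, 49 tokens, 768 channels
print(up(x).shape)  # torch.Size([2, 196, 384]): 14*14 tokens, 384 channels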
The encoder side is essentially the same as in Swin Transformer, so we focus on the decoder. The BasicLayer_up class is what pairs the SwinTransformerBlocks with Patch Expanding.
class BasicLayer_up(nn.Module):
""" A basic Swin Transformer layer for one stage.
一个BasicLayer_up包含偶数个SwinTransformerBlock和一个upsamele层(即Patch Expanding层)
"""
def __init__(self, dim, input_resolution, depth, num_heads, window_size,
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., norm_layer=nn.LayerNorm, upsample=None, use_checkpoint=False):
"""
Args:
dim: feature map通道数
input_resolution: feature map的宽高
depth: 各个Stage中,Swin Transformer Block的数量
num_heads: 多头注意力各个Stage中的头数
window_size: 窗口自注意力机制的窗口中的patch数
mlp_ratio: 层感知机模块中第一个全连接层输出的通道倍数
qkv_bias: 如果是True的话,对自注意力公式中的Q、K、V增加一个可学习的偏置
qk_scale: 窗口自注意力公式常数
drop: dropout rate,默认为0
attn_drop: 用于自注意力机制中的dropout rate,默认为0
drop_path: 在Swin Transformer Block中,有一定概率丢弃整个直连分支,包括
LN、W-MSA或者SW-MSA,只保留直连的连接,是一种网络深度的随机性,默认为0
norm_layer: 通道方向归一化
upsample: 使用Patch Expanding来升采样
use_checkpoint: 是否使用Pytorch中间数据保存机制
"""
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.use_checkpoint = use_checkpoint
# build SwinTransformerBlock
self.blocks = nn.ModuleList([
SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
num_heads=num_heads, window_size=window_size,
# distinguishes W-MSA from SW-MSA: shift_size 0 means W-MSA, window_size // 2 means SW-MSA
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop, attn_drop=attn_drop,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer)
for i in range(depth)])
# patch expanding (upsampling) layer
# None for the last decoder stage
if upsample is not None:
self.upsample = PatchExpand(input_resolution, dim=dim, dim_scale=2, norm_layer=norm_layer)
else:
self.upsample = None
def forward(self, x):
# pass through each SwinTransformerBlock
for blk in self.blocks:
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x)
else:
x = blk(x)
# upsample with Patch Expanding
if self.upsample is not None:
x = self.upsample(x)
return x
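For a rough standalone check (a sketch, not part of the original code), note that SwinTransformerBlock preserves the (B, H*W, C) token shape, so we can temporarily substitute nn.Identity for it (nn.Identity accepts and ignores constructor arguments) just to trace the shapes through BasicLayer_up; in the real model, SwinTransformerBlock comes from the Swin Transformer implementation:
SwinTransformerBlock = nn.Identity  # stand-in for this shape check only; it ignores all constructor arguments
layer_up = BasicLayer_up(dim=384, input_resolution=(14, 14), depth=2,
                         num_heads=12, window_size=7, upsample=PatchExpand)
x = torch.randn(2, 14 * 14, 384)
print(layer_up(x).shape)  # torch.Size([2, 784, 192]): the blocks keep the shape, PatchExpand upsamples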
SwinTransformerBlock is identical to the one in Swin Transformer, so its code is not repeated here.
There are also skip connections from the encoder to the decoder. For those we need to look at the main Swin-Unet class.
class SwinTransformerSys(nn.Module):
""" Swin-UNet网络模型
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
embed_dim=96, depths=[2, 2, 2, 2], depths_decoder=[1, 2, 2, 2], num_heads=[3, 6, 12, 24],
window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
use_checkpoint=False, final_upsample="expand_first", **kwargs):
"""
Args:
img_size: 原始图像尺寸
patch_size: 一个patch中的像素点数
in_chans: 进入网络的图片通道数
num_classes: 分类数量
embed_dim: feature map通道数
depths: 编码器各个Stage中,Swin Transformer Block的数量
depths_decoder: 解码器各个Stage中,Swin Transformer Block的数量
num_heads: 多头注意力各个Stage中的头数
window_size: 窗口自注意力机制的窗口中的patch数
mlp_ratio: 多层感知机模块中第一个全连接层输出的通道倍数
qkv_bias: 如果是True的话,对自注意力公式中的Q、K、V增加一个可学习的偏置
qk_scale: 自注意力公式中的常量
drop_rate: dropout rate,默认为0
attn_drop_rate: 用于自注意力机制中的dropout rate,默认为0
drop_path_rate: 在Swin Transformer Block中,有一定概率丢弃整个直连分支,包括
LN、W-MSA或者SW-MSA,只保留直连的连接,是一种网络深度的随机性,默认为0.1
norm_layer: 通道方向归一化
ape: 是否进行绝对位置嵌入,默认False
patch_norm: 如果是True的话,在patch embedding之后加上归一化
use_checkpoint: 是否使用Pytorch中间数据保存机制
final_upsample: 解码器stage4后的Patch Expanding
**kwargs:
"""
super().__init__()
print("SwinTransformerSys expand initial----depths:{};depths_decoder:{};drop_path_rate:{};num_classes:{}".format(depths,
depths_decoder, drop_path_rate, num_classes))
self.num_classes = num_classes
# number of stages
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
# channel count of the encoder stage-4 output (768 for Swin-Tiny)
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
# decoder-side feature channel count (192, i.e. embed_dim * 2)
self.num_features_up = int(embed_dim * 2)
self.mlp_ratio = mlp_ratio
self.final_upsample = final_upsample
# split the image into non-overlapping patches
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)
num_patches = self.patch_embed.num_patches
# get the height and width (in patches) of the feature map
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution
# absolute position embedding
if self.ape:
self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
trunc_normal_(self.absolute_pos_embed, std=.02)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth: the drop probability differs per block, increasing linearly from 0 up to drop_path_rate (0.1)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
# build the encoder layers
self.layers = nn.ModuleList()
for i_layer in range(self.num_layers):  # each layer corresponds to one stage
layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
input_resolution=(patches_resolution[0] // (2 ** i_layer),
patches_resolution[1] // (2 ** i_layer)),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
# only the first 3 stages have PatchMerging; the last one does not
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
use_checkpoint=use_checkpoint)
self.layers.append(layer)
# build the decoder layers
self.layers_up = nn.ModuleList()
self.concat_back_dim = nn.ModuleList()
for i_layer in range(self.num_layers):  # each layer corresponds to one stage
# linear layer that halves the channel count after the skip-connection features are concatenated
concat_linear = nn.Linear(2 * int(embed_dim * 2**(self.num_layers - 1 - i_layer)),
int(embed_dim * 2**(self.num_layers - 1 - i_layer))) if i_layer > 0 else nn.Identity()
if i_layer == 0:  # the first decoder stage only performs upsampling
layer_up = PatchExpand(input_resolution=(patches_resolution[0] // (2 ** (self.num_layers - 1 - i_layer)),
patches_resolution[1] // (2 ** (self.num_layers-1-i_layer))), dim=int(embed_dim * 2 ** (self.num_layers-1-i_layer)), dim_scale=2, norm_layer=norm_layer)
else:
layer_up = BasicLayer_up(dim=int(embed_dim * 2 ** (self.num_layers-1-i_layer)),
input_resolution=(patches_resolution[0] // (2 ** (self.num_layers-1-i_layer)),
patches_resolution[1] // (2 ** (self.num_layers-1-i_layer))),
depth=depths[(self.num_layers-1-i_layer)],
num_heads=num_heads[(self.num_layers-1-i_layer)],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:(self.num_layers-1-i_layer)]):sum(depths[:(self.num_layers - 1 - i_layer) + 1])],
norm_layer=norm_layer,
# only the first 3 stages have PatchExpand; the last one does not
upsample=PatchExpand if (i_layer < self.num_layers - 1) else None,
use_checkpoint=use_checkpoint)
self.layers_up.append(layer_up)
self.concat_back_dim.append(concat_linear)
self.norm = norm_layer(self.num_features)
self.norm_up = norm_layer(self.embed_dim)
# after the last decoder stage, apply FinalPatchExpand_X4
if self.final_upsample == "expand_first":
print("---final upsample expand_first---")
self.up = FinalPatchExpand_X4(input_resolution=(img_size // patch_size, img_size // patch_size), dim_scale=4, dim=embed_dim)
self.output = nn.Conv2d(in_channels=embed_dim, out_channels=self.num_classes, kernel_size=1, bias=False)
self.apply(self._init_weights)
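To make the stochastic-depth schedule concrete, here is a small sketch (not part of the original code) of the dpr list built above and the per-stage slices taken from it:
depths, drop_path_rate = [2, 2, 2, 2], 0.1
dpr = [round(x.item(), 4) for x in torch.linspace(0, drop_path_rate, sum(depths))]
print(dpr)  # [0.0, 0.0143, 0.0286, 0.0429, 0.0571, 0.0714, 0.0857, 0.1]
for i in range(len(depths)):  # each stage receives its own slice of drop rates
    print("stage", i + 1, dpr[sum(depths[:i]):sum(depths[:i + 1])])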
The constructor above uses a FinalPatchExpand_X4 module; let's look at its implementation.
class FinalPatchExpand_X4(nn.Module):
"""
stage4之后的PatchExpand
尺寸翻倍,通道数不变
"""
def __init__(self, input_resolution, dim, dim_scale=4, norm_layer=nn.LayerNorm):
"""
Args:
input_resolution: feature map的宽高
dim: feature map通道数
dim_scale: 通道数扩充的倍数
norm_layer: 通道方向归一化
"""
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.dim_scale = dim_scale
# expand the channel count (by 16x) with a linear layer
self.expand = nn.Linear(dim, 16 * dim, bias=False)
self.output_dim = dim
self.norm = norm_layer(self.output_dim)
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
# first expand the channel count by a factor of 16 (dim_scale ** 2)
x = self.expand(x)
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
x = x.view(B, H, W, C)
# split the expanded channels into dim_scale x dim_scale spatial blocks and rearrange them,
# which enlarges the height and width of the feature map by dim_scale
x = rearrange(x, 'b h w (p1 p2 c)-> b (h p1) (w p2) c', p1=self.dim_scale, p2=self.dim_scale, c=C//(self.dim_scale**2))
# flatten back to (B, L, C) with the original channel count
x = x.view(B, -1, self.output_dim)
x = self.norm(x)
return x
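A quick shape check (a sketch with made-up sizes, not from the original repo): for a 224x224 input with patch_size=4 and embed_dim=96, the last decoder stage outputs 56x56 tokens with 96 channels, and FinalPatchExpand_X4 brings them back up to 224x224 with the channel count unchanged:
final_up = FinalPatchExpand_X4(input_resolution=(56, 56), dim=96, dim_scale=4)
x = torch.randn(1, 56 * 56, 96)
print(final_up(x).shape)  # torch.Size([1, 50176, 96]): 224*224 tokens, 96 channels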
Back to the SwinTransformerSys code:
def _init_weights(self, m):
"""
对全连接层或者通道归一化进行权重以及偏置的初始化
"""
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
return {'absolute_pos_embed'}
@torch.jit.ignore
def no_weight_decay_keywords(self):
return {'relative_position_bias_table'}
# Encoder and bottleneck
def forward_features(self, x):
"""
编码器过程
"""
# split the image into patches and embed them
x = self.patch_embed(x)
# absolute position embedding
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
# features saved for the skip connections
x_downsample = []
# pass through each encoder stage
for layer in self.layers:
x_downsample.append(x)
x = layer(x)
x = self.norm(x) # B L C
return x, x_downsample
# Decoder and skip connections
def forward_up_features(self, x, x_downsample):
"""
解码器过程,包含了跳连拼接
"""
# pass through each decoder stage
for inx, layer_up in enumerate(self.layers_up):
if inx == 0:
x = layer_up(x)
else:
# concatenate the matching encoder feature (skip connection) before the Swin Transformer Blocks
x = torch.cat([x, x_downsample[3-inx]], -1)
x = self.concat_back_dim[inx](x)
x = layer_up(x)
x = self.norm_up(x) # B L C
return x
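With the defaults above, x_downsample holds the inputs of the four encoder stages, i.e. features of shape (B, 56*56, 96), (B, 28*28, 192), (B, 14*14, 384) and (B, 7*7, 768); decoder stage inx (for inx > 0) concatenates its upsampled feature with x_downsample[3-inx], which doubles the channel count, and concat_back_dim[inx] projects it back down. A small sketch (not from the original repo) of that concatenation step with made-up tensors:
x_up = torch.randn(1, 14 * 14, 384)  # decoder feature after the first PatchExpand
skip = torch.randn(1, 14 * 14, 384)  # the matching encoder feature, x_downsample[2]
cat = torch.cat([x_up, skip], -1)  # (1, 196, 768): channels doubled by the skip concatenation
back = nn.Linear(2 * 384, 384)  # plays the role of concat_back_dim[1]
print(back(cat).shape)  # torch.Size([1, 196, 384])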
def up_x4(self, x):
"""
完成解码器的最后一个stage后进入
"""
H, W = self.patches_resolution
B, L, C = x.shape
assert L == H * W, "input features has wrong size"
if self.final_upsample == "expand_first":
x = self.up(x)
x = x.view(B, 4 * H, 4 * W, -1)
x = x.permute(0, 3, 1, 2) #B,C,H,W
x = self.output(x)
return x
def forward(self, x):
"""
前向运算
"""
x, x_downsample = self.forward_features(x)
x = self.forward_up_features(x, x_downsample)
x = self.up_x4(x)
return x
def flops(self):
flops = 0
flops += self.patch_embed.flops()
for i, layer in enumerate(self.layers):
flops += layer.flops()
flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
flops += self.num_features * self.num_classes
return flops
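Finally, a usage sketch (assuming PatchEmbed, BasicLayer, PatchMerging and SwinTransformerBlock are available from the Swin Transformer code, and using num_classes=9 purely as an example value): the network maps an image of shape (B, 3, 224, 224) to a per-pixel score map of shape (B, num_classes, 224, 224).
model = SwinTransformerSys(img_size=224, patch_size=4, in_chans=3, num_classes=9)
logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 9, 224, 224])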