from collections import OrderedDict

import torch
import torch.nn as nn

from nets.darknet import darknet53


def conv2d(filter_in, filter_out, kernel_size):
    """Build a Conv2d -> BatchNorm2d -> LeakyReLU(0.1) block.

    Args:
        filter_in: Number of input channels.
        filter_out: Number of output channels.
        kernel_size: Square kernel size of the convolution.

    Returns:
        An ``nn.Sequential`` whose children are named ``conv``, ``bn`` and
        ``relu``. The convolution uses stride 1 and no bias.
    """
    # With stride 1 and an odd kernel this padding keeps the spatial size
    # unchanged; a falsy kernel_size falls back to zero padding.
    pad = (kernel_size - 1) // 2 if kernel_size else 0
    return nn.Sequential(OrderedDict([
        ("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size,
                           stride=1, padding=pad, bias=False)),
        ("bn", nn.BatchNorm2d(filter_out)),
        ("relu", nn.LeakyReLU(0.1)),
    ]))


# ------------------------------------------------------------------------#
#   make_last_layers stacks seven convolutions: the first five extract
#   features, the last two produce the YOLO prediction for one scale.
# ------------------------------------------------------------------------#
def make_last_layers(filters_list, in_filters, out_filter):
    """Build the seven-layer head used for one prediction scale.

    Args:
        filters_list: Two channel counts ``[narrow, wide]``; the 1x1
            convolutions output ``narrow`` channels and the 3x3 convolutions
            output ``wide`` channels.
        in_filters: Number of channels of the incoming feature map.
        out_filter: Number of channels of the final prediction map.

    Returns:
        An ``nn.Sequential`` with seven children (indices 0-6). Children 0-4
        are the feature-extraction convolutions; children 5-6 are the
        prediction convolutions. ``YoloBody.forward`` relies on this split
        when it slices the head with ``[:5]`` and ``[5:]``.
    """
    narrow, wide = filters_list[0], filters_list[1]
    return nn.Sequential(
        # Alternating 1x1 / 3x3 convolutions: the 1x1 layers adjust the
        # channel count and mix information across channels.
        conv2d(in_filters, narrow, 1),
        conv2d(narrow, wide, 3),
        conv2d(wide, narrow, 1),
        conv2d(narrow, wide, 3),
        conv2d(wide, narrow, 1),
        # Prediction part: one more 3x3 block, then a plain biased 1x1
        # convolution without batch norm or activation.
        conv2d(narrow, wide, 3),
        nn.Conv2d(wide, out_filter, kernel_size=1, stride=1, padding=0,
                  bias=True),
    )


class YoloBody(nn.Module):
    """YOLO detection network with a darknet53 backbone and three heads.

    Args:
        anchors_mask: Sequence with (at least) three entries; the length of
            entry ``i`` is the number of anchors predicted by head ``i``.
        num_classes: Number of object classes.
        pretrained: When true, load backbone weights from
            ``model_data/darknet53_backbone_weights.pth``.
    """

    def __init__(self, anchors_mask, num_classes, pretrained=False):
        super().__init__()
        # Marked as a temporary addition by the original author; neither
        # attribute is read anywhere in this module.
        self.width = 416
        self.height = 416

        # ---------------------------------------------------#
        #   Build the darknet53 backbone. According to the
        #   original notes it yields three feature maps, which
        #   for a 416x416 input have the shapes:
        #   52,52,256
        #   26,26,512
        #   13,13,1024
        # ---------------------------------------------------#
        self.backbone = darknet53()
        if pretrained:
            # Load the pretrained weights (darknet53 was trained as a
            # classification network). map_location="cpu" lets a checkpoint
            # saved from a GPU be read on a CPU-only machine;
            # load_state_dict then copies the values into the existing
            # parameters on whatever device they live on.
            # NOTE(review): torch.load unpickles the file, so only load
            # weight files obtained from a trusted source.
            state_dict = torch.load(
                "model_data/darknet53_backbone_weights.pth",
                map_location="cpu",
            )
            self.backbone.load_state_dict(state_dict)

        # ---------------------------------------------------#
        #   out_filters : [64, 128, 256, 512, 1024]
        #   (per the original notes; defined by the backbone)
        # ---------------------------------------------------#
        out_filters = self.backbone.layers_out_filters

        # ------------------------------------------------------------------------#
        #   Output channels of each head:
        #       len(anchors_mask[i]) * (num_classes + 5)
        #   e.g. 3 anchors and 20 classes (VOC) give 3 * 25 = 75.
        # ------------------------------------------------------------------------#
        self.last_layer0 = make_last_layers(
            [512, 1024], out_filters[-1],
            len(anchors_mask[0]) * (num_classes + 5))

        self.last_layer1_conv = conv2d(512, 256, 1)
        self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.last_layer1 = make_last_layers(
            [256, 512], out_filters[-2] + 256,
            len(anchors_mask[1]) * (num_classes + 5))

        self.last_layer2_conv = conv2d(256, 128, 1)
        self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.last_layer2 = make_last_layers(
            [128, 256], out_filters[-3] + 128,
            len(anchors_mask[2]) * (num_classes + 5))

    def forward(self, x):
        """Run the backbone and the three heads.

        Args:
            x: Input batch passed straight to the backbone.

        Returns:
            A tuple ``(out0, out1, out2)`` of prediction maps, ordered from
            the coarsest backbone feature map to the finest. Each has
            ``len(anchors_mask[i]) * (num_classes + 5)`` channels.
        """
        # ---------------------------------------------------#
        #   Three backbone feature maps; for a 416x416 input
        #   the original notes give the shapes
        #   52,52,256; 26,26,512; 13,13,1024
        # ---------------------------------------------------#
        x2, x1, x0 = self.backbone(x)

        # ---------------------------------------------------#
        #   First head (coarsest map)
        #   out0 = (batch_size, A0 * (num_classes + 5), 13, 13)
        # ---------------------------------------------------#
        # 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
        out0_branch = self.last_layer0[:5](x0)
        out0 = self.last_layer0[5:](out0_branch)

        # Fusion branch: 13,13,512 -> 13,13,256 -> 26,26,256
        x1_in = self.last_layer1_conv(out0_branch)
        x1_in = self.last_layer1_upsample(x1_in)

        # Channel-wise concat: 26,26,256 + 26,26,512 -> 26,26,768
        x1_in = torch.cat([x1_in, x1], 1)

        # ---------------------------------------------------#
        #   Second head
        #   out1 = (batch_size, A1 * (num_classes + 5), 26, 26)
        # ---------------------------------------------------#
        # 26,26,768 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
        out1_branch = self.last_layer1[:5](x1_in)
        out1 = self.last_layer1[5:](out1_branch)

        # Fusion branch: 26,26,256 -> 26,26,128 -> 52,52,128
        x2_in = self.last_layer2_conv(out1_branch)
        x2_in = self.last_layer2_upsample(x2_in)

        # Channel-wise concat: 52,52,128 + 52,52,256 -> 52,52,384
        x2_in = torch.cat([x2_in, x2], 1)

        # ---------------------------------------------------#
        #   Third head (finest map)
        #   out2 = (batch_size, A2 * (num_classes + 5), 52, 52)
        # ---------------------------------------------------#
        # 52,52,384 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
        # No later stage needs the intermediate features, so the whole head
        # is applied in one call.
        out2 = self.last_layer2(x2_in)
        return out0, out1, out2