112 lines
5.1 KiB
Python
112 lines
5.1 KiB
Python
from collections import OrderedDict
|
||
|
||
import torch
|
||
import torch.nn as nn
|
||
|
||
from nets.darknet import darknet53
|
||
|
||
|
||
def conv2d(filter_in, filter_out, kernel_size):
|
||
pad = (kernel_size - 1) // 2 if kernel_size else 0
|
||
return nn.Sequential(OrderedDict([
|
||
("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=1, padding=pad, bias=False)),
|
||
("bn", nn.BatchNorm2d(filter_out)),
|
||
("relu", nn.LeakyReLU(0.1)),
|
||
]))
|
||
|
||
|
||
# ------------------------------------------------------------------------#
|
||
# make_last_layers里面一共有七个卷积,前五个用于提取特征。
|
||
# 后两个用于获得yolo网络的预测结果
|
||
# ------------------------------------------------------------------------#
|
||
def make_last_layers(filters_list, in_filters, out_filter):
|
||
m = nn.Sequential(
|
||
conv2d(in_filters, filters_list[0], 1), # 多次使用 1*1 的卷积调整通道,并进行通道方向的信息融合
|
||
conv2d(filters_list[0], filters_list[1], 3),
|
||
conv2d(filters_list[1], filters_list[0], 1),
|
||
conv2d(filters_list[0], filters_list[1], 3),
|
||
conv2d(filters_list[1], filters_list[0], 1),
|
||
conv2d(filters_list[0], filters_list[1], 3),
|
||
nn.Conv2d(filters_list[1], out_filter, kernel_size=1, stride=1, padding=0, bias=True)
|
||
)
|
||
return m
|
||
|
||
|
||
class YoloBody(nn.Module):
|
||
def __init__(self, anchors_mask, num_classes, pretrained=False):
|
||
super(YoloBody, self).__init__()
|
||
self.width = 416 # 临时加
|
||
self.height = 416 # 临时加
|
||
# ---------------------------------------------------#
|
||
# 生成darknet53的主干模型
|
||
# 获得三个有效特征层,他们的shape分别是:
|
||
# 52,52,256
|
||
# 26,26,512
|
||
# 13,13,1024
|
||
# ---------------------------------------------------#
|
||
self.backbone = darknet53()
|
||
if pretrained: # 载入预训练的权重,darknet53是一个分类网络
|
||
self.backbone.load_state_dict(torch.load("model_data/darknet53_backbone_weights.pth"))
|
||
|
||
# ---------------------------------------------------#
|
||
# out_filters : [64, 128, 256, 512, 1024]
|
||
# ---------------------------------------------------#
|
||
out_filters = self.backbone.layers_out_filters
|
||
|
||
# ------------------------------------------------------------------------#
|
||
# 计算yolo_head的输出通道数,对于voc数据集而言
|
||
# final_out_filter0 = final_out_filter1 = final_out_filter2 = 75
|
||
# ------------------------------------------------------------------------# len(anchors_mask[0]) 为 3
|
||
self.last_layer0 = make_last_layers([512, 1024], out_filters[-1], len(anchors_mask[0]) * (num_classes + 5))
|
||
|
||
self.last_layer1_conv = conv2d(512, 256, 1)
|
||
self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
|
||
self.last_layer1 = make_last_layers([256, 512], out_filters[-2] + 256, len(anchors_mask[1]) * (num_classes + 5))
|
||
|
||
self.last_layer2_conv = conv2d(256, 128, 1)
|
||
self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
|
||
self.last_layer2 = make_last_layers([128, 256], out_filters[-3] + 128, len(anchors_mask[2]) * (num_classes + 5))
|
||
|
||
def forward(self, x):
|
||
# ---------------------------------------------------#
|
||
# 获得三个有效特征层,他们的shape分别是:
|
||
# 52,52,256;26,26,512;13,13,1024
|
||
# ---------------------------------------------------#
|
||
x2, x1, x0 = self.backbone(x)
|
||
|
||
# ---------------------------------------------------#
|
||
# 第一个特征层
|
||
# out0 = (batch_size,255,13,13)
|
||
# ---------------------------------------------------#
|
||
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
|
||
out0_branch = self.last_layer0[:5](x0)
|
||
out0 = self.last_layer0[5:](out0_branch) # 8, 75, 13, 13 刚开始的2是测试用的,不是正式数据
|
||
|
||
# 13,13,512 -> 13,13,256 -> 26,26,256
|
||
x1_in = self.last_layer1_conv(out0_branch) # 融合分支
|
||
x1_in = self.last_layer1_upsample(x1_in)
|
||
|
||
# 26,26,256 + 26,26,512 -> 26,26,768
|
||
x1_in = torch.cat([x1_in, x1], 1)
|
||
# ---------------------------------------------------#
|
||
# 第二个特征层
|
||
# out1 = (batch_size,255,26,26)
|
||
# ---------------------------------------------------#
|
||
# 26,26,768 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
|
||
out1_branch = self.last_layer1[:5](x1_in)
|
||
out1 = self.last_layer1[5:](out1_branch)
|
||
|
||
# 26,26,256 -> 26,26,128 -> 52,52,128
|
||
x2_in = self.last_layer2_conv(out1_branch) # 融合
|
||
x2_in = self.last_layer2_upsample(x2_in)
|
||
|
||
# 52,52,128 + 52,52,256 -> 52,52,384
|
||
x2_in = torch.cat([x2_in, x2], 1)
|
||
# ---------------------------------------------------#
|
||
# 第三个特征层
|
||
# out3 = (batch_size,255,52,52)
|
||
# ---------------------------------------------------#
|
||
# 52,52,384 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
|
||
out2 = self.last_layer2(x2_in)
|
||
return out0, out1, out2
|