241 lines
8.9 KiB
Python
241 lines
8.9 KiB
Python
|
# 2022.06.17-Changed for building ViG model
|
||
|
# Huawei Technologies Co., Ltd. <foss@huawei.com>
|
||
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
import math
|
||
|
import torch
|
||
|
import torch.nn as nn
|
||
|
import torch.nn.functional as F
|
||
|
from torch.nn import Sequential as Seq
|
||
|
|
||
|
# from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
|
||
|
# from timm.models.helpers import load_pretrained
|
||
|
# from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
||
|
# from timm.models.registry import register_model
|
||
|
|
||
|
from .gcn_lib import DropPath
|
||
|
from .gcn_lib import Grapher, act_layer
|
||
|
from .GAT import GAT, GATopt
|
||
|
# from gcn_lib import DropPath
|
||
|
# from gcn_lib import Grapher, act_layer
|
||
|
# from GAT import GAT, GATopt
|
||
|
|
||
|
def _cfg(url='', **kwargs):
    """Build a model configuration dictionary.

    Starts from a fixed set of default entries and then applies ``kwargs`` on
    top, so a keyword argument whose name matches a default replaces that
    default, and any other keyword is appended as a new entry.

    Args:
        url: Value stored under the ``'url'`` key.
        **kwargs: Extra or overriding configuration entries.

    Returns:
        A new ``dict`` holding the merged configuration.
    """
    cfg = dict(
        url=url,
        num_classes=1000,
        input_size=(3, 224, 224),
        pool_size=None,
        crop_pct=.9,
        interpolation='bicubic',
        first_conv='patch_embed.proj',
        classifier='head',
    )
    # Caller-supplied entries win over the defaults above.
    cfg.update(kwargs)
    return cfg
|
||
|
|
||
|
# def _cfg(url='', **kwargs):
|
||
|
# return {
|
||
|
# 'url': url,
|
||
|
# 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
|
||
|
# 'crop_pct': .9, 'interpolation': 'bicubic',
|
||
|
# 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
|
||
|
# 'first_conv': 'patch_embed.proj', 'classifier': 'head',
|
||
|
# **kwargs
|
||
|
# }
|
||
|
|
||
|
|
||
|
# Default configurations keyed by variant name. Both variants override the
# normalisation statistics with 0.5 per channel; the "b" variant additionally
# uses a crop ratio of 0.95.
default_cfgs = dict(
    vig_224_gelu=_cfg(
        mean=(0.5, 0.5, 0.5),
        std=(0.5, 0.5, 0.5),
    ),
    vig_b_224_gelu=_cfg(
        crop_pct=0.95,
        mean=(0.5, 0.5, 0.5),
        std=(0.5, 0.5, 0.5),
    ),
)
|
||
|
|
||
|
|
||
|
class FFN(nn.Module):
    """Feed-forward block made of two 1x1 convolutions with a residual add.

    The input goes through ``fc1`` (1x1 conv + batch norm), the activation,
    and ``fc2`` (1x1 conv + batch norm). The result is passed through the
    drop-path layer and added to the unmodified input.

    Args:
        in_features: Number of input channels.
        hidden_features: Channels between the two convolutions; falls back to
            ``in_features`` when falsy.
        out_features: Number of output channels; falls back to
            ``in_features`` when falsy.
        act: Activation name forwarded to ``act_layer``.
        drop_path: Drop-path rate; a value that is not greater than zero
            disables drop-path (an identity layer is used instead).
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act='relu', drop_path=0.0):
        super().__init__()
        width_out = out_features if out_features else in_features
        width_hidden = hidden_features if hidden_features else in_features

        expand_layers = [
            nn.Conv2d(in_features, width_hidden, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(width_hidden),
        ]
        self.fc1 = nn.Sequential(*expand_layers)

        self.act = act_layer(act)

        project_layers = [
            nn.Conv2d(width_hidden, width_out, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(width_out),
        ]
        self.fc2 = nn.Sequential(*project_layers)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x):
        """Apply fc1 -> activation -> fc2 and add the input back."""
        residual = x
        out = self.fc2(self.act(self.fc1(x)))
        return self.drop_path(out) + residual
|
||
|
|
||
|
|
||
|
class Stem(nn.Module):
    """Image to visual embedding using overlapping convolutions.

    Overlap: https://arxiv.org/pdf/2106.13797.pdf

    Three 3x3 convolutions, each followed by batch norm. The first two use
    stride 2 and are followed by an activation; the last uses stride 1 and has
    no activation after its batch norm.

    Args:
        img_size: Accepted for interface compatibility; not used here.
        in_dim: Number of input channels.
        out_dim: Number of output channels (the first conv emits
            ``out_dim // 2``).
        act: Activation name forwarded to ``act_layer``.
    """

    def __init__(self, img_size=224, in_dim=3, out_dim=768, act='relu'):
        super().__init__()
        half_dim = out_dim // 2
        # Layer order is kept fixed so the Sequential indices (and therefore
        # the state-dict keys) stay the same.
        stages = [
            nn.Conv2d(in_dim, half_dim, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(half_dim),
            act_layer(act),
            nn.Conv2d(half_dim, out_dim, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(out_dim),
            act_layer(act),
            nn.Conv2d(out_dim, out_dim, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_dim),
        ]
        self.convs = nn.Sequential(*stages)

    def forward(self, x):
        """Run the convolutional stem on ``x`` and return the result."""
        return self.convs(x)
|
||
|
|
||
|
|
||
|
class Downsample(nn.Module):
    """Convolution-based downsample.

    A single 3x3 convolution with stride 2 and padding 1, followed by batch
    normalisation over the output channels.

    Args:
        in_dim: Number of input channels.
        out_dim: Number of output channels.
    """

    def __init__(self, in_dim=3, out_dim=768):
        super().__init__()
        reduce = nn.Conv2d(in_dim, out_dim, kernel_size=3, stride=2, padding=1)
        normalise = nn.BatchNorm2d(out_dim)
        self.conv = nn.Sequential(reduce, normalise)

    def forward(self, x):
        """Return ``x`` after the strided convolution and batch norm."""
        return self.conv(x)
|
||
|
|
||
|
|
||
|
class DeepGCN(torch.nn.Module):
    """Pyramid graph-convolution backbone returning multi-level features.

    The network is a ``Stem``, a learnable positional embedding added to the
    stem output, a sequential backbone of ``Grapher`` + ``FFN`` pairs with a
    ``Downsample`` layer in front of every stage except the first, and a
    1x1-convolution prediction head applied after global average pooling.

    The positional embedding is fixed at ``224 // 4 = 56`` per side, so the
    stem output must be broadcastable to 56x56 (i.e. 224x224 inputs, given the
    two stride-2 convolutions in ``Stem``).

    Args:
        opt: Options object. The attributes read here are ``k``, ``act``,
            ``norm``, ``bias``, ``epsilon``, ``use_stochastic``, ``conv``,
            ``drop_path``, ``blocks``, ``channels``, ``dropout`` and
            ``n_classes``.
    """

    def __init__(self, opt):
        super(DeepGCN, self).__init__()
        print(opt)
        k = opt.k
        act = opt.act
        norm = opt.norm
        bias = opt.bias
        epsilon = opt.epsilon
        stochastic = opt.use_stochastic
        conv = opt.conv
        drop_path = opt.drop_path

        blocks = opt.blocks
        self.n_blocks = sum(blocks)
        channels = opt.channels
        reduce_ratios = [4, 2, 1, 1]
        # Stochastic depth decay rule: rate grows linearly from 0 to drop_path.
        dpr = [x.item() for x in torch.linspace(0, drop_path, self.n_blocks)]
        # Number of knn neighbours per block (constant k for every block).
        num_knn = [int(x.item()) for x in torch.linspace(k, k, self.n_blocks)]
        max_dilation = 49 // max(num_knn)

        self.stem = Stem(out_dim=channels[0], act=act)
        # Side length of the stem output for a 224x224 input.
        side = 224 // 4
        self.pos_embed = nn.Parameter(torch.zeros(1, channels[0], side, side))
        # Number of spatial positions, passed to Grapher as ``n``.
        HW = side * side

        # Collect layers in a plain list and register them once as a
        # Sequential, keeping the same order as before.
        layers = []
        idx = 0
        for i, stage_depth in enumerate(blocks):
            if i > 0:
                layers.append(Downsample(channels[i - 1], channels[i]))
                # Stride-2 downsampling quarters the number of positions.
                HW = HW // 4
            for _ in range(stage_depth):
                layers.append(
                    Seq(Grapher(channels[i], num_knn[idx], min(idx // 4 + 1, max_dilation), conv, act, norm,
                                bias, stochastic, epsilon, reduce_ratios[i], n=HW, drop_path=dpr[idx],
                                relative_pos=True),
                        FFN(channels[i], channels[i] * 4, act=act, drop_path=dpr[idx])
                        )
                )
                idx += 1
        self.backbone = Seq(*layers)

        self.prediction = Seq(nn.Conv2d(channels[-1], 1024, 1, bias=True),
                              nn.BatchNorm2d(1024),
                              act_layer(act),
                              nn.Dropout(opt.dropout),
                              nn.Conv2d(1024, opt.n_classes, 1, bias=True))
        self.model_init()

    def model_init(self):
        """Kaiming-initialise every Conv2d weight and zero its bias."""
        for m in self.modules():
            if not isinstance(m, torch.nn.Conv2d):
                continue
            torch.nn.init.kaiming_normal_(m.weight)
            m.weight.requires_grad = True
            if m.bias is not None:
                m.bias.data.zero_()
                m.bias.requires_grad = True

    def forward(self, inputs):
        """Run the network and return intermediate and final features.

        Args:
            inputs: Image batch fed to the stem.

        Returns:
            A tuple ``(low_feature, mid_feature, solo_feature)``.
            ``low_feature`` and ``mid_feature`` are the outputs of backbone
            layers 5 and 12 (``None`` when the backbone has fewer layers);
            ``solo_feature`` is the prediction head output on the globally
            average-pooled final feature map, with the two trailing spatial
            dimensions squeezed away.
        """
        x = self.stem(inputs) + self.pos_embed
        low_feature, mid_feature = None, None
        # NOTE: the tap indices are hard-coded. With blocks = [2, 2, 6, 2]
        # they are the outputs of the second and third Downsample layers.
        for i, layer in enumerate(self.backbone):
            x = layer(x)
            if i == 5:
                low_feature = x
            elif i == 12:
                mid_feature = x
        x = F.adaptive_avg_pool2d(x, 1)
        solo_feature = self.prediction(x).squeeze(-1).squeeze(-1)
        return low_feature, mid_feature, solo_feature
|
||
|
|
||
|
|
||
|
def pvig_ti_224_gelu(pretrained=False, **kwargs):
    """Build the tiny pyramid ViG model with a 512-channel feature head.

    A ``DeepGCN`` is created from the fixed "ti" options below, checkpoint
    weights are loaded non-strictly when available, and the prediction head is
    then replaced by a fresh ``Conv(…, 512) -> BN -> GELU -> Dropout ->
    Conv(512, 512)`` stack, so the returned model emits 512-dimensional
    features rather than class logits.

    Args:
        pretrained: When true, a missing checkpoint file is an error. When
            false, a missing checkpoint only triggers a warning and the model
            keeps its random initialisation. For backward compatibility the
            checkpoint is loaded whenever the file exists, regardless of this
            flag.
        **kwargs: ``num_classes`` and ``drop_path_rate`` are forwarded to the
            options object; other keys are ignored by it. The optional
            ``checkpoint_path`` key overrides the checkpoint location
            (defaults to the original hard-coded path).

    Returns:
        The constructed ``DeepGCN`` model.

    Raises:
        FileNotFoundError: If ``pretrained`` is true and the checkpoint file
            does not exist.
    """
    import os
    import warnings

    class OptInit:
        def __init__(self, num_classes=1000, drop_path_rate=0.0, **kwargs):
            self.k = 9  # neighbor num (default:9)
            self.conv = 'mr'  # graph conv layer {edge, mr}
            self.act = 'gelu'  # activation layer {relu, prelu, leakyrelu, gelu, hswish}
            self.norm = 'batch'  # batch or instance normalization {batch, instance}
            self.bias = True  # bias of conv layer True or False
            self.dropout = 0.0  # dropout rate
            self.use_dilation = True  # use dilated knn or not
            self.epsilon = 0.2  # stochastic epsilon for gcn
            self.use_stochastic = False  # stochastic for gcn, True or False
            self.drop_path = drop_path_rate
            self.blocks = [2, 2, 6, 2]  # number of basic blocks in the backbone
            self.channels = [48, 96, 240, 384]  # number of channels of deep features
            self.n_classes = num_classes  # Dimension of out_channels
            self.emb_dims = 1024  # Dimension of embeddings

    # Optional override so the function is usable outside the original
    # developer's machine; the default keeps the historical location.
    checkpoint_path = kwargs.pop('checkpoint_path', '/home/wzm/GAC/model/pvig_ti_78.5.pth.tar')

    opt = OptInit(**kwargs)
    model = DeepGCN(opt)
    model.default_cfg = default_cfgs['vig_224_gelu']

    # Load the pretrained model. map_location='cpu' lets a checkpoint saved
    # from GPU tensors load on CPU-only hosts; load_state_dict copies the
    # values into the model's own parameters afterwards.
    if checkpoint_path and os.path.isfile(checkpoint_path):
        state_dict = torch.load(checkpoint_path, map_location='cpu')
        # strict=False: keys that do not match the model are skipped.
        model.load_state_dict(state_dict, strict=False)
    elif pretrained:
        raise FileNotFoundError(
            'Pretrained checkpoint not found: {!r}'.format(checkpoint_path))
    else:
        warnings.warn(
            'Checkpoint {!r} not found; model keeps its random '
            'initialisation.'.format(checkpoint_path))

    # Replace the classification head with a 512-dimensional feature head.
    # The input width follows the last backbone stage (384 for this variant).
    model.prediction = Seq(nn.Conv2d(opt.channels[-1], 512, 1, bias=True),
                           nn.BatchNorm2d(512),
                           act_layer('gelu'),
                           nn.Dropout(opt.dropout),
                           nn.Conv2d(512, 512, 1, bias=True))

    # NOTE: all parameters are left trainable. To fine-tune only the new head,
    # set requires_grad = False on every parameter whose name does not contain
    # "prediction".
    return model
|