main
朱瑞 2024-07-04 17:03:29 +08:00
commit 4a2986bde5
30 changed files with 5342 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,140 @@
# ignore map, miou, datasets
map_out/
miou_out/
VOCdevkit/
datasets/
Medical_Datasets/
lfw/
logs/
model_data/
.temp_map_out/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

Dataset_Partition.py Normal file

@@ -0,0 +1,161 @@
import os
import random
import xml.etree.ElementTree as ET
import numpy as np
from utils.utils import get_classes
# --------------------------------------------------------------------------------------------------------------------------------#
# annotation_mode selects what this script computes when it is run.
# annotation_mode = 0 runs the whole label-processing pipeline: it creates the txt files in VOCdevkit/VOC2007/ImageSets as well as the 2007_train.txt and 2007_val.txt used for training.
# annotation_mode = 1 only creates the txt files in VOCdevkit/VOC2007/ImageSets.
# annotation_mode = 2 only creates the 2007_train.txt and 2007_val.txt used for training.
# --------------------------------------------------------------------------------------------------------------------------------#
annotation_mode = 0
# -------------------------------------------------------------------#
# classes_path must be modified: it provides the class information used to generate 2007_train.txt and 2007_val.txt.
# It should match the classes_path used for training and prediction.
# If the generated 2007_train.txt contains no object information,
# the classes were not set correctly.
# Only used when annotation_mode is 0 or 2.
# -------------------------------------------------------------------#
classes_path = 'model_data/voc_classes.txt'
# --------------------------------------------------------------------------------------------------------------------------------#
# trainval_percent specifies the ratio of (train + val) to test; by default (train + val) : test = 9 : 1.
# train_percent specifies the ratio of train to val within (train + val); by default train : val = 9 : 1.
# Only used when annotation_mode is 0 or 1.
# --------------------------------------------------------------------------------------------------------------------------------#
trainval_percent = 0.9
train_percent = 0.9
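# Worked example of the split sizes implied by the two ratios above (the image count is an assumed
# illustration, not taken from any dataset): with num = 1000 annotated images,
#   tv = int(1000 * 0.9) = 900 images in train + val
#   tr = int(900 * 0.9)  = 810 images in train
# leaving 90 images for val and 100 for test.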
# -------------------------------------------------------#
# Path of the folder containing the VOC dataset.
# Defaults to the VOC dataset in the repository root.
# -------------------------------------------------------#
VOCdevkit_path = 'VOCdevkit'
VOCdevkit_sets = [('2007', 'train'), ('2007', 'val')]
classes, _ = get_classes(classes_path)
# -------------------------------------------------------#
# Count the number of objects.
# -------------------------------------------------------#
photo_nums = np.zeros(len(VOCdevkit_sets)) # number of images in train and in val
nums = np.zeros(len(classes))
def convert_annotation(year, image_id, list_file):
in_file = open(os.path.join(VOCdevkit_path, 'VOC%s/Annotations/%s.xml' % (year, image_id)), encoding='utf-8')
tree = ET.parse(in_file)
root = tree.getroot()
for obj in root.iter('object'):
difficult = 0
if obj.find('difficult') != None:
difficult = obj.find('difficult').text
cls = obj.find('name').text
if cls not in classes or int(difficult) == 1:
continue
cls_id = classes.index(cls)
xmlbox = obj.find('bndbox')
b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)),
int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text)))
list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
nums[classes.index(cls)] = nums[classes.index(cls)] + 1 # count the instances of each class
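# Sketch of the line format that convert_annotation and the loop below write into 2007_train.txt /
# 2007_val.txt (path and numbers are hypothetical, shown only to illustrate the format):
#   /abs/path/VOCdevkit/VOC2007/JPEGImages/000001.jpg 48,240,195,371,11 8,12,352,498,14
# i.e. the absolute image path followed by one "xmin,ymin,xmax,ymax,class_id" group per object.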
if __name__ == "__main__":
random.seed(0)
if " " in os.path.abspath(VOCdevkit_path):
raise ValueError("The dataset folder path and the image file names must not contain spaces, otherwise training will not work correctly. Please rename them.")
if annotation_mode == 0 or annotation_mode == 1:
print("Generate txt in ImageSets.")
xmlfilepath = os.path.join(VOCdevkit_path, 'VOC2007/Annotations')
saveBasePath = os.path.join(VOCdevkit_path, 'VOC2007/ImageSets')
temp_xml = os.listdir(xmlfilepath)
total_xml = []
for xml in temp_xml:
if xml.endswith(".xml"):
total_xml.append(xml)
num = len(total_xml)
list = range(num)
tv = int(num * trainval_percent) # total number of train + val samples
tr = int(tv * train_percent) # number of training samples within train + val
trainval = random.sample(list, tv) # sample the train + val indices from all indices
train = random.sample(trainval, tr) # sample the train indices from trainval
print("train and val size", tv)
print("train size", tr)
ftrainval = open(os.path.join(saveBasePath, 'trainval.txt'), 'w')
ftest = open(os.path.join(saveBasePath, 'test.txt'), 'w')
ftrain = open(os.path.join(saveBasePath, 'train.txt'), 'w')
fval = open(os.path.join(saveBasePath, 'val.txt'), 'w')
for i in list:
name = total_xml[i][:-4] + '\n'
if i in trainval:
ftrainval.write(name)
if i in train:
ftrain.write(name)
else:
fval.write(name)
else:
ftest.write(name)
ftrainval.close()
ftrain.close()
fval.close()
ftest.close()
print("Generate txt in ImageSets done.")
if annotation_mode == 0 or annotation_mode == 2:
print("Generate 2007_train.txt and 2007_val.txt for train.")
type_index = 0
for year, image_set in VOCdevkit_sets:
image_ids = open(os.path.join(VOCdevkit_path, 'VOC%s/ImageSets/Main/%s.txt' % (year, image_set)),
encoding='utf-8').read().strip().split()
list_file = open('%s_%s.txt' % (year, image_set), 'w', encoding='utf-8')
for image_id in image_ids:
list_file.write(
'%s/VOC%s/JPEGImages/%s.jpg' % (os.path.abspath(VOCdevkit_path), year, image_id)) # the image path is assembled from its parts
convert_annotation(year, image_id, list_file)
list_file.write('\n')
photo_nums[type_index] = len(image_ids)
type_index += 1
list_file.close()
print("Generate 2007_train.txt and 2007_val.txt for train done.")
def printTable(List1, List2):
for i in range(len(List1[0])):
print("|", end=' ')
for j in range(len(List1)):
print(List1[j][i].rjust(int(List2[j])), end=' ')
print("|", end=' ')
print()
str_nums = [str(int(x)) for x in nums]
tableData = [
classes, str_nums
]
colWidths = [0] * len(tableData)
len1 = 0
for i in range(len(tableData)):
for j in range(len(tableData[i])):
if len(tableData[i][j]) > colWidths[i]:
colWidths[i] = len(tableData[i][j])
printTable(tableData, colWidths)
if photo_nums[0] <= 500:
print("训练集数量小于500属于较小的数据量请注意设置较大的训练世代Epoch以满足足够的梯度下降次数Step")
if np.sum(nums) == 0:
print("在数据集中并未获得任何目标请注意修改classes_path对应自己的数据集并且保证标签名字正确否则训练将会没有任何效果")
print("在数据集中并未获得任何目标请注意修改classes_path对应自己的数据集并且保证标签名字正确否则训练将会没有任何效果")
print("在数据集中并未获得任何目标请注意修改classes_path对应自己的数据集并且保证标签名字正确否则训练将会没有任何效果")
print("(重要的事情说三遍)。")

LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 JiaQi Xu
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

ad_train.py Normal file

@@ -0,0 +1,46 @@
from torch import optim
class BaseConfig(object):
"""
Default parameters for all config files.
"""
def __init__(self):
"""
Set the defaults.
"""
self.img_dir = "inria/Train/pos"
self.lab_dir = "inria/Train/pos/yolo-labels"
self.cfgfile = "cfg/yolo.cfg"
self.weightfile = "weights/yolo.weights"
self.printfile = "non_printability/30values.txt"
self.patch_size = 300
self.start_learning_rate = 0.03
self.patch_name = 'base'
self.scheduler_factory = lambda x: optim.lr_scheduler.ReduceLROnPlateau(x, 'min', patience=50)
self.max_tv = 0
self.batch_size = 20
self.loss_target = lambda obj, cls: obj * cls
class ReproducePaperObj(BaseConfig):
"""
Reproduce the results from the paper: Generate a patch that minimises object score.
"""
def __init__(self):
super().__init__()
self.batch_size = 8
self.patch_size = 300
self.patch_name = 'ObjectOnlyPaper'
self.max_tv = 0.165
self.loss_target = lambda obj, cls: obj
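# Minimal usage sketch (an assumption about how these configs are consumed by a training loop; the
# nn.Conv2d below is only a stand-in for the parameters being optimised, not part of this repo):
if __name__ == "__main__":
    import torch.nn as nn

    cfg = ReproducePaperObj()
    model = nn.Conv2d(3, 3, 1)  # placeholder module
    optimizer = optim.Adam(model.parameters(), lr=cfg.start_learning_rate)
    scheduler = cfg.scheduler_factory(optimizer)  # ReduceLROnPlateau(optimizer, 'min', patience=50)
    print(cfg.patch_name, cfg.loss_target(0.9, 0.7))  # ObjectOnlyPaper keeps only the objectness term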

get_map.py Normal file

@@ -0,0 +1,138 @@
import os
import xml.etree.ElementTree as ET
from PIL import Image
from tqdm import tqdm
from utils.utils import get_classes
from utils.utils_map import get_coco_map, get_map
from yolo import YOLO
if __name__ == "__main__":
'''
Unlike AP, Recall and Precision are not area-based metrics, so their values differ at different confidence thresholds.
By default, the Recall and Precision computed by this script correspond to a confidence threshold of 0.5.
Because of how mAP is computed, the network has to output nearly all of its predicted boxes so that Recall and Precision can be evaluated at every threshold.
The txt files in map_out/detection-results/ therefore usually contain more boxes than a direct predict run; the goal is to list every possible prediction.
'''
# ------------------------------------------------------------------------------------------------------------------#
# map_mode selects what this script computes when it is run.
# map_mode = 0 runs the whole mAP pipeline: obtain the predictions, obtain the ground-truth boxes, and compute the VOC mAP.
# map_mode = 1 only obtains the predictions.
# map_mode = 2 only obtains the ground-truth boxes.
# map_mode = 3 only computes the VOC mAP.
# map_mode = 4 uses the COCO toolbox to compute mAP 0.50:0.95 for the current dataset. The predictions and ground truth must already have been generated, and pycocotools must be installed.
# -------------------------------------------------------------------------------------------------------------------#
map_mode = 0
# --------------------------------------------------------------------------------------#
# classes_path here specifies the classes for which the VOC mAP is measured.
# It should normally match the classes_path used for training and prediction.
# --------------------------------------------------------------------------------------#
classes_path = 'model_data/voc_classes.txt'
# --------------------------------------------------------------------------------------#
# MINOVERLAP specifies which mAP0.x to compute; look up the meaning of mAP0.x if it is unfamiliar.
# For example, to compute mAP0.75, set MINOVERLAP = 0.75.
#
# A predicted box whose overlap with a ground-truth box exceeds MINOVERLAP is counted as a positive sample, otherwise as a negative sample.
# The larger MINOVERLAP is, the more accurately a box must be predicted to count as positive, and the lower the resulting mAP.
# --------------------------------------------------------------------------------------#
MINOVERLAP = 0.5
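# Illustrative example (the IoU value is assumed, not computed by this script): a predicted box with
# IoU = 0.6 against its matched ground-truth box counts as a true positive at MINOVERLAP = 0.5,
# but as a false positive at MINOVERLAP = 0.75.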
# --------------------------------------------------------------------------------------#
# Because of how mAP is computed, the network has to output nearly all of its predicted boxes,
# so confidence should be set as small as possible to collect every candidate box.
#
# This value should normally not be changed: computing mAP needs almost all predicted boxes, so this confidence must not be altered casually.
# To obtain Recall and Precision at a different threshold, change score_threhold below instead.
# --------------------------------------------------------------------------------------#
confidence = 0.001
# --------------------------------------------------------------------------------------#
# Non-maximum-suppression IoU used at prediction time; the larger it is, the less strict NMS becomes.
#
# This value should normally not be changed.
# --------------------------------------------------------------------------------------#
nms_iou = 0.5
# ---------------------------------------------------------------------------------------------------------------#
# Unlike AP, Recall and Precision are not area-based metrics, so their values differ at different thresholds.
#
# By default, the Recall and Precision computed here correspond to a threshold of 0.5 (defined here as score_threhold).
# Because computing mAP needs almost all predicted boxes, the confidence defined above must not be changed.
# A separate score_threhold is therefore defined as the threshold at which the matching Recall and Precision are reported.
# ---------------------------------------------------------------------------------------------------------------#
score_threhold = 0.5
# -------------------------------------------------------#
# map_vis enables visualisation of the VOC mAP computation.
# -------------------------------------------------------#
map_vis = False
# -------------------------------------------------------#
# Path of the folder containing the VOC dataset.
# Defaults to the VOC dataset in the repository root.
# -------------------------------------------------------#
VOCdevkit_path = 'VOCdevkit'
# -------------------------------------------------------#
# Output folder for the results, map_out by default.
# -------------------------------------------------------#
map_out_path = 'map_out'
image_ids = open(os.path.join(VOCdevkit_path, "VOC2007/ImageSets/Main/test.txt")).read().strip().split()
if not os.path.exists(map_out_path):
os.makedirs(map_out_path)
if not os.path.exists(os.path.join(map_out_path, 'ground-truth')):
os.makedirs(os.path.join(map_out_path, 'ground-truth'))
if not os.path.exists(os.path.join(map_out_path, 'detection-results')):
os.makedirs(os.path.join(map_out_path, 'detection-results'))
if not os.path.exists(os.path.join(map_out_path, 'images-optional')):
os.makedirs(os.path.join(map_out_path, 'images-optional'))
class_names, _ = get_classes(classes_path)
if map_mode == 0 or map_mode == 1:
print("Load model.")
yolo = YOLO(confidence=confidence, nms_iou=nms_iou)
print("Load model done.")
print("Get predict result.")
for image_id in tqdm(image_ids):
image_path = os.path.join(VOCdevkit_path, "VOC2007/JPEGImages/" + image_id + ".jpg")
image = Image.open(image_path)
if map_vis:
image.save(os.path.join(map_out_path, "images-optional/" + image_id + ".jpg"))
yolo.get_map_txt(image_id, image, class_names, map_out_path)
print("Get predict result done.")
if map_mode == 0 or map_mode == 2:
print("Get ground truth result.")
for image_id in tqdm(image_ids):
with open(os.path.join(map_out_path, "ground-truth/" + image_id + ".txt"), "w") as new_f:
root = ET.parse(os.path.join(VOCdevkit_path, "VOC2007/Annotations/" + image_id + ".xml")).getroot()
for obj in root.findall('object'):
difficult_flag = False
if obj.find('difficult') != None:
difficult = obj.find('difficult').text
if int(difficult) == 1:
difficult_flag = True
obj_name = obj.find('name').text
if obj_name not in class_names:
continue
bndbox = obj.find('bndbox')
left = bndbox.find('xmin').text
top = bndbox.find('ymin').text
right = bndbox.find('xmax').text
bottom = bndbox.find('ymax').text
if difficult_flag:
new_f.write("%s %s %s %s %s difficult\n" % (obj_name, left, top, right, bottom))
else:
new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
print("Get ground truth result done.")
if map_mode == 0 or map_mode == 3:
print("Get map.")
get_map(MINOVERLAP, True, score_threhold=score_threhold, path=map_out_path)
print("Get map done.")
if map_mode == 4:
print("Get map.")
get_coco_map(class_names=class_names, path=map_out_path)
print("Get map done.")

kmeans_for_anchors.py Normal file

@@ -0,0 +1,167 @@
# -------------------------------------------------------------------------------------------------------#
# Although k-means clusters the boxes in the dataset, many datasets contain boxes of similar size, so the nine clustered anchors end up close to each other.
# Such anchors are actually unfavourable for training, because different feature levels suit different anchor sizes: the smaller the feature map, the larger the anchors it should handle.
# The original network's anchors are already split into large/medium/small, so very good results are obtained even without clustering.
# -------------------------------------------------------------------------------------------------------#
import glob
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
def cas_iou(box, cluster):
x = np.minimum(cluster[:, 0], box[0])
y = np.minimum(cluster[:, 1], box[1])
intersection = x * y
area1 = box[0] * box[1]
area2 = cluster[:, 0] * cluster[:, 1]
iou = intersection / (area1 + area2 - intersection)
return iou
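# Worked example for cas_iou (numbers are assumed, for illustration only): for box = (0.2, 0.6) and a
# single cluster (0.4, 0.3), the overlap is min(0.2, 0.4) * min(0.6, 0.3) = 0.06, the union is
# 0.12 + 0.12 - 0.06 = 0.18, so the width/height IoU is 0.06 / 0.18 = 1/3.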
def avg_iou(box, cluster):
return np.mean([np.max(cas_iou(box[i], cluster)) for i in range(box.shape[0])])
def kmeans(box, k):
# -------------------------------------------------------------#
# number of boxes in total
# -------------------------------------------------------------#
row = box.shape[0]
# -------------------------------------------------------------#
# distance from every box to every cluster centre
# -------------------------------------------------------------#
distance = np.empty((row, k))
# -------------------------------------------------------------#
# cluster assignment from the previous iteration
# -------------------------------------------------------------#
last_clu = np.zeros((row,))
np.random.seed()
# -------------------------------------------------------------#
# randomly choose k boxes as the initial cluster centres
# -------------------------------------------------------------#
cluster = box[np.random.choice(row, k, replace=False)]
iter = 0
while True:
# -------------------------------------------------------------#
# compute the (1 - IoU) distance between every box and the current cluster centres
# -------------------------------------------------------------#
for i in range(row):
distance[i] = 1 - cas_iou(box[i], cluster)
# -------------------------------------------------------------#
# assign each box to its closest cluster centre
# -------------------------------------------------------------#
near = np.argmin(distance, axis=1)
if (last_clu == near).all():
break
# -------------------------------------------------------------#
# take the median of each cluster as its new centre
# -------------------------------------------------------------#
for j in range(k):
cluster[j] = np.median(
box[near == j], axis=0)
last_clu = near
if iter % 5 == 0:
print('iter: {:d}. avg_iou:{:.2f}'.format(iter, avg_iou(box, cluster)))
iter += 1
return cluster, near
def load_data(path):
data = []
# -------------------------------------------------------------#
# collect the boxes from every xml file
# -------------------------------------------------------------#
for xml_file in tqdm(glob.glob('{}/*xml'.format(path))):
tree = ET.parse(xml_file)
height = int(tree.findtext('./size/height'))
width = int(tree.findtext('./size/width'))
if height <= 0 or width <= 0:
continue
# -------------------------------------------------------------#
# get the width and height of every object
# -------------------------------------------------------------#
for obj in tree.iter('object'):
xmin = int(float(obj.findtext('bndbox/xmin'))) / width
ymin = int(float(obj.findtext('bndbox/ymin'))) / height
xmax = int(float(obj.findtext('bndbox/xmax'))) / width
ymax = int(float(obj.findtext('bndbox/ymax'))) / height
xmin = np.float64(xmin)
ymin = np.float64(ymin)
xmax = np.float64(xmax)
ymax = np.float64(ymax)
# width and height
data.append([xmax - xmin, ymax - ymin])
return np.array(data)
if __name__ == '__main__':
np.random.seed(0)
# -------------------------------------------------------------#
# Running this script processes the xml files in './VOCdevkit/VOC2007/Annotations'
# and generates yolo_anchors.txt.
# -------------------------------------------------------------#
input_shape = [416, 416]
anchors_num = 9
# -------------------------------------------------------------#
# Load the dataset; VOC-style xml annotations can be used.
# -------------------------------------------------------------#
path = 'VOCdevkit/VOC2007/Annotations'
# -------------------------------------------------------------#
# Load all xml files.
# Boxes are stored as width,height normalised by the image size.
# -------------------------------------------------------------#
print('Load xmls.')
data = load_data(path)
print('Load xmls done.')
# -------------------------------------------------------------#
# Run the k-means clustering.
# -------------------------------------------------------------#
print('K-means boxes.')
cluster, near = kmeans(data, anchors_num)
print('K-means boxes done.')
data = data * np.array([input_shape[1], input_shape[0]])
cluster = cluster * np.array([input_shape[1], input_shape[0]])
# -------------------------------------------------------------#
# Plot the result.
# -------------------------------------------------------------#
for j in range(anchors_num):
plt.scatter(data[near == j][:, 0], data[near == j][:, 1])
plt.scatter(cluster[j][0], cluster[j][1], marker='x', c='black')
plt.savefig("kmeans_for_anchors.jpg")
plt.show()
print('Save kmeans_for_anchors.jpg in root dir.')
cluster = cluster[np.argsort(cluster[:, 0] * cluster[:, 1])]
print('avg_ratio:{:.2f}'.format(avg_iou(data, cluster)))
print(cluster)
f = open("yolo_anchors.txt", 'w')
row = np.shape(cluster)[0]
for i in range(row):
if i == 0:
x_y = "%d,%d" % (cluster[i][0], cluster[i][1])
else:
x_y = ", %d,%d" % (cluster[i][0], cluster[i][1])
f.write(x_y)
f.close()
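# The resulting yolo_anchors.txt holds the nine widths and heights on a single line, sorted by area.
# For illustration only (the values below are the classic YOLOv3 COCO anchors, not output of this script):
# 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326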

load_data.py Normal file

@@ -0,0 +1,531 @@
import fnmatch
import math
import os
import sys
import time
from operator import itemgetter
import gc
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
# from darknet import Darknet
from median_pool import MedianPool2d
# print('starting test read')
# im = Image.open('data/horse.jpg').convert('RGB')
# print('img read!')
class MaxProbExtractor(nn.Module):
"""MaxProbExtractor: extracts max class probability for class from YOLO output.
Module providing the functionality necessary to extract the max class probability for one class from YOLO output.
"""
def __init__(self, cls_id, num_cls, config):
super(MaxProbExtractor, self).__init__()
self.cls_id = cls_id
self.num_cls = num_cls
self.config = config
self.anchor_num = 3
def forward(self, YOLOoutput):
# get values neccesary for transformation
if YOLOoutput.dim() == 3:
YOLOoutput = YOLOoutput.unsqueeze(0)
batch = YOLOoutput.size(0)
assert (YOLOoutput.size(1) == (5 + self.num_cls) * self.anchor_num)
h = YOLOoutput.size(2)
w = YOLOoutput.size(3)
# transform the output tensor from [batch, 425, 19, 19] to [batch, 80, 1805]
output = YOLOoutput.view(batch, self.anchor_num, 5 + self.num_cls, h * w) # [batch, 5, 85, 361]
output = output.transpose(1, 2).contiguous() # [batch, 85, 5, 361]
output = output.view(batch, 5 + self.num_cls, self.anchor_num * h * w) # [batch, 85, 1805]
output_objectness = torch.sigmoid(output[:, 4, :]) # [batch, 1805] # objectness: is there an object
output = output[:, 5:5 + self.num_cls, :] # [batch, 80, 1805]
# perform softmax to normalize probabilities for object classes to [0,1]
normal_confs = torch.nn.Softmax(dim=1)(output) # class probabilities
# we only care for probabilities of the class of interest (person)
confs_for_class = normal_confs[:, self.cls_id, :] # the class index cls_id corresponds to "person"
confs_if_object = output_objectness # confs_for_class * output_objectness
confs_if_object = confs_for_class * output_objectness
confs_if_object = self.config.loss_target(output_objectness, confs_for_class)
# find the max probability for person
max_conf, max_conf_idx = torch.max(confs_if_object, dim=1)
return max_conf
class NPSCalculator(nn.Module):
"""NMSCalculator: calculates the non-printability score of a patch.
Module providing the functionality necessary to calculate the non-printability score (NMS) of an adversarial patch.
"""
def __init__(self, printability_file, patch_side):
super(NPSCalculator, self).__init__()
self.printability_array = nn.Parameter(self.get_printability_array(printability_file, patch_side),
requires_grad=False)
def forward(self, adv_patch):
# calculate euclidian distance between colors in patch and colors in printability_array
# square root of sum of squared difference
color_dist = (adv_patch - self.printability_array + 0.000001)
color_dist = color_dist ** 2
color_dist = torch.sum(color_dist, 1) + 0.000001
color_dist = torch.sqrt(color_dist)
# only work with the min distance
color_dist_prod = torch.min(color_dist, 0)[0] # test: change prod for min (find distance to closest color)
# calculate the nps by summing over all pixels
nps_score = torch.sum(color_dist_prod, 0)
nps_score = torch.sum(nps_score, 0)
return nps_score / torch.numel(adv_patch)
def get_printability_array(self, printability_file, side):
printability_list = []
# read in printability triplets and put them in a list
with open(printability_file) as f:
for line in f:
printability_list.append(line.split(","))
printability_array = []
for printability_triplet in printability_list:
printability_imgs = []
red, green, blue = printability_triplet
printability_imgs.append(np.full((side, side), red))
printability_imgs.append(np.full((side, side), green))
printability_imgs.append(np.full((side, side), blue))
printability_array.append(printability_imgs)
printability_array = np.asarray(printability_array)
printability_array = np.float32(printability_array)
pa = torch.from_numpy(printability_array)
return pa
class TotalVariation(nn.Module):
"""TotalVariation: calculates the total variation of a patch.
Module providing the functionality necessary to calculate the total Variation (TV) of an adversarial patch.
"""
def __init__(self):
super(TotalVariation, self).__init__()
def forward(self, adv_patch):
# compute the total variation of the adv_patch
tvcomp1 = torch.sum(torch.abs(adv_patch[:, :, 1:] - adv_patch[:, :, :-1] + 0.000001), 0)
tvcomp1 = torch.sum(torch.sum(tvcomp1, 0), 0)
tvcomp2 = torch.sum(torch.abs(adv_patch[:, 1:, :] - adv_patch[:, :-1, :] + 0.000001), 0)
tvcomp2 = torch.sum(torch.sum(tvcomp2, 0), 0)
tv = tvcomp1 + tvcomp2
return tv / torch.numel(adv_patch)
class PatchTransformer(nn.Module):
"""PatchTransformer: transforms batch of patches
Module providing the functionality necessary to transform a batch of patches, randomly adjusting brightness and
contrast, adding random amount of noise, and rotating randomly. Resizes-patches according to as size based on the
batch of labels, and pads them to the dimension of an image.
"""
def __init__(self):
super(PatchTransformer, self).__init__()
self.min_contrast = 0.8
self.max_contrast = 1.2
self.min_brightness = -0.1
self.max_brightness = 0.1
self.noise_factor = 0.10
self.minangle = -20 / 180 * math.pi
self.maxangle = 20 / 180 * math.pi
self.medianpooler = MedianPool2d(7, same=True) # median pooling
'''
kernel = torch.cuda.FloatTensor([[0.003765, 0.015019, 0.023792, 0.015019, 0.003765],
[0.015019, 0.059912, 0.094907, 0.059912, 0.015019],
[0.023792, 0.094907, 0.150342, 0.094907, 0.023792],
[0.015019, 0.059912, 0.094907, 0.059912, 0.015019],
[0.003765, 0.015019, 0.023792, 0.015019, 0.003765]])
self.kernel = kernel.unsqueeze(0).unsqueeze(0).expand(3,3,-1,-1)
'''
def forward(self, adv_patch, lab_batch, img_size, do_rotate=True, rand_loc=True):
# adv_patch = F.conv2d(adv_patch.unsqueeze(0),self.kernel,padding=(2,2))
adv_patch = self.medianpooler(adv_patch.unsqueeze(0))
# Determine size of padding
pad = (img_size - adv_patch.size(-1)) / 2
# Make a batch of patches
adv_patch = adv_patch.unsqueeze(0) # .unsqueeze(0) # add another dimension here, giving a 5-D tensor: 1, 1, 3, 300, 300
adv_batch = adv_patch.expand(lab_batch.size(0), lab_batch.size(1), -1, -1, -1) # note: adv_batch, not adv_patch; shape 8, 14, 3, 300, 300
batch_size = torch.Size((lab_batch.size(0), lab_batch.size(1))) # 8, 14
# Contrast, brightness and noise transforms
# Create random contrast tensor
contrast = torch.cuda.FloatTensor(batch_size).uniform_(self.min_contrast, self.max_contrast)
contrast = contrast.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
contrast = contrast.expand(-1, -1, adv_batch.size(-3), adv_batch.size(-2), adv_batch.size(-1))
contrast = contrast.cuda()
# Create random brightness tensor
brightness = torch.cuda.FloatTensor(batch_size).uniform_(self.min_brightness, self.max_brightness)
brightness = brightness.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
brightness = brightness.expand(-1, -1, adv_batch.size(-3), adv_batch.size(-2), adv_batch.size(-1))
brightness = brightness.cuda()
# Create random noise tensor
noise = torch.cuda.FloatTensor(adv_batch.size()).uniform_(-1, 1) * self.noise_factor
# Apply contrast/brightness/noise, clamp
adv_batch = adv_batch * contrast + brightness + noise
adv_batch = torch.clamp(adv_batch, 0.000001, 0.99999) # clamp to the (0, 1) range
# Where the label class_id is 1 we don't want a patch (padding) --> fill mask with zero's
cls_ids = torch.narrow(lab_batch, 2, 0, 1) # torch.narrow(input, dim, start, length) returns a view (shared memory) of indices start..start+length-1 along dim
cls_mask = cls_ids.expand(-1, -1, 3) # continuing: this takes the class-id column of lab_batch, i.e. lab_batch[..., 0]
cls_mask = cls_mask.unsqueeze(-1)
cls_mask = cls_mask.expand(-1, -1, -1, adv_batch.size(3))
cls_mask = cls_mask.unsqueeze(-1)
cls_mask = cls_mask.expand(-1, -1, -1, -1, adv_batch.size(4)) # cls_mask has shape 8, 14, 3, 300, 300 and holds the class ids
msk_batch = torch.cuda.FloatTensor(cls_mask.size()).fill_(1) - cls_mask # build the mask that selects the person labels
# Pad patch and mask to image dimensions
mypad = nn.ConstantPad2d((int(pad + 0.5), int(pad), int(pad + 0.5), int(pad)), 0) # (padding_left, padding_right, padding_top, padding_bottom), filled with zeros
adv_batch = mypad(adv_batch) # zero-pad up to 416
msk_batch = mypad(msk_batch)
# Rotation and rescaling transforms
anglesize = (lab_batch.size(0) * lab_batch.size(1)) # number of rotation angles (one per label slot)
if do_rotate:
angle = torch.cuda.FloatTensor(anglesize).uniform_(self.minangle, self.maxangle)
else:
angle = torch.cuda.FloatTensor(anglesize).fill_(0)
# Resizes and rotates
current_patch_size = adv_patch.size(-1)
lab_batch_scaled = torch.cuda.FloatTensor(lab_batch.size()).fill_(0) # lab_batch_scaled holds the sizes in full-image pixels
lab_batch_scaled[:, :, 1] = lab_batch[:, :, 1] * img_size
lab_batch_scaled[:, :, 2] = lab_batch[:, :, 2] * img_size
lab_batch_scaled[:, :, 3] = lab_batch[:, :, 3] * img_size
lab_batch_scaled[:, :, 4] = lab_batch[:, :, 4] * img_size
target_size = torch.sqrt(
((lab_batch_scaled[:, :, 3].mul(0.2)) ** 2) + ((lab_batch_scaled[:, :, 4].mul(0.2)) ** 2))
target_x = lab_batch[:, :, 1].view(np.prod(batch_size))
target_y = lab_batch[:, :, 2].view(np.prod(batch_size))
targetoff_x = lab_batch[:, :, 3].view(np.prod(batch_size))
targetoff_y = lab_batch[:, :, 4].view(np.prod(batch_size))
if (rand_loc):
off_x = targetoff_x * (torch.cuda.FloatTensor(targetoff_x.size()).uniform_(-0.4, 0.4))
target_x = target_x + off_x
off_y = targetoff_y * (torch.cuda.FloatTensor(targetoff_y.size()).uniform_(-0.4, 0.4))
target_y = target_y + off_y
target_y = target_y - 0.05
scale = target_size / current_patch_size # scale factor of the target size relative to the patch size?
scale = scale.view(anglesize)
s = adv_batch.size()
adv_batch = adv_batch.view(s[0] * s[1], s[2], s[3], s[4])
msk_batch = msk_batch.view(s[0] * s[1], s[2], s[3], s[4])
tx = (-target_x + 0.5) * 2
ty = (-target_y + 0.5) * 2
sin = torch.sin(angle)
cos = torch.cos(angle)
# Theta = rotation,rescale matrix
theta = torch.cuda.FloatTensor(anglesize, 2, 3).fill_(0)
theta[:, 0, 0] = cos / scale
theta[:, 0, 1] = sin / scale
theta[:, 0, 2] = tx * cos / scale + ty * sin / scale
theta[:, 1, 0] = -sin / scale
theta[:, 1, 1] = cos / scale
theta[:, 1, 2] = -tx * sin / scale + ty * cos / scale
b_sh = adv_batch.shape
grid = F.affine_grid(theta, adv_batch.shape)
adv_batch_t = F.grid_sample(adv_batch, grid)
msk_batch_t = F.grid_sample(msk_batch, grid)
'''
# Theta2 = translation matrix
theta2 = torch.cuda.FloatTensor(anglesize, 2, 3).fill_(0)
theta2[:, 0, 0] = 1
theta2[:, 0, 1] = 0
theta2[:, 0, 2] = (-target_x + 0.5) * 2
theta2[:, 1, 0] = 0
theta2[:, 1, 1] = 1
theta2[:, 1, 2] = (-target_y + 0.5) * 2
grid2 = F.affine_grid(theta2, adv_batch.shape)
adv_batch_t = F.grid_sample(adv_batch_t, grid2)
msk_batch_t = F.grid_sample(msk_batch_t, grid2)
'''
adv_batch_t = adv_batch_t.view(s[0], s[1], s[2], s[3], s[4])
msk_batch_t = msk_batch_t.view(s[0], s[1], s[2], s[3], s[4])
adv_batch_t = torch.clamp(adv_batch_t, 0.000001, 0.999999)
# img = msk_batch_t[0, 0, :, :, :].detach().cpu()
# img = transforms.ToPILImage()(img)
# img.show()
# exit()
return adv_batch_t * msk_batch_t
class PatchApplier(nn.Module):
"""PatchApplier: applies adversarial patches to images.
Module providing the functionality necessary to apply a patch to all detections in all images in the batch.
"""
def __init__(self):
super(PatchApplier, self).__init__()
def forward(self, img_batch, adv_batch):
advs = torch.unbind(adv_batch, 1) # unbind along dim 1
for adv in advs:
img_batch = torch.where((adv == 0), img_batch, adv) # replace the image pixels at the corresponding positions? (the image itself may not be involved yet at this point)
return img_batch
'''
class PatchGenerator(nn.Module):
"""PatchGenerator: network module that generates adversarial patches.
Module representing the neural network that will generate adversarial patches.
"""
def __init__(self, cfgfile, weightfile, img_dir, lab_dir):
super(PatchGenerator, self).__init__()
self.yolo = Darknet(cfgfile).load_weights(weightfile)
self.dataloader = torch.utils.data.DataLoader(InriaDataset(img_dir, lab_dir, shuffle=True),
batch_size=5,
shuffle=True)
self.patchapplier = PatchApplier()
self.nmscalculator = NMSCalculator()
self.totalvariation = TotalVariation()
def forward(self, *input):
pass
'''
class InriaDataset(Dataset):
"""InriaDataset: representation of the INRIA person dataset.
Internal representation of the commonly used INRIA person dataset.
Available at: http://pascal.inrialpes.fr/data/human/
Attributes:
len: An integer number of elements in the dataset.
img_dir: Directory containing the images of the INRIA dataset.
lab_dir: Directory containing the labels of the INRIA dataset.
img_names: List of all image file names in img_dir.
shuffle: Whether or not to shuffle the dataset.
"""
def __init__(self, img_dir, lab_dir, max_lab, imgsize, shuffle=True):
n_png_images = len(fnmatch.filter(os.listdir(img_dir), '*.png')) # 614; fnmatch.filter returns a list
n_jpg_images = len(fnmatch.filter(os.listdir(img_dir), '*.jpg')) # 0
n_images = n_png_images + n_jpg_images # total number of images
n_labels = len(fnmatch.filter(os.listdir(lab_dir), '*.txt'))
assert n_images == n_labels, "Number of images and number of labels don't match"
self.len = n_images
self.img_dir = img_dir
self.lab_dir = lab_dir
self.imgsize = imgsize
self.img_names = fnmatch.filter(os.listdir(img_dir), '*.png') + fnmatch.filter(os.listdir(img_dir), '*.jpg')
self.shuffle = shuffle
self.img_paths = []
for img_name in self.img_names:
self.img_paths.append(os.path.join(self.img_dir, img_name))
self.lab_paths = []
for img_name in self.img_names:
lab_path = os.path.join(self.lab_dir, img_name).replace('.jpg', '.txt').replace('.png', '.txt')
self.lab_paths.append(lab_path)
self.max_n_labels = max_lab # padded label length
def __len__(self):
return self.len
def __getitem__(self, idx):
assert idx <= len(self), 'index range error'
img_path = os.path.join(self.img_dir, self.img_names[idx])
lab_path = os.path.join(self.lab_dir, self.img_names[idx]).replace('.jpg', '.txt').replace('.png', '.txt')
image = Image.open(img_path).convert('RGB')
if os.path.getsize(lab_path): # check to see if label file contains data.
label = np.loadtxt(lab_path)
else:
label = np.ones([5])
label = torch.from_numpy(label).float()
if label.dim() == 1:
label = label.unsqueeze(0)
image, label = self.pad_and_scale(image, label)
transform = transforms.ToTensor()
image = transform(image)
label = self.pad_lab(label)
# print("image size :", image.shape)
# print("label size :", label.shape)
return image, label
def pad_and_scale(self, img, lab):
"""
Args:
img:
Returns:
"""
w, h = img.size
if w == h:
padded_img = img
else:
dim_to_pad = 1 if w < h else 2
if dim_to_pad == 1:
padding = (h - w) / 2
padded_img = Image.new('RGB', (h, h), color=(127, 127, 127))
padded_img.paste(img, (int(padding), 0))
lab[:, [1]] = (lab[:, [1]] * w + padding) / h
lab[:, [3]] = (lab[:, [3]] * w / h)
else:
padding = (w - h) / 2
padded_img = Image.new('RGB', (w, w), color=(127, 127, 127))
padded_img.paste(img, (0, int(padding)))
lab[:, [2]] = (lab[:, [2]] * h + padding) / w
lab[:, [4]] = (lab[:, [4]] * h / w)
resize = transforms.Resize((self.imgsize, self.imgsize))
padded_img = resize(padded_img) # choose here
return padded_img, lab
def pad_lab(self, lab):
pad_size = self.max_n_labels - lab.shape[0]
if (pad_size > 0):
padded_lab = F.pad(lab, (0, 0, 0, pad_size), value=1) # (pad left, pad right, pad top, pad bottom)
else:
padded_lab = lab
return padded_lab
if __name__ == '__main__':
if len(sys.argv) == 3:
img_dir = sys.argv[1]
lab_dir = sys.argv[2]
else:
print('Usage: ')
print(' python load_data.py img_dir lab_dir')
sys.exit()
test_loader = torch.utils.data.DataLoader(InriaDataset(img_dir, lab_dir, shuffle=True),
batch_size=3, shuffle=True)
cfgfile = "cfg/yolov2.cfg"
weightfile = "weights/yolov2.weights"
printfile = "non_printability/30values.txt"
patch_size = 400
darknet_model = Darknet(cfgfile)
darknet_model.load_weights(weightfile)
darknet_model = darknet_model.cuda()
patch_applier = PatchApplier().cuda()
patch_transformer = PatchTransformer().cuda()
prob_extractor = MaxProbExtractor(0, 80).cuda()
nms_calculator = NPSCalculator(printfile, patch_size)
total_variation = TotalVariation()
'''
img = Image.open('data/horse.jpg').convert('RGB')
img = img.resize((darknet_model.width, darknet_model.height))
width = img.width
height = img.height
img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes()))
img = img.view(height, width, 3).transpose(0, 1).transpose(0, 2).contiguous()
img = img.view(1, 3, height, width)
img = img.float().div(255.0)
img = torch.autograd.Variable(img)
output = darknet_model(img)
'''
optimizer = torch.optim.Adam(darknet_model.parameters(), lr=0.0001)
tl0 = time.time()
tl1 = time.time()
for i_batch, (img_batch, lab_batch) in enumerate(test_loader):
tl1 = time.time()
print('time to fetch items: ', tl1 - tl0)
img_batch = img_batch.cuda()
lab_batch = lab_batch.cuda()
adv_patch = Image.open('data/horse.jpg').convert('RGB')
adv_patch = adv_patch.resize((patch_size, patch_size))
transform = transforms.ToTensor()
adv_patch = transform(adv_patch).cuda()
img_size = img_batch.size(-1)
print('transforming patches')
t0 = time.time()
adv_batch_t = patch_transformer.forward(adv_patch, lab_batch, img_size)
print('applying patches')
t1 = time.time()
img_batch = patch_applier.forward(img_batch, adv_batch_t)
img_batch = torch.autograd.Variable(img_batch)
img_batch = F.interpolate(img_batch, (darknet_model.height, darknet_model.width))
print('running patched images through model')
t2 = time.time()
for obj in gc.get_objects():
try:
if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
try:
print(type(obj), obj.size())
except:
pass
except:
pass
print(torch.cuda.memory_allocated())
output = darknet_model(img_batch)
print('extracting max probs')
t3 = time.time()
max_prob = prob_extractor(output)
t4 = time.time()
nms = nms_calculator.forward(adv_patch)
tv = total_variation(adv_patch)
print('---------------------------------')
print(' patch transformation : %f' % (t1 - t0))
print(' patch application : %f' % (t2 - t1))
print(' darknet forward : %f' % (t3 - t2))
print(' probability extraction : %f' % (t4 - t3))
print('---------------------------------')
print(' total forward pass : %f' % (t4 - t0))
del img_batch, lab_batch, adv_patch, adv_batch_t, output, max_prob
torch.cuda.empty_cache()
tl0 = time.time()

median_pool.py Normal file

@@ -0,0 +1,50 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.utils import _pair, _quadruple
class MedianPool2d(nn.Module):
""" Median pool (usable as median filter when stride=1) module.
Args:
kernel_size: size of pooling kernel, int or 2-tuple
stride: pool stride, int or 2-tuple
padding: pool padding, int or 4-tuple (l, r, t, b) as in pytorch F.pad
same: override padding and enforce same padding, boolean
"""
def __init__(self, kernel_size=3, stride=1, padding=0, same=False):
super(MedianPool2d, self).__init__()
self.k = _pair(kernel_size)
self.stride = _pair(stride)
self.padding = _quadruple(padding) # convert to l, r, t, b
self.same = same
def _padding(self, x):
if self.same:
ih, iw = x.size()[2:]
if ih % self.stride[0] == 0:
ph = max(self.k[0] - self.stride[0], 0)
else:
ph = max(self.k[0] - (ih % self.stride[0]), 0)
if iw % self.stride[1] == 0:
pw = max(self.k[1] - self.stride[1], 0)
else:
pw = max(self.k[1] - (iw % self.stride[1]), 0)
pl = pw // 2
pr = pw - pl
pt = ph // 2
pb = ph - pt
padding = (pl, pr, pt, pb)
else:
padding = self.padding
return padding
def forward(self, x):
# using existing pytorch functions and tensor ops so that we get autograd,
# would likely be more efficient to implement from scratch at C/Cuda level
x = F.pad(x, self._padding(x), mode='reflect')
x = x.unfold(2, self.k[0], self.stride[0]).unfold(3, self.k[1], self.stride[1])
x = x.contiguous().view(x.size()[:4] + (-1,)).median(dim=-1)[0]
return x
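# Minimal usage sketch (shapes are assumed for illustration): MedianPool2d with same=True keeps the
# spatial size, which is how load_data.py applies it to a 3x300x300 adversarial patch.
if __name__ == "__main__":
    x = torch.rand(1, 3, 300, 300)          # a random patch-like tensor
    pooled = MedianPool2d(7, same=True)(x)  # median filter with a 7x7 window
    print(pooled.shape)                     # torch.Size([1, 3, 300, 300])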

nets/__init__.py Normal file

@@ -0,0 +1 @@
#

nets/darknet.py Normal file

@@ -0,0 +1,101 @@
import math
from collections import OrderedDict
import torch.nn as nn
# ---------------------------------------------------------------------#
# Residual block:
# a 1x1 convolution reduces the number of channels, then a 3x3 convolution extracts features and restores the channel count,
# and finally a residual connection is added.
# ---------------------------------------------------------------------#
class BasicBlock(nn.Module):
def __init__(self, inplanes, planes):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes[0], kernel_size=1, stride=1, padding=0, bias=False) # go from many channels to few, then back from few to many
self.bn1 = nn.BatchNorm2d(planes[0])
self.relu1 = nn.LeakyReLU(0.1)
self.conv2 = nn.Conv2d(planes[0], planes[1], kernel_size=3, stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes[1])
self.relu2 = nn.LeakyReLU(0.1)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu1(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu2(out)
out += residual
return out
class DarkNet(nn.Module):
def __init__(self, layers):
super(DarkNet, self).__init__()
self.inplanes = 32 # the first convolution outputs 32 channels
# 416,416,3 -> 416,416,32
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(self.inplanes)
self.relu1 = nn.LeakyReLU(0.1)
# 416,416,32 -> 208,208,64
self.layer1 = self._make_layer([32, 64], layers[0]) # layers holds how many times each block is repeated
# 208,208,64 -> 104,104,128
self.layer2 = self._make_layer([64, 128], layers[1])
# 104,104,128 -> 52,52,256
self.layer3 = self._make_layer([128, 256], layers[2])
# 52,52,256 -> 26,26,512
self.layer4 = self._make_layer([256, 512], layers[3])
# 26,26,512 -> 13,13,1024
self.layer5 = self._make_layer([512, 1024], layers[4])
self.layers_out_filters = [64, 128, 256, 512, 1024]
# weight initialisation
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
# ---------------------------------------------------------------------#
# Each layer first downsamples with a 3x3 convolution of stride 2,
# then stacks the residual blocks.
# ---------------------------------------------------------------------#
def _make_layer(self, planes, blocks):
layers = []
# downsampling: stride 2, kernel size 3. _make_layer first builds one downsampling layer and then the repeated blocks
layers.append(("ds_conv", nn.Conv2d(self.inplanes, planes[1], kernel_size=3, stride=2, padding=1, bias=False)))
layers.append(("ds_bn", nn.BatchNorm2d(planes[1])))
layers.append(("ds_relu", nn.LeakyReLU(0.1)))
# add the residual blocks
self.inplanes = planes[1] # record this layer's output channels, which are also the next layer's input channels
for i in range(0, blocks):
layers.append(("residual_{}".format(i), BasicBlock(self.inplanes, planes)))
return nn.Sequential(OrderedDict(layers))
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu1(x)
x = self.layer1(x)
x = self.layer2(x)
out3 = self.layer3(x)
out4 = self.layer4(out3)
out5 = self.layer5(out4)
return out3, out4, out5
def darknet53():
model = DarkNet([1, 2, 8, 8, 4])
return model
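# Minimal sanity-check sketch (the 416x416 input size is assumed, matching the rest of this repo):
# it prints the shapes of the three feature maps consumed by the YOLO head.
if __name__ == "__main__":
    import torch
    model = darknet53()
    out3, out4, out5 = model(torch.rand(1, 3, 416, 416))
    print(out3.shape, out4.shape, out5.shape)  # (1,256,52,52), (1,512,26,26), (1,1024,13,13)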

nets/yolo.py Normal file

@@ -0,0 +1,111 @@
from collections import OrderedDict
import torch
import torch.nn as nn
from nets.darknet import darknet53
def conv2d(filter_in, filter_out, kernel_size):
pad = (kernel_size - 1) // 2 if kernel_size else 0
return nn.Sequential(OrderedDict([
("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=1, padding=pad, bias=False)),
("bn", nn.BatchNorm2d(filter_out)),
("relu", nn.LeakyReLU(0.1)),
]))
# ------------------------------------------------------------------------#
# make_last_layers contains seven convolutions in total: the first five extract features,
# the last two produce the yolo prediction for this head.
# ------------------------------------------------------------------------#
def make_last_layers(filters_list, in_filters, out_filter):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 1), # 1x1 convolutions repeatedly adjust the channel count and fuse information across channels
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
nn.Conv2d(filters_list[1], out_filter, kernel_size=1, stride=1, padding=0, bias=True)
)
return m
class YoloBody(nn.Module):
def __init__(self, anchors_mask, num_classes, pretrained=False):
super(YoloBody, self).__init__()
self.width = 416 # temporary addition
self.height = 416 # temporary addition
# ---------------------------------------------------#
# Build the darknet53 backbone.
# It returns three feature maps with the following shapes:
# 52,52,256
# 26,26,512
# 13,13,1024
# ---------------------------------------------------#
self.backbone = darknet53()
if pretrained: # load pretrained weights; darknet53 is a classification network
self.backbone.load_state_dict(torch.load("model_data/darknet53_backbone_weights.pth"))
# ---------------------------------------------------#
# out_filters : [64, 128, 256, 512, 1024]
# ---------------------------------------------------#
out_filters = self.backbone.layers_out_filters
# ------------------------------------------------------------------------#
# Compute the number of output channels of each yolo head. For the VOC dataset:
# final_out_filter0 = final_out_filter1 = final_out_filter2 = 75
# ------------------------------------------------------------------------# len(anchors_mask[0]) is 3
self.last_layer0 = make_last_layers([512, 1024], out_filters[-1], len(anchors_mask[0]) * (num_classes + 5))
self.last_layer1_conv = conv2d(512, 256, 1)
self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.last_layer1 = make_last_layers([256, 512], out_filters[-2] + 256, len(anchors_mask[1]) * (num_classes + 5))
self.last_layer2_conv = conv2d(256, 128, 1)
self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.last_layer2 = make_last_layers([128, 256], out_filters[-3] + 128, len(anchors_mask[2]) * (num_classes + 5))
def forward(self, x):
# ---------------------------------------------------#
# Obtain the three feature maps; their shapes are
# 52,52,256; 26,26,512; 13,13,1024
# ---------------------------------------------------#
x2, x1, x0 = self.backbone(x)
# ---------------------------------------------------#
# first feature level
# out0 = (batch_size,255,13,13)
# ---------------------------------------------------#
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
out0_branch = self.last_layer0[:5](x0)
out0 = self.last_layer0[5:](out0_branch) # 8, 75, 13, 13 (the leading 2 seen initially was test data, not real data)
# 13,13,512 -> 13,13,256 -> 26,26,256
x1_in = self.last_layer1_conv(out0_branch) # branch used for feature fusion
x1_in = self.last_layer1_upsample(x1_in)
# 26,26,256 + 26,26,512 -> 26,26,768
x1_in = torch.cat([x1_in, x1], 1)
# ---------------------------------------------------#
# second feature level
# out1 = (batch_size,255,26,26)
# ---------------------------------------------------#
# 26,26,768 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
out1_branch = self.last_layer1[:5](x1_in)
out1 = self.last_layer1[5:](out1_branch)
# 26,26,256 -> 26,26,128 -> 52,52,128
x2_in = self.last_layer2_conv(out1_branch) # fusion
x2_in = self.last_layer2_upsample(x2_in)
# 52,52,128 + 52,52,256 -> 52,52,384
x2_in = torch.cat([x2_in, x2], 1)
# ---------------------------------------------------#
# third feature level
# out2 = (batch_size,255,52,52)
# ---------------------------------------------------#
# 52,52,384 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
out2 = self.last_layer2(x2_in)
return out0, out1, out2
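# Minimal sanity-check sketch (the anchors_mask and the 20-class VOC setting are assumptions used
# only for illustration): each head outputs 3 * (num_classes + 5) channels, i.e. 75 for VOC.
if __name__ == "__main__":
    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    net = YoloBody(anchors_mask, num_classes=20, pretrained=False)
    out0, out1, out2 = net(torch.rand(1, 3, 416, 416))
    print(out0.shape, out1.shape, out2.shape)  # (1,75,13,13), (1,75,26,26), (1,75,52,52)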

nets/yolo_training.py Normal file

@@ -0,0 +1,488 @@
import math
from functools import partial
import numpy as np
import torch
import torch.nn as nn
class YOLOLoss(nn.Module):
def __init__(self, anchors, num_classes, input_shape, cuda, anchors_mask=[[6, 7, 8], [3, 4, 5], [0, 1, 2]]):
super(YOLOLoss, self).__init__()
# -----------------------------------------------------------#
# The anchors for the 13x13 feature map are [116,90],[156,198],[373,326]
# The anchors for the 26x26 feature map are [30,61],[62,45],[59,119]
# The anchors for the 52x52 feature map are [10,13],[16,30],[33,23]
# -----------------------------------------------------------#
self.anchors = anchors
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.input_shape = input_shape
self.anchors_mask = anchors_mask
self.giou = True
self.balance = [0.4, 1.0, 4]
self.box_ratio = 0.05
self.obj_ratio = 5 * (input_shape[0] * input_shape[1]) / (416 ** 2)
self.cls_ratio = 1 * (num_classes / 80)
self.ignore_threshold = 0.5
self.cuda = cuda
def clip_by_tensor(self, t, t_min, t_max):
t = t.float()
result = (t >= t_min).float() * t + (t < t_min).float() * t_min # either t or t_min
result = (result <= t_max).float() * result + (result > t_max).float() * t_max
return result
def MSELoss(self, pred, target):
return torch.pow(pred - target, 2)
def BCELoss(self, pred, target):
epsilon = 1e-7
pred = self.clip_by_tensor(pred, epsilon, 1.0 - epsilon) # keep the tensor between epsilon and 1.0 - epsilon
output = - target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred)
return output
def box_giou(self, b1, b2):
"""
Inputs
----------
b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
Returns
-------
giou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1)
"""
# ----------------------------------------------------#
# compute the top-left and bottom-right corners of the predicted boxes
# ----------------------------------------------------#
b1_xy = b1[..., :2]
b1_wh = b1[..., 2:4]
b1_wh_half = b1_wh / 2.
b1_mins = b1_xy - b1_wh_half
b1_maxes = b1_xy + b1_wh_half
# ----------------------------------------------------#
# compute the top-left and bottom-right corners of the ground-truth boxes
# ----------------------------------------------------#
b2_xy = b2[..., :2]
b2_wh = b2[..., 2:4]
b2_wh_half = b2_wh / 2.
b2_mins = b2_xy - b2_wh_half
b2_maxes = b2_xy + b2_wh_half
# ----------------------------------------------------#
# compute the IoU of every predicted / ground-truth box pair
# ----------------------------------------------------#
intersect_mins = torch.max(b1_mins, b2_mins)
intersect_maxes = torch.min(b1_maxes, b2_maxes)
intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes))
intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
b1_area = b1_wh[..., 0] * b1_wh[..., 1]
b2_area = b2_wh[..., 0] * b2_wh[..., 1]
union_area = b1_area + b2_area - intersect_area
iou = intersect_area / union_area
# ----------------------------------------------------#
# find the top-left and bottom-right corners of the smallest box enclosing both
# ----------------------------------------------------#
enclose_mins = torch.min(b1_mins, b2_mins)
enclose_maxes = torch.max(b1_maxes, b2_maxes)
enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes))
# ----------------------------------------------------#
# compute the area of the enclosing box
# ----------------------------------------------------#
enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
giou = iou - (enclose_area - union_area) / enclose_area
return giou
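# Worked example (numbers are assumed, for illustration only): with b1 = (0.5, 0.5, 0.2, 0.2) and
# b2 = (0.6, 0.5, 0.2, 0.2) in xywh form, the intersection is 0.1 * 0.2 = 0.02, the union is
# 0.04 + 0.04 - 0.02 = 0.06 (IoU = 1/3), and the enclosing box has area 0.3 * 0.2 = 0.06,
# so giou = 1/3 - (0.06 - 0.06) / 0.06 = 1/3.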
def forward(self, l, input, targets=None):
# ----------------------------------------------------#
# l is the index of the feature level currently being processed.
# input has shape bs, 3*(5+num_classes), 13, 13
#                 bs, 3*(5+num_classes), 26, 26
#                 bs, 3*(5+num_classes), 52, 52
# targets holds the ground-truth boxes.
# ----------------------------------------------------#
# --------------------------------#
# Get the number of images and the feature-map height and width,
# e.g. 13 and 13.
# --------------------------------#
bs = input.size(0)
in_h = input.size(2)
in_w = input.size(3)
# -----------------------------------------------------------------------#
# Compute the stride:
# how many pixels of the original image each feature point corresponds to.
# For a 13x13 feature map, one feature point corresponds to 32 pixels of the original image;
# for 26x26 it is 16 pixels, and for 52x52 it is 8 pixels.
# stride_h = stride_w = 32, 16 or 8.
# Here stride_h and stride_w are both 32.
# -----------------------------------------------------------------------#
stride_h = self.input_shape[0] / in_h
stride_w = self.input_shape[1] / in_w
# -------------------------------------------------#
# Rescale the anchors; the resulting scaled_anchors are relative to the feature map.
# -------------------------------------------------#
scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors] # scale the anchors to the output feature-map resolution
# -----------------------------------------------#
# There are three inputs in total; their shapes are
# bs, 3*(5+num_classes), 13, 13 => batch_size, 3, 13, 13, 5 + num_classes
# batch_size, 3, 26, 26, 5 + num_classes
# batch_size, 3, 52, 52, 5 + num_classes
# -----------------------------------------------#
prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(
0, 1, 3, 4, 2).contiguous() # batch_size, 3 anchors, h, w, 25 outputs per anchor
# -----------------------------------------------#
# Adjustment parameters for the anchor centre position
# -----------------------------------------------#
x = torch.sigmoid(prediction[..., 0]) # prediction[..., 0] has shape 8, 3, 13, 13; tx
y = torch.sigmoid(prediction[..., 1]) # ty
# -----------------------------------------------#
# Adjustment parameters for the anchor width and height
# -----------------------------------------------#
w = prediction[..., 2] # tw
h = prediction[..., 3] # th
# -----------------------------------------------#
# Objectness confidence: is there an object
# -----------------------------------------------#
conf = torch.sigmoid(prediction[..., 4]) # prediction[..., 4]: objectness
# -----------------------------------------------#
# Class confidence
# -----------------------------------------------#
pred_cls = torch.sigmoid(prediction[..., 5:])
# -----------------------------------------------#
# Build the targets the network should predict. y_true is the rebuilt ground-truth label of shape 8, 3, 13, 13, 25. noobj_mask is 0 where there is an object and 1 elsewhere. box_loss_scale records the box area.
# -----------------------------------------------#
y_true, noobj_mask, box_loss_scale = self.get_target(l, targets, scaled_anchors, in_h, in_w)
# y_true converts the ground-truth boxes into the same format as the network output, e.g. coordinates at the output feature resolution and the class of the cell containing the ground-truth box.
# ---------------------------------------------------------------#
# Decode the predictions and measure how well they overlap the ground truth.
# Predictions that overlap too much are ignored, because those feature points already predict fairly accurately
# and are unsuitable as negative samples. # l is the index of the multi-scale feature map; pred_boxes are the decoded network predictions
# ----------------------------------------------------------------#
noobj_mask, pred_boxes = self.get_ignore(l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask)
if self.cuda:
y_true = y_true.type_as(x)
noobj_mask = noobj_mask.type_as(x)
box_loss_scale = box_loss_scale.type_as(x)
# --------------------------------------------------------------------------#
# box_loss_scale is the product of the ground-truth width and height; both lie in 0-1, so the product does too.
# 2 - (width * height) means larger ground-truth boxes get smaller weights and small boxes get larger weights.
# --------------------------------------------------------------------------#
box_loss_scale = 2 - box_loss_scale
loss = 0
obj_mask = y_true[..., 4] == 1
n = torch.sum(obj_mask)
if n != 0:
if self.giou:
# ---------------------------------------------------------------#
# Compute the GIoU between the predictions and the ground truth
# ----------------------------------------------------------------#
giou = self.box_giou(pred_boxes, y_true[..., :4]).type_as(x)
loss_loc = torch.mean((1 - giou)[obj_mask]) # GIoU is used as the localisation loss here, not the MSE from the paper
else:
# -----------------------------------------------------------#
# Loss for the centre offsets; BCELoss works a bit better here
# -----------------------------------------------------------#
loss_x = torch.mean(self.BCELoss(x[obj_mask], y_true[..., 0][obj_mask]) * box_loss_scale[obj_mask])
loss_y = torch.mean(self.BCELoss(y[obj_mask], y_true[..., 1][obj_mask]) * box_loss_scale[obj_mask])
# -----------------------------------------------------------#
# Loss for the width/height adjustments
# -----------------------------------------------------------#
loss_w = torch.mean(self.MSELoss(w[obj_mask], y_true[..., 2][obj_mask]) * box_loss_scale[obj_mask])
loss_h = torch.mean(self.MSELoss(h[obj_mask], y_true[..., 3][obj_mask]) * box_loss_scale[obj_mask])
loss_loc = (loss_x + loss_y + loss_h + loss_w) * 0.1
# pred_cls[obj_mask]: number of boxes containing objects * 20 class scores (20 classes)
loss_cls = torch.mean(self.BCELoss(pred_cls[obj_mask], y_true[..., 5:][obj_mask])) # classification loss for the objects
loss += loss_loc * self.box_ratio + loss_cls * self.cls_ratio
loss_conf = torch.mean(self.BCELoss(conf, obj_mask.type_as(conf))[noobj_mask.bool() | obj_mask]) # the objectness loss ignores predictions that overlap well but are not the best match
loss += loss_conf * self.balance[l] * self.obj_ratio # self.balance[l]: each layer has a different weight; [0.4, 1.0, 4] gives small objects a larger loss weight
# if n != 0:
# print(loss_loc * self.box_ratio, loss_cls * self.cls_ratio, loss_conf * self.balance[l] * self.obj_ratio)
return loss
def calculate_iou(self, _box_a, _box_b):
# -----------------------------------------------------------#
# Compute the top-left and bottom-right corners of the ground-truth boxes, treating (0, 0) as the centre
# -----------------------------------------------------------#
b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2
b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2
# -----------------------------------------------------------#
# Compute the top-left and bottom-right corners of the anchor boxes
# -----------------------------------------------------------#
b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2
b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2
# -----------------------------------------------------------#
# Convert both ground-truth and anchor boxes to top-left / bottom-right form
# -----------------------------------------------------------#
box_a = torch.zeros_like(_box_a)
box_b = torch.zeros_like(_box_b)
box_a[:, 0], box_a[:, 1], box_a[:, 2], box_a[:, 3] = b1_x1, b1_y1, b1_x2, b1_y2
box_b[:, 0], box_b[:, 1], box_b[:, 2], box_b[:, 3] = b2_x1, b2_y1, b2_x2, b2_y2
# ----------------------------------------------------------- #
# A is the number of ground-truth boxes, B the number of anchor boxes
# ----------------------------------------------------------- #
A = box_a.size(0)
B = box_b.size(0)
# ----------------------------------------------------------- #
# Compute the intersection area. box_a holds ground-truth corners, box_b anchor corners.
# box_a[:, 2:].unsqueeze(1).expand(A, B, 2) expands (A, 1, 2) to (A, B, 2): each ground-truth box repeated B times.
# box_b[:, 2:].unsqueeze(0).expand(A, B, 2) expands (1, B, 2) to (A, B, 2): the B anchors repeated for every ground-truth box.
# ----------------------------------------------------------- #
# In effect every ground-truth box is compared against every anchor.
max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),  # element-wise minimum of the bottom-right corners
box_b[:, 2:].unsqueeze(0).expand(A, B, 2))  # shape (A, B, 2)
min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),  # element-wise maximum of the top-left corners
box_b[:, :2].unsqueeze(0).expand(A, B, 2))  # shape (A, B, 2)
inter = torch.clamp((max_xy - min_xy),  # clamping at 0 handles non-overlapping pairs: their width/height would be negative, so their intersection becomes 0
min=0)  # min=0 with no upper bound; the subtraction yields the intersection width and height
inter = inter[:, :, 0] * inter[:, :, 1]  # intersection area of every ground-truth box with every anchor
# ----------------------------------------------------------- #
# Compute the individual areas of the ground-truth and anchor boxes
# ----------------------------------------------------------- #
area_a = ((box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(
inter)  # [A, B]: each ground-truth area repeated B times
area_b = ((box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(
inter)  # [A, B]: each anchor area repeated A times
# ----------------------------------------------------------- #
# IoU = intersection / union
# ----------------------------------------------------------- #
union = area_a + area_b - inter
return inter / union  # [A, B]
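# Worked example (illustrative): a ground-truth box (cx, cy, w, h) = (0, 0, 4, 4) and an anchor
# (0, 0, 2, 2) give corners (-2, -2, 2, 2) and (-1, -1, 1, 1); the intersection is 2 * 2 = 4, the union
# is 16 + 4 - 4 = 16, so IoU = 0.25. Non-overlapping pairs end up with intersection 0 because of the
# clamp above, hence IoU = 0.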
def get_target(self, l, targets, anchors, in_h, in_w):
# -----------------------------------------------------#
# 计算一共有多少张图片
# -----------------------------------------------------#
bs = len(targets)
# -----------------------------------------------------#
# 对每一个grid cell都需要标记。用于选取哪些先验框不包含物体
# -----------------------------------------------------#
noobj_mask = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad=False)
# -----------------------------------------------------#
# 让网络更加去关注小目标
# -----------------------------------------------------#
box_loss_scale = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad=False)
# -----------------------------------------------------#
# batch_size, 3, 13, 13, 5 + num_classes
# -----------------------------------------------------#
y_true = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad=False)
for b in range(bs): # 每张图片单独计算
if len(targets[b]) == 0: # targets是真实框
continue
batch_target = torch.zeros_like(targets[b]) # 把0~1之间的targets转换到 特征图大小的 targets
# -------------------------------------------------------#
# 计算出正样本在特征层上的中心点 # box第01维记录中心点 box第23维记录宽高 # 这里不知道为何这样做,但结果一样的
# -------------------------------------------------------#
batch_target[:, [0, 2]] = targets[b][:, [0, 2]] * in_w # 从归一化的box中反解出在 13*13 分辨率下的大小 两个 x 坐标
batch_target[:, [1, 3]] = targets[b][:, [1, 3]] * in_h
batch_target[:, 4] = targets[b][:, 4]
batch_target = batch_target.cpu() # 因为是从targets放在cuda上中复制过来的所以需要执行一次cpu()
# -------------------------------------------------------#
# 将真实框转换一个形式 相当于都放到0, 0, w, h 进行比较
# num_true_box, 4 # 把23 维也就是宽和高取出前面拼两个0
# -------------------------------------------------------#
gt_box = torch.FloatTensor(torch.cat((torch.zeros((batch_target.size(0), 2)), batch_target[:, 2:4]), 1))
# -------------------------------------------------------#
# 将先验框转换一个形式
# 9, 4 在先验框大小前面加了两个0
# -------------------------------------------------------#
anchor_shapes = torch.FloatTensor(
torch.cat((torch.zeros((len(anchors), 2)), torch.FloatTensor(anchors)), 1))
# -------------------------------------------------------#
# Compute the IoU of every ground-truth box with every anchor.
# self.calculate_iou(gt_box, anchor_shapes) has shape [num_true_box, 9].
# best_ns holds, for each ground-truth box, the index of the anchor with the highest IoU
# (argmax only returns the index, not the IoU value itself).
# -------------------------------------------------------#
best_ns = torch.argmax(self.calculate_iou(gt_box, anchor_shapes), dim=-1)
# Iterate over the best anchor index of every ground-truth box and map it to one of the
# three anchors used by the current feature layer, if it belongs to this layer at all.
for t, best_n in enumerate(best_ns):  # l is the index of the output feature layer
if best_n not in self.anchors_mask[l]:  # anchors_mask specifies which 3 anchors the current feature map uses
continue
# ----------------------------------------#
# 判断这个先验框是当前特征点的哪一个先验框 l是第几号最后的输出特征图
# ----------------------------------------#
k = self.anchors_mask[l].index(best_n) # 使用当前层对应anchors的第几号anchor
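# Illustrative example (assumes the default anchors_mask [[6, 7, 8], [3, 4, 5], [0, 1, 2]]): a ground-truth
# box whose best match is global anchor 7 is only assigned on the layer whose mask contains 7 (l = 0,
# the 13x13 map), where k = 1; on the other two layers the `continue` above skips it.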
# ----------------------------------------#
# Find which grid cell the ground-truth centre falls into (coordinates are already at the
# 13x13 feature resolution); floor gives the cell index, i.e. the cell's top-left corner
# ----------------------------------------#
i = torch.floor(batch_target[t, 0]).long() # t 表示当前是第几个真实框
j = torch.floor(batch_target[t, 1]).long()
# ----------------------------------------#
# 取出真实框的种类
# ----------------------------------------#
c = batch_target[t, 4].long()
# ----------------------------------------#
# noobj_mask代表无目标的特征点 b是几号batchk是几号anchor
# ----------------------------------------#
noobj_mask[b, k, j, i] = 0
# ----------------------------------------#
# Fill in the ground-truth regression targets for this anchor and cell
# ----------------------------------------#
if not self.giou:  # per the author's note, this branch is not taken (self.giou is True here)
# ----------------------------------------#
# tx, ty are the true centre offsets inside the cell; tw, th are the log ratios
# between the ground-truth size and the matched anchor size
# ----------------------------------------#
y_true[b, k, j, i, 0] = batch_target[t, 0] - i.float()
y_true[b, k, j, i, 1] = batch_target[t, 1] - j.float()
y_true[b, k, j, i, 2] = math.log(batch_target[t, 2] / anchors[best_n][0])
y_true[b, k, j, i, 3] = math.log(batch_target[t, 3] / anchors[best_n][1])
y_true[b, k, j, i, 4] = 1
y_true[b, k, j, i, c + 5] = 1  # mark the ground-truth class
else:
# ----------------------------------------#
# In GIoU mode y_true keeps the raw box (cx, cy, w, h) at feature-map scale;
# t indexes the t-th ground-truth box of the current image
# ----------------------------------------#
y_true[b, k, j, i, 0] = batch_target[t, 0]
y_true[b, k, j, i, 1] = batch_target[t, 1]
y_true[b, k, j, i, 2] = batch_target[t, 2]
y_true[b, k, j, i, 3] = batch_target[t, 3]
y_true[b, k, j, i, 4] = 1  # an object is present
y_true[b, k, j, i, c + 5] = 1  # c is the class index
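# Worked example (illustrative, non-GIoU encoding): a ground-truth centre at (6.4, 6.7) on a 13x13
# map with true size (4.0, 5.5) and matched anchor size (3.6, 4.9) gives i, j = 6, 6,
# tx, ty = 0.4, 0.7 and tw, th = log(4.0/3.6), log(5.5/4.9).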
# ----------------------------------------#
# Record the normalized area of the ground-truth box; it is used in the forward pass to
# weight the localization loss so that small boxes count more than large ones
# ----------------------------------------#
box_loss_scale[b, k, j, i] = batch_target[t, 2] * batch_target[t, 3] / in_w / in_h  # w*h at feature scale, normalized back to 0-1
return y_true, noobj_mask, box_loss_scale
def get_ignore(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask):
# -----------------------------------------------------#
# 计算一共有多少张图片
# -----------------------------------------------------#
bs = len(targets)
# -----------------------------------------------------#
# Generate the grid of cell top-left corners (the anchor centres before the offset).
# torch.linspace(0, in_w - 1, in_w) produces in_w evenly spaced values from 0 to in_w - 1;
# .repeat(in_h, 1) tiles that row in_h times.
# -----------------------------------------------------#
grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type_as(x)  # reading the repeats right to left keeps the shape bookkeeping clear: the last two dims tile the row/column pattern
grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type_as(x)
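# Illustrative: for in_w = in_h = 3, torch.linspace(0, 2, 3).repeat(3, 1) gives
# [[0, 1, 2], [0, 1, 2], [0, 1, 2]] (the column index of every cell), while the transposed version
# used for grid_y gives the row index. The outer repeat then copies this grid bs * 3 times so it
# broadcasts over every anchor of every image.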
# Generate per-cell anchor widths and heights
scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]]  # pick the 3 anchors assigned to this feature layer
anchor_w = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([0])).type_as(x)  # column 0 along dim 1: the anchor widths
anchor_h = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([1])).type_as(x)  # column 1: the anchor heights
anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)  # each of the 3 anchor widths is tiled over its 13x13 map, then repeated for every image in the batch
anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
# -------------------------------------------------------#
# Decode the adjusted box centres and sizes; x here is the sigmoid of the raw centre-x
# prediction (attribute 0 of the network output), and likewise for y
# -------------------------------------------------------#
pred_boxes_x = torch.unsqueeze(x + grid_x, -1)
pred_boxes_y = torch.unsqueeze(y + grid_y, -1)
pred_boxes_w = torch.unsqueeze(torch.exp(w) * anchor_w, -1)
pred_boxes_h = torch.unsqueeze(torch.exp(h) * anchor_h, -1)
pred_boxes = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim=-1)
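# Note (added for clarity): this is the standard YOLOv3 decode at feature-map scale,
#   bx = sigmoid(tx) + cx,   by = sigmoid(ty) + cy,   bw = pw * exp(tw),   bh = ph * exp(th),
# where (cx, cy) comes from grid_x/grid_y and (pw, ph) from anchor_w/anchor_h; per the comment above,
# x and y are already sigmoid outputs when they reach this function.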
for b in range(bs): # 对一个 batch 里的数据 一张张图像 分别进行操作
# -------------------------------------------------------#
# 将预测结果转换一个形式
# pred_boxes_for_ignore num_anchors, 4
# -------------------------------------------------------#
pred_boxes_for_ignore = pred_boxes[b].view(-1, 4)
# -------------------------------------------------------#
# 计算真实框,并把真实框转换成相对于特征层的大小
# gt_box num_true_box, 4
# -------------------------------------------------------#
if len(targets[b]) > 0: # 如果有目标,进行下面的操作。否则 跳到下一张图片。
batch_target = torch.zeros_like(targets[b])
# -------------------------------------------------------#
# Scale the ground-truth boxes to feature-map coordinates. They are already in
# (cx, cy, w, h) form (the conversion happens in the dataloader), so only the scale changes here.
# -------------------------------------------------------#
batch_target[:, [0, 2]] = targets[b][:, [0, 2]] * in_w
batch_target[:, [1, 3]] = targets[b][:, [1, 3]] * in_h
batch_target = batch_target[:, :4].type_as(x)
# -------------------------------------------------------#
# Compute the IoU between every ground-truth box and every decoded prediction
# anch_ious: num_true_box, num_anchors
# -------------------------------------------------------#
anch_ious = self.calculate_iou(batch_target, pred_boxes_for_ignore)  # IoU of ground truth vs. predictions
# -------------------------------------------------------#
# For each predicted box, take its maximum IoU over all ground-truth boxes
# anch_ious_max: num_anchors
# -------------------------------------------------------#
anch_ious_max, _ = torch.max(anch_ious, dim=0)  # max over the ground-truth dimension, one value per predicted box
anch_ious_max = anch_ious_max.view(pred_boxes[b].size()[:3])
noobj_mask[b][anch_ious_max > self.ignore_threshold] = 0  # predictions that overlap any ground truth above the threshold are not used as negatives, even if they are not the best match
return noobj_mask, pred_boxes
def weights_init(net, init_type='normal', init_gain=0.02):
def init_func(m):
classname = m.__class__.__name__
if hasattr(m, 'weight') and classname.find('Conv') != -1:
if init_type == 'normal':
torch.nn.init.normal_(m.weight.data, 0.0, init_gain)
elif init_type == 'xavier':
torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain)
elif init_type == 'kaiming':
torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
elif init_type == 'orthogonal':
torch.nn.init.orthogonal_(m.weight.data, gain=init_gain)
else:
raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
elif classname.find('BatchNorm2d') != -1:
torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
torch.nn.init.constant_(m.bias.data, 0.0)
print('initialize network with %s type' % init_type)
net.apply(init_func)
def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio=0.05, warmup_lr_ratio=0.1,
no_aug_iter_ratio=0.05, step_num=10):
def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters):
if iters <= warmup_total_iters:
# lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start
lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2) + warmup_lr_start
elif iters >= total_iters - no_aug_iter:
lr = min_lr
else:
lr = min_lr + 0.5 * (lr - min_lr) * (
1.0 + math.cos(
math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters - no_aug_iter))
)
return lr
def step_lr(lr, decay_rate, step_size, iters):
if step_size < 1:
raise ValueError("step_size must be at least 1.")
n = iters // step_size
out_lr = lr * decay_rate ** n
return out_lr
if lr_decay_type == "cos":
warmup_total_iters = min(max(warmup_iters_ratio * total_iters, 1), 3)
warmup_lr_start = max(warmup_lr_ratio * lr, 1e-6)
no_aug_iter = min(max(no_aug_iter_ratio * total_iters, 1), 15)
func = partial(yolox_warm_cos_lr, lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter)
else:
decay_rate = (min_lr / lr) ** (1 / (step_num - 1))
step_size = total_iters / step_num
func = partial(step_lr, lr, decay_rate, step_size)
return func
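# Usage sketch (illustrative; the argument values are made up): the returned callable maps an epoch
# index to a learning rate, e.g.
#   lr_func = get_lr_scheduler("cos", lr=1e-2, min_lr=1e-4, total_iters=100)
#   lr_epoch_50 = lr_func(50)
# set_optimizer_lr below then writes that value into every parameter group of the optimizer.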
def set_optimizer_lr(optimizer, lr_scheduler_func, epoch):
lr = lr_scheduler_func(epoch)
for param_group in optimizer.param_groups:
param_group['lr'] = lr

135
patch_config.py Normal file
View File

@ -0,0 +1,135 @@
from torch import optim
class BaseConfig(object):
"""
Default parameters for all config files.
"""
def __init__(self):
"""
Set the defaults.
"""
# self.img_dir = "inria/Train/pos"
# self.lab_dir = "inria/Train/pos/yolo-labels"
self.img_dir = "cctsdb/Train/pos"
self.lab_dir = "cctsdb/Train/labels"
self.cfgfile = "cfg/yolo.cfg"
self.weightfile = "weights/yolo.weights"
self.printfile = "non_printability/30values.txt"
self.patch_size = 300
self.start_learning_rate = 0.03
self.patch_name = 'base'
self.scheduler_factory = lambda x: optim.lr_scheduler.ReduceLROnPlateau(x, 'min', patience=50)
self.max_tv = 0
self.batch_size = 20
self.loss_target = lambda obj, cls: obj * cls
class Experiment1(BaseConfig):
"""
Model that uses a maximum total variation, tv cannot go below this point.
"""
def __init__(self):
"""
Change stuff...
"""
super().__init__()
self.patch_name = 'Experiment1'
self.max_tv = 0.165
class Experiment2HighRes(Experiment1):
"""
Higher res
"""
def __init__(self):
"""
Change stuff...
"""
super().__init__()
self.max_tv = 0.165
self.patch_size = 400
self.patch_name = 'Exp2HighRes'
class Experiment3LowRes(Experiment1):
"""
Lower res
"""
def __init__(self):
"""
Change stuff...
"""
super().__init__()
self.max_tv = 0.165
self.patch_size = 100
self.patch_name = "Exp3LowRes"
class Experiment4ClassOnly(Experiment1):
"""
Only minimise class score.
"""
def __init__(self):
"""
Change stuff...
"""
super().__init__()
self.patch_name = 'Experiment4ClassOnly'
self.loss_target = lambda obj, cls: cls
class Experiment1Desktop(Experiment1):
"""
"""
def __init__(self):
"""
Change batch size.
"""
super().__init__()
self.batch_size = 8
self.patch_size = 400
class ReproducePaperObj(BaseConfig):
"""
Reproduce the results from the paper: Generate a patch that minimises object score.
"""
def __init__(self):
super().__init__()
self.batch_size = 8
self.patch_size = 300
self.patch_name = 'ObjectOnlyPaper'
self.max_tv = 0.165
self.loss_target = lambda obj, cls: obj
patch_configs = {
"base": BaseConfig,
"exp1": Experiment1,
"exp1_des": Experiment1Desktop,
"exp2_high_res": Experiment2HighRes,
"exp3_low_res": Experiment3LowRes,
"exp4_class_only": Experiment4ClassOnly,
"paper_obj": ReproducePaperObj
}
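# Usage sketch (illustrative): a config is selected by key and instantiated, as train_patch.py does with
#   config = patch_configs['paper_obj']()
# after which config.start_learning_rate, config.scheduler_factory and config.loss_target drive training.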

181
predict.py Normal file
View File

@ -0,0 +1,181 @@
# -----------------------------------------------------------------------#
# predict.py将单张图片预测、摄像头检测、FPS测试和目录遍历检测等功能
# 整合到了一个py文件中通过指定mode进行模式的修改。
# -----------------------------------------------------------------------#
import time
import cv2
import numpy as np
from PIL import Image
from yolo import YOLO
if __name__ == "__main__":
yolo = YOLO()
# ----------------------------------------------------------------------------------------------------------#
# mode用于指定测试的模式
# 'predict' 表示单张图片预测,如果想对预测过程进行修改,如保存图片,截取对象等,可以先看下方详细的注释
# 'video' 表示视频检测,可调用摄像头或者视频进行检测,详情查看下方注释。
# 'fps' 表示测试fps使用的图片是img里面的street.jpg详情查看下方注释。
# 'dir_predict' 表示遍历文件夹进行检测并保存。默认遍历img文件夹保存img_out文件夹详情查看下方注释。
# 'heatmap' 表示进行预测结果的热力图可视化,详情查看下方注释。
# 'export_onnx' 表示将模型导出为onnx需要pytorch1.7.1以上。
# ----------------------------------------------------------------------------------------------------------#
mode = "predict"
# -------------------------------------------------------------------------#
# crop 指定了是否在单张图片预测后对目标进行截取
# count 指定了是否进行目标的计数
# crop、count仅在mode='predict'时有效
# -------------------------------------------------------------------------#
crop = False
count = False
# ----------------------------------------------------------------------------------------------------------#
# video_path 用于指定视频的路径当video_path=0时表示检测摄像头
# 想要检测视频则设置如video_path = "xxx.mp4"即可代表读取出根目录下的xxx.mp4文件。
# video_save_path 表示视频保存的路径当video_save_path=""时表示不保存
# 想要保存视频则设置如video_save_path = "yyy.mp4"即可代表保存为根目录下的yyy.mp4文件。
# video_fps 用于保存的视频的fps
#
# video_path、video_save_path和video_fps仅在mode='video'时有效
# 保存视频时需要ctrl+c退出或者运行到最后一帧才会完成完整的保存步骤。
# ----------------------------------------------------------------------------------------------------------#
video_path = 0
video_save_path = ""
video_fps = 25.0
# ----------------------------------------------------------------------------------------------------------#
# test_interval 用于指定测量fps的时候图片检测的次数。理论上test_interval越大fps越准确。
# fps_image_path 用于指定测试的fps图片
#
# test_interval和fps_image_path仅在mode='fps'有效
# ----------------------------------------------------------------------------------------------------------#
test_interval = 100
fps_image_path = "img/street.jpg"
# -------------------------------------------------------------------------#
# dir_origin_path 指定了用于检测的图片的文件夹路径
# dir_save_path 指定了检测完图片的保存路径
#
# dir_origin_path和dir_save_path仅在mode='dir_predict'时有效
# -------------------------------------------------------------------------#
dir_origin_path = "img/"
dir_save_path = "img_out/"
# -------------------------------------------------------------------------#
# heatmap_save_path 热力图的保存路径默认保存在model_data下
#
# heatmap_save_path仅在mode='heatmap'有效
# -------------------------------------------------------------------------#
heatmap_save_path = "model_data/heatmap_vision.png"
# -------------------------------------------------------------------------#
# simplify 使用Simplify onnx
# onnx_save_path 指定了onnx的保存路径
# -------------------------------------------------------------------------#
simplify = True
onnx_save_path = "model_data/models.onnx"
if mode == "predict":
'''
1如果想要进行检测完的图片的保存利用r_image.save("img.jpg")即可保存直接在predict.py里进行修改即可
2如果想要获得预测框的坐标可以进入yolo.detect_image函数在绘图部分读取topleftbottomright这四个值
3如果想要利用预测框截取下目标可以进入yolo.detect_image函数在绘图部分利用获取到的topleftbottomright这四个值
在原图上利用矩阵的方式进行截取
4如果想要在预测图上写额外的字比如检测到的特定目标的数量可以进入yolo.detect_image函数在绘图部分对predicted_class进行判断
比如判断if predicted_class == 'car': 即可判断当前目标是否为车然后记录数量即可利用draw.text即可写字
'''
while True:
img = input('Input image filename:')
# img/street.jpg
# img/street_a3.jpg
try:
image = Image.open(img)
except:
print('Open Error! Try again!')
continue
else:
r_image = yolo.detect_image(image, crop=crop, count=count)
# r_image.show()
r_image.save("diffusion.png")
elif mode == "video":
capture = cv2.VideoCapture(video_path)
if video_save_path != "":
fourcc = cv2.VideoWriter_fourcc(*'XVID')
size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
out = cv2.VideoWriter(video_save_path, fourcc, video_fps, size)
ref, frame = capture.read()
if not ref:
raise ValueError("未能正确读取摄像头(视频),请注意是否正确安装摄像头(是否正确填写视频路径)。")
fps = 0.0
while (True):
t1 = time.time()
# 读取某一帧
ref, frame = capture.read()
if not ref:
break
# 格式转变BGRtoRGB
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# 转变成Image
frame = Image.fromarray(np.uint8(frame))
# 进行检测
frame = np.array(yolo.detect_image(frame))
# RGBtoBGR满足opencv显示格式
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
fps = (fps + (1. / (time.time() - t1))) / 2
print("fps= %.2f" % (fps))
frame = cv2.putText(frame, "fps= %.2f" % (fps), (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow("video", frame)
c = cv2.waitKey(1) & 0xff
if video_save_path != "":
out.write(frame)
if c == 27:
capture.release()
break
print("Video Detection Done!")
capture.release()
if video_save_path != "":
print("Save processed video to the path :" + video_save_path)
out.release()
cv2.destroyAllWindows()
elif mode == "fps":
img = Image.open(fps_image_path)
tact_time = yolo.get_FPS(img, test_interval)
print(str(tact_time) + ' seconds, ' + str(1 / tact_time) + 'FPS, @batch_size 1')
elif mode == "dir_predict":
import os
from tqdm import tqdm
img_names = os.listdir(dir_origin_path)
for img_name in tqdm(img_names):
if img_name.lower().endswith(
('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
image_path = os.path.join(dir_origin_path, img_name)
image = Image.open(image_path)
r_image = yolo.detect_image(image)
if not os.path.exists(dir_save_path):
os.makedirs(dir_save_path)
r_image.save(os.path.join(dir_save_path, img_name.replace(".jpg", ".png")), quality=95, subsampling=0)
elif mode == "heatmap":
while True:
img = input('Input image filename:')
try:
image = Image.open(img)
except:
print('Open Error! Try again!')
continue
else:
yolo.detect_heatmap(image, heatmap_save_path)
elif mode == "export_onnx":
yolo.convert_to_onnx(simplify, onnx_save_path)
else:
raise AssertionError(
"Please specify the correct mode: 'predict', 'video', 'fps', 'heatmap', 'export_onnx', 'dir_predict'.")

109
predict_with_windows.py Normal file
View File

@ -0,0 +1,109 @@
import time
import pyautogui
import cv2
import numpy as np
from PIL import Image
from yolo import YOLO
if __name__ == "__main__":
yolo = YOLO()
# ----------------------------------------------------------------------------------------------------------#
# mode用于指定测试的模式
# 'predict' 表示单张图片预测,如果想对预测过程进行修改,如保存图片,截取对象等,可以先看下方详细的注释
# 'video' 表示视频检测,可调用摄像头或者视频进行检测,详情查看下方注释。
# 'fps' 表示测试fps使用的图片是img里面的street.jpg详情查看下方注释。
# 'dir_predict' 表示遍历文件夹进行检测并保存。默认遍历img文件夹保存img_out文件夹详情查看下方注释。
# 'heatmap' 表示进行预测结果的热力图可视化,详情查看下方注释。
# 'export_onnx' 表示将模型导出为onnx需要pytorch1.7.1以上。
# ----------------------------------------------------------------------------------------------------------#
mode = "predict"
# -------------------------------------------------------------------------#
# crop 指定了是否在单张图片预测后对目标进行截取
# count 指定了是否进行目标的计数
# crop、count仅在mode='predict'时有效
# -------------------------------------------------------------------------#
crop = False
count = False
# ----------------------------------------------------------------------------------------------------------#
# video_path 用于指定视频的路径当video_path=0时表示检测摄像头
# 想要检测视频则设置如video_path = "xxx.mp4"即可代表读取出根目录下的xxx.mp4文件。
# video_save_path 表示视频保存的路径当video_save_path=""时表示不保存
# 想要保存视频则设置如video_save_path = "yyy.mp4"即可代表保存为根目录下的yyy.mp4文件。
# video_fps 用于保存的视频的fps
#
# video_path、video_save_path和video_fps仅在mode='video'时有效
# 保存视频时需要ctrl+c退出或者运行到最后一帧才会完成完整的保存步骤。
# ----------------------------------------------------------------------------------------------------------#
video_path = 0
video_save_path = ""
video_fps = 25.0
# ----------------------------------------------------------------------------------------------------------#
# test_interval 用于指定测量fps的时候图片检测的次数。理论上test_interval越大fps越准确。
# fps_image_path 用于指定测试的fps图片
#
# test_interval和fps_image_path仅在mode='fps'有效
# ----------------------------------------------------------------------------------------------------------#
test_interval = 100
fps_image_path = "img/street.jpg"
# -------------------------------------------------------------------------#
# dir_origin_path 指定了用于检测的图片的文件夹路径
# dir_save_path 指定了检测完图片的保存路径
#
# dir_origin_path和dir_save_path仅在mode='dir_predict'时有效
# -------------------------------------------------------------------------#
dir_origin_path = "img/"
dir_save_path = "img_out/"
# -------------------------------------------------------------------------#
# heatmap_save_path 热力图的保存路径默认保存在model_data下
#
# heatmap_save_path仅在mode='heatmap'有效
# -------------------------------------------------------------------------#
heatmap_save_path = "model_data/heatmap_vision.png"
# -------------------------------------------------------------------------#
# simplify 使用Simplify onnx
# onnx_save_path 指定了onnx的保存路径
# -------------------------------------------------------------------------#
simplify = True
onnx_save_path = "model_data/models.onnx"
if mode == "predict":
'''
1如果想要进行检测完的图片的保存利用r_image.save("img.jpg")即可保存直接在predict.py里进行修改即可
2如果想要获得预测框的坐标可以进入yolo.detect_image函数在绘图部分读取topleftbottomright这四个值
3如果想要利用预测框截取下目标可以进入yolo.detect_image函数在绘图部分利用获取到的topleftbottomright这四个值
在原图上利用矩阵的方式进行截取
4如果想要在预测图上写额外的字比如检测到的特定目标的数量可以进入yolo.detect_image函数在绘图部分对predicted_class进行判断
比如判断if predicted_class == 'car': 即可判断当前目标是否为车然后记录数量即可利用draw.text即可写字
'''
while True:
# img = pyautogui.screenshot(region=[300, 50, 200, 100]) # 分别代表:左上角坐标,宽高
# img = pyautogui.screenshot() # 分别代表:左上角坐标,宽高
# 对获取的图片转换成二维矩阵形式后再将RGB转成BGR
# 因为imshow,默认通道顺序是BGR而pyautogui默认是RGB所以要转换一下不然会有点问题
# img = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR)
# img/street.jpg
# img/street_a3.jpg
try:
time.sleep(0.3)
# image = Image.fromarray(np.asarray(pyautogui.screenshot(region=[1920/2, 300, 1920/2, 1080])))
image = Image.fromarray(np.asarray(pyautogui.screenshot()))
except:
print('Open Error! Try again!')
continue
else:
r_image = yolo.detect_image(image, crop=crop, count=count)
img = cv2.cvtColor(np.asarray(r_image), cv2.COLOR_RGB2BGR)
# img = cv2.resize(img, dsize=(1600, 860)) # (宽度,高度)
img = cv2.resize(img, dsize=(1920, 1080)) # (宽度,高度)
cv2.imshow("screen", img)
# time.sleep(1)
cv2.waitKey(1)
c = cv2.waitKey(1) & 0xff
# print(c)
if c == 113:
break

9
requirements.txt Normal file
View File

@ -0,0 +1,9 @@
scipy==1.2.1
numpy==1.17.0
matplotlib==3.1.2
opencv_python==4.1.2.30
torch==1.2.0
torchvision==0.4.0
tqdm==4.60.0
Pillow==8.2.0
h5py==2.10.0

34
summary.py Normal file
View File

@ -0,0 +1,34 @@
# --------------------------------------------#
# 该部分代码用于看网络结构
# --------------------------------------------#
import torch
# from thop import clever_format, profile
from torchsummary import summary
from nets.yolo import YoloBody
if __name__ == "__main__":
input_shape = [416, 416]
anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
num_classes = 80
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
m = YoloBody(anchors_mask, num_classes)
print(m)
print('-' * 80)
m = m.to(device)
summary(m, (3, input_shape[0], input_shape[1]))
# dummy_input = torch.randn(1, 3, input_shape[0], input_shape[1]).to(device)
# flops, params = profile(m.to(device), (dummy_input,), verbose=False)
# --------------------------------------------------------#
# flops * 2是因为profile没有将卷积作为两个operations
# 有些论文将卷积算乘法、加法两个operations。此时乘2
# 有些论文只考虑乘法的运算次数忽略加法。此时不乘2
# 本代码选择乘2参考YOLOX。
# --------------------------------------------------------#
# flops = flops * 2
# flops, params = clever_format([flops, params], "%.3f")
# print('Total GFLOPS: %s' % (flops))
# print('Total params: %s' % (params))

225
train_patch.py Normal file
View File

@ -0,0 +1,225 @@
"""
Training code for Adversarial patch training
"""
import PIL
from torch.utils.tensorboard import SummaryWriter
# import load_data
from tqdm import tqdm
from load_data import *  # author's note: the star import may cause duplicate-import problems
import gc
import matplotlib.pyplot as plt
from torch import autograd
from torchvision import transforms
import subprocess
import patch_config
import sys
import time
from yolo import YOLO
class PatchTrainer(object):
def __init__(self, mode):
self.config = patch_config.patch_configs[mode]() # 获取对应的配置类
# self.darknet_model = Darknet(self.config.cfgfile) # 加载yolo模型
# self.darknet_model.load_weights(self.config.weightfile) # 默认 YOLOv2 MS COCO weights person编号是0
self.darknet_model = YOLO().net
self.darknet_model = self.darknet_model.eval().cuda() # TODO: Why eval?
self.patch_applier = PatchApplier().cuda() # 对图像应用对抗补丁
self.patch_transformer = PatchTransformer().cuda() # 变换补丁到指定大小并产生抖动
# self.prob_extractor = MaxProbExtractor(0, 80, self.config).cuda() # 提取最大类别概率
self.prob_extractor = MaxProbExtractor(0, 1, self.config).cuda() # 提取最大类别概率
self.nps_calculator = NPSCalculator(self.config.printfile, self.config.patch_size).cuda() # 不可打印分数
self.total_variation = TotalVariation().cuda() # 计算补丁的所有变化程度
self.writer = self.init_tensorboard(mode)
def init_tensorboard(self, name=None):
subprocess.Popen(['tensorboard', '--logdir=runs'])
if name is not None:
time_str = time.strftime("%Y%m%d-%H%M%S")
return SummaryWriter(f'runs/{time_str}_{name}')
else:
return SummaryWriter()
def train(self):
"""
Optimize a patch to generate an adversarial example.
:return: Nothing
"""
img_size = self.darknet_model.height # 416
# print('batch_size:',batch_size)
batch_size = self.config.batch_size # 8
n_epochs = 200
# n_epochs = 5
# max_lab = 20 # label的最大长度
max_lab = 8
time_str = time.strftime("%Y%m%d-%H%M%S")
# Generate stating point
# adv_patch_cpu = self.generate_patch("gray") # 生成一个灰图初始化为0.5
adv_patch_cpu = self.read_image("saved_patches/patchnew0.jpg")
adv_patch_cpu.requires_grad_(True)
train_loader = torch.utils.data.DataLoader(
InriaDataset(self.config.img_dir, self.config.lab_dir, max_lab, img_size,
shuffle=True),
batch_size=batch_size,
shuffle=True,
num_workers=0)  # author's note: together with `from load_data import *` above this may cause duplicate-import issues
self.epoch_length = len(train_loader)
print(f'One epoch is {len(train_loader)}')
optimizer = optim.Adam([adv_patch_cpu], lr=self.config.start_learning_rate, amsgrad=True) # 更新的是那个补丁
scheduler = self.config.scheduler_factory(optimizer) # ICLR-2018年最佳论文提出的Adam改进版Amsgrad
et0 = time.time()
for epoch in range(n_epochs):
ep_det_loss = 0
ep_nps_loss = 0
ep_tv_loss = 0
ep_loss = 0
bt0 = time.time()
for i_batch, (img_batch, lab_batch) in tqdm(enumerate(train_loader), desc=f'Running epoch {epoch}',
total=self.epoch_length):
with autograd.detect_anomaly(): # 1.运行前向时开启异常检测功能,则在反向时会打印引起反向失败的前向操作堆栈 2.反向计算出现“nan”时引发异常
img_batch = img_batch.cuda() # 8, 3, 416, 416
lab_batch = lab_batch.cuda() # 8, 14, 5 为什么要把人数的标签补到14?
# print('TRAINING EPOCH %i, BATCH %i'%(epoch, i_batch))
adv_patch = adv_patch_cpu.cuda() # 3, 300, 300
adv_batch_t = self.patch_transformer(adv_patch, lab_batch, img_size, do_rotate=True, rand_loc=False)
p_img_batch = self.patch_applier(img_batch, adv_batch_t)
p_img_batch = F.interpolate(p_img_batch,
(self.darknet_model.height, self.darknet_model.width)) # 确保和图片大小一致
# print('++++++++++++p_img_batch:+++++++++++++',p_img_batch.shape)
img = p_img_batch[1, :, :, ]
img = transforms.ToPILImage()(img.detach().cpu())
# img.show()
outputs = self.darknet_model(p_img_batch) # 输入83416416 输出8425 13 13 ,其中425是5*(5+80)
max_prob = 0
nps = 0
tv = 0
for l in range(len(outputs)): # 三组不同分辨率大小的输出特征分别计算
output = outputs[l]
max_prob += self.prob_extractor(output)
nps += self.nps_calculator(adv_patch)
tv += self.total_variation(adv_patch)
nps_loss = nps * 0.01
tv_loss = tv * 2.5
det_loss = torch.mean(max_prob) # 把人的置值度当成损失
loss = det_loss + nps_loss + torch.max(tv_loss, torch.tensor(0.1).cuda())
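# Note (added for clarity): the total loss is det_loss (mean objectness the detector still assigns to
# patched images, the quantity being minimised), plus nps_loss (non-printability score, weighted 0.01)
# and tv_loss (total variation, weighted 2.5). torch.max(tv_loss, 0.1) means total variation is only
# penalised while it stays above 0.1; below that the term is constant and contributes no gradient.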
ep_det_loss += det_loss.detach().cpu().numpy()
ep_nps_loss += nps_loss.detach().cpu().numpy()
ep_tv_loss += tv_loss.detach().cpu().numpy()
ep_loss += loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
adv_patch_cpu.data.clamp_(0, 1) # keep patch in image range
bt1 = time.time()
if i_batch % 5 == 0:
iteration = self.epoch_length * epoch + i_batch
self.writer.add_scalar('total_loss', loss.detach().cpu().numpy(), iteration)
self.writer.add_scalar('loss/det_loss', det_loss.detach().cpu().numpy(), iteration)
self.writer.add_scalar('loss/nps_loss', nps_loss.detach().cpu().numpy(), iteration)
self.writer.add_scalar('loss/tv_loss', tv_loss.detach().cpu().numpy(), iteration)
self.writer.add_scalar('misc/epoch', epoch, iteration)
self.writer.add_scalar('misc/learning_rate', optimizer.param_groups[0]["lr"], iteration)
self.writer.add_image('patch', adv_patch_cpu, iteration)
if i_batch + 1 >= len(train_loader):
print('\n')
else:
del adv_batch_t, output, max_prob, det_loss, p_img_batch, nps_loss, tv_loss, loss
torch.cuda.empty_cache()
bt0 = time.time()
et1 = time.time()
ep_det_loss = ep_det_loss / len(train_loader)
ep_nps_loss = ep_nps_loss / len(train_loader)
ep_tv_loss = ep_tv_loss / len(train_loader)
ep_loss = ep_loss / len(train_loader)
# im = transforms.ToPILImage('RGB')(adv_patch_cpu)
# plt.imshow(im)
# plt.savefig(f'pics/{time_str}_{self.config.patch_name}_{epoch}.png')
scheduler.step(ep_loss)
if True:
print(' EPOCH NR: ', epoch),
print('EPOCH LOSS: ', ep_loss)
print(' DET LOSS: ', ep_det_loss)
print(' NPS LOSS: ', ep_nps_loss)
print(' TV LOSS: ', ep_tv_loss)
print('EPOCH TIME: ', et1 - et0)
# im = transforms.ToPILImage('RGB')(adv_patch_cpu)
# plt.imshow(im)
# plt.show()
# im.save("saved_patches/patchnew1.jpg")
im = transforms.ToPILImage('RGB')(adv_patch_cpu)
if epoch >= 3:
im.save(f"saved_patches/patchnew1_t1_{epoch}_{time_str}.jpg")
del adv_batch_t, output, max_prob, det_loss, p_img_batch, nps_loss, tv_loss, loss
torch.cuda.empty_cache()
et0 = time.time()
def generate_patch(self, type):
"""
Generate a random patch as a starting point for optimization.
:param type: Can be 'gray' or 'random'. Whether to generate a gray or a random patch.
:return:
"""
if type == 'gray':
adv_patch_cpu = torch.full((3, self.config.patch_size, self.config.patch_size), 0.5)
elif type == 'random':
adv_patch_cpu = torch.rand((3, self.config.patch_size, self.config.patch_size))
return adv_patch_cpu
def read_image(self, path):
"""
Read an input image to be used as a patch
:param path: Path to the image to be read.
:return: Returns the transformed patch as a pytorch Tensor.
"""
patch_img = Image.open(path).convert('RGB')
tf = transforms.Resize((self.config.patch_size, self.config.patch_size))
patch_img = tf(patch_img)
tf = transforms.ToTensor()
adv_patch_cpu = tf(patch_img)
return adv_patch_cpu
def main():
if len(sys.argv) != 2:
print('You need to supply (only) a configuration mode.')
print('Possible modes are:')
print(patch_config.patch_configs) # 一般传入paper_obj
# print('sys.argv:',sys.argv)
trainer = PatchTrainer(sys.argv[1])
trainer.train()
if __name__ == '__main__':
main()

1
utils/__init__.py Normal file
View File

@ -0,0 +1 @@
#

241
utils/callbacks.py Normal file
View File

@ -0,0 +1,241 @@
import datetime
import os
import torch
import matplotlib
import scipy.signal
from matplotlib import pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import shutil
import numpy as np
from PIL import Image
from tqdm import tqdm
from .utils import cvtColor, preprocess_input, resize_image
from .utils_bbox import DecodeBox
from .utils_map import get_coco_map, get_map
matplotlib.use('Agg')
class LossHistory():
def __init__(self, log_dir, model, input_shape):
self.log_dir = log_dir
self.losses = []
self.val_loss = []
os.makedirs(self.log_dir)
self.writer = SummaryWriter(self.log_dir)
try:
dummy_input = torch.randn(2, 3, input_shape[0], input_shape[1])
self.writer.add_graph(model, dummy_input)
except:
pass
def append_loss(self, epoch, loss, val_loss):
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.losses.append(loss)
self.val_loss.append(val_loss)
with open(os.path.join(self.log_dir, "epoch_loss.txt"), 'a') as f:
f.write(str(loss))
f.write("\n")
with open(os.path.join(self.log_dir, "epoch_val_loss.txt"), 'a') as f:
f.write(str(val_loss))
f.write("\n")
self.writer.add_scalar('loss', loss, epoch)
self.writer.add_scalar('val_loss', val_loss, epoch)
self.loss_plot()
def loss_plot(self):
iters = range(len(self.losses))
plt.figure()
plt.plot(iters, self.losses, 'red', linewidth=2, label='train loss')
plt.plot(iters, self.val_loss, 'coral', linewidth=2, label='val loss')
try:
if len(self.losses) < 25:
num = 5
else:
num = 15
plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle='--', linewidth=2,
label='smooth train loss')
plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle='--', linewidth=2,
label='smooth val loss')
except:
pass
plt.grid(True)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc="upper right")
plt.savefig(os.path.join(self.log_dir, "epoch_loss.png"))
plt.cla()
plt.close("all")
class EvalCallback():
def __init__(self, net, input_shape, anchors, anchors_mask, class_names, num_classes, val_lines, log_dir, cuda, \
map_out_path=".temp_map_out", max_boxes=100, confidence=0.05, nms_iou=0.5, letterbox_image=True,
MINOVERLAP=0.5, eval_flag=True, period=1):
super(EvalCallback, self).__init__()
self.net = net
self.input_shape = input_shape
self.anchors = anchors
self.anchors_mask = anchors_mask
self.class_names = class_names
self.num_classes = num_classes
self.val_lines = val_lines
self.log_dir = log_dir
self.cuda = cuda
self.map_out_path = map_out_path
self.max_boxes = max_boxes
self.confidence = confidence
self.nms_iou = nms_iou
self.letterbox_image = letterbox_image
self.MINOVERLAP = MINOVERLAP
self.eval_flag = eval_flag
self.period = period
self.bbox_util = DecodeBox(self.anchors, self.num_classes, (self.input_shape[0], self.input_shape[1]),
self.anchors_mask)
self.maps = [0]
self.epoches = [0]
if self.eval_flag:
with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
f.write(str(0))
f.write("\n")
def get_map_txt(self, image_id, image, class_names, map_out_path):
f = open(os.path.join(map_out_path, "detection-results/" + image_id + ".txt"), "w", encoding='utf-8')
image_shape = np.array(np.shape(image)[0:2])
# ---------------------------------------------------------#
# 在这里将图像转换成RGB图像防止灰度图在预测时报错。
# 代码仅仅支持RGB图像的预测所有其它类型的图像都会转化成RGB
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
# 给图像增加灰条实现不失真的resize
# 也可以直接resize进行识别
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
# 添加上batch_size维度
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
# 将图像输入网络当中进行预测!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
# 将预测框进行堆叠,然后进行非极大抑制
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
if results[0] is None:
return
top_label = np.array(results[0][:, 6], dtype='int32')
top_conf = results[0][:, 4] * results[0][:, 5]
top_boxes = results[0][:, :4]
top_100 = np.argsort(top_conf)[::-1][:self.max_boxes]  # keep at most max_boxes detections, ranked by confidence
top_boxes = top_boxes[top_100]
top_conf = top_conf[top_100]
top_label = top_label[top_100]
for i, c in list(enumerate(top_label)):
predicted_class = self.class_names[int(c)]
box = top_boxes[i]
score = str(top_conf[i])
top, left, bottom, right = box
if predicted_class not in class_names:
continue
f.write("%s %s %s %s %s %s\n" % (
predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)), str(int(bottom))))
f.close()
return
def on_epoch_end(self, epoch, model_eval):
if epoch % self.period == 0 and self.eval_flag:
self.net = model_eval
if not os.path.exists(self.map_out_path):
os.makedirs(self.map_out_path)
if not os.path.exists(os.path.join(self.map_out_path, "ground-truth")):
os.makedirs(os.path.join(self.map_out_path, "ground-truth"))
if not os.path.exists(os.path.join(self.map_out_path, "detection-results")):
os.makedirs(os.path.join(self.map_out_path, "detection-results"))
print("Get map.")
for annotation_line in tqdm(self.val_lines):
line = annotation_line.split()
image_id = os.path.basename(line[0]).split('.')[0]
# ------------------------------#
# 读取图像并转换成RGB图像
# ------------------------------#
image = Image.open(line[0])
# ------------------------------#
# 获得预测框
# ------------------------------#
gt_boxes = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
# ------------------------------#
# 获得预测txt
# ------------------------------#
self.get_map_txt(image_id, image, self.class_names, self.map_out_path)
# ------------------------------#
# 获得真实框txt
# ------------------------------#
with open(os.path.join(self.map_out_path, "ground-truth/" + image_id + ".txt"), "w") as new_f:
for box in gt_boxes:
left, top, right, bottom, obj = box
obj_name = self.class_names[obj]
new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
print("Calculate Map.")
try:
temp_map = get_coco_map(class_names=self.class_names, path=self.map_out_path)[1]
except:
temp_map = get_map(self.MINOVERLAP, False, path=self.map_out_path)
self.maps.append(temp_map)
self.epoches.append(epoch)
with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
f.write(str(temp_map))
f.write("\n")
plt.figure()
plt.plot(self.epoches, self.maps, 'red', linewidth=2, label='train map')
plt.grid(True)
plt.xlabel('Epoch')
plt.ylabel('Map %s' % str(self.MINOVERLAP))
plt.title('A Map Curve')
plt.legend(loc="upper right")
plt.savefig(os.path.join(self.log_dir, "epoch_map.png"))
plt.cla()
plt.close("all")
print("Get map done.")
shutil.rmtree(self.map_out_path)

170
utils/dataloader.py Normal file
View File

@ -0,0 +1,170 @@
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data.dataset import Dataset
from utils.utils import cvtColor, preprocess_input
class YoloDataset(Dataset):
def __init__(self, annotation_lines, input_shape, num_classes, train):
super(YoloDataset, self).__init__()
self.annotation_lines = annotation_lines # 记录训练集或测试集的文件的路径,这个是可以全部载入的
self.input_shape = input_shape # 这里是 [416, 416]
self.num_classes = num_classes # 这里是20
self.length = len(self.annotation_lines) # 数据的数量
self.train = train # 是否是训练集的标记
def __len__(self):
return self.length
def __getitem__(self, index):
index = index % self.length
# ---------------------------------------------------#
# 训练时进行数据的随机增强
# 验证时不进行数据的随机增强
# ---------------------------------------------------#
image, box = self.get_random_data(self.annotation_lines[index], self.input_shape[0:2],
random=self.train) # 自定义的数据增强
image = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1)) # 像素值归到0~1之间然后变换坐标轴
box = np.array(box, dtype=np.float32) # 转为numpy。np中常用的是创建新类型的array。
if len(box) != 0:
box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1] # 把框的坐标归一化
box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]
box[:, 2:4] = box[:, 2:4] - box[:, 0:2] # box第01维记录中心点 box第23维记录宽高
box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2 # box第01维记录中心点
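# Worked example (illustrative): a normalized corner box [x1, y1, x2, y2] = [0.2, 0.2, 0.6, 0.4]
# becomes width/height [0.4, 0.2] and centre [0.4, 0.3], i.e. the (cx, cy, w, h) format that
# get_target in the loss expects.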
return image, box
def rand(self, a=0, b=1):
return np.random.rand() * (b - a) + a
def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
line = annotation_line.split() # 以空格、回车等分隔字符串
# ------------------------------#
# 读取图像并转换成RGB图像
# ------------------------------#
image = Image.open(line[0]) # line[0] 是图片的地址
image = cvtColor(image) # 这里啥也没干
# ------------------------------#
# 获得图像的高宽与目标高宽
# ------------------------------#
iw, ih = image.size # 获取图像的原始尺寸
h, w = input_shape
# ------------------------------#
# 获得预测框
# ------------------------------#
box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]]) # 从python二维矩阵转到 numpy二维矩阵
if not random: # 没进入这里面
scale = min(w / iw, h / ih)
nw = int(iw * scale)
nh = int(ih * scale)
dx = (w - nw) // 2
dy = (h - nh) // 2
# ---------------------------------#
# 将图像多余的部分加上灰条
# ---------------------------------#
image = image.resize((nw, nh), Image.BICUBIC)
new_image = Image.new('RGB', (w, h), (128, 128, 128))
new_image.paste(image, (dx, dy))
image_data = np.array(new_image, np.float32)
# ---------------------------------#
# 对真实框进行调整
# ---------------------------------#
if len(box) > 0:
np.random.shuffle(box)
box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
box[:, 0:2][box[:, 0:2] < 0] = 0
box[:, 2][box[:, 2] > w] = w
box[:, 3][box[:, 3] > h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w > 1, box_h > 1)] # discard invalid box
return image_data, box
# ------------------------------------------#
# 对原始图像进行缩放并且进行长和宽的扭曲
# ------------------------------------------#
new_ar = iw / ih * self.rand(1 - jitter, 1 + jitter) / self.rand(1 - jitter, 1 + jitter) # (iw*随机) / (ih*随机)
scale = self.rand(.25, 2) # 随机一个缩放比例
if new_ar < 1: # 原图高大
nh = int(scale * h) # 新图先缩放高
nw = int(nh * new_ar)
else: # 原图宽大
nw = int(scale * w) # 新的宽从 预期宽中 乘以随机的比例
nh = int(nw / new_ar) # 新的宽、高比,也是 new_ar, 也就是也是宽大
image = image.resize((nw, nh), Image.BICUBIC)
# ------------------------------------------#
# 将图像多余的部分加上灰条
# ------------------------------------------#
dx = int(self.rand(0, w - nw)) # 在(0, w - nw)找一个点作为新图的放置点
dy = int(self.rand(0, h - nh))
new_image = Image.new('RGB', (w, h), (128, 128, 128)) # 画一个 412, 412大小的灰图
new_image.paste(image, (dx, dy)) # 在这里看看两者的区别
image = new_image
# ------------------------------------------#
# 翻转图像
# ------------------------------------------#
flip = self.rand() < .5
if flip:
image = image.transpose(Image.FLIP_LEFT_RIGHT)
image_data = np.array(image, np.uint8)
# ---------------------------------#
# 对图像进行色域变换
# 计算色域变换的参数
# ---------------------------------#
r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
# ---------------------------------#
# 将图像转到HSV上
# ---------------------------------#
hue, sat, val = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
dtype = image_data.dtype
# ---------------------------------#
# 应用变换
# ---------------------------------#
x = np.arange(0, 256, dtype=r.dtype)
lut_hue = ((x * r[0]) % 180).astype(dtype)
lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
# LUT是look-up table查找表的意思,cv2.LUT(src, lut, dst=None)的作用是对输入的src执行查找表lut转换
image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB) # image_data在这里还是unit8类型
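# Note (illustrative): OpenCV stores hue in [0, 180) for uint8 images, which is why lut_hue wraps with
# % 180 while saturation and value are clipped to [0, 255]. For example, with r[0] = 1.1 a hue of 170
# maps to (170 * 1.1) % 180 = 7, wrapping around the hue circle instead of saturating.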
# ---------------------------------#
# 对真实框进行调整
# ---------------------------------#
if len(box) > 0: # 如果有box
np.random.shuffle(box)
box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx # 所有行的第0列和2列也就是 x 坐标, 除以iw找到占原图的比例再乘以nw是新图的比例再加dx是新图中的偏移
box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
if flip:
box[:, [0, 2]] = w - box[:, [2, 0]]  # horizontal flip: x becomes w - x, and x1/x2 swap so x1 stays the smaller value
box[:, 0:2][box[:, 0:2] < 0] = 0  # clip top-left coordinates that fall outside the canvas to 0
box[:, 2][box[:, 2] > w] = w  # clip the right edge to the canvas width
box[:, 3][box[:, 3] > h] = h  # clip the bottom edge to the canvas height
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w > 1, box_h > 1)]  # boxes that collapse to less than 1 pixel after clipping are dropped, which also removes boxes pushed fully outside the image
return image_data, box # box依然是左上角和右下角的形式
# DataLoader中collate_fn使用
def yolo_dataset_collate(batch):
images = [] # 这是是一个batch大小的列表每一项是 image_data, box。需要把image放一堆box放一堆
bboxes = []
for img, box in batch:
images.append(img) # images在这里已经是0~1的float32类型了
bboxes.append(box)
images = torch.from_numpy(np.array(images)).type(torch.FloatTensor) # 转换为 batch_size, C, H, W 的数据
bboxes = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in bboxes] # 转换为一个列表每个元素是一组二维Tensor
return images, bboxes

79
utils/utils.py Normal file
View File

@ -0,0 +1,79 @@
import numpy as np
from PIL import Image
# ---------------------------------------------------------#
# 将图像转换成RGB图像防止灰度图在预测时报错。
# 代码仅仅支持RGB图像的预测所有其它类型的图像都会转化成RGB
# ---------------------------------------------------------#
def cvtColor(image):
if len(np.shape(image)) == 3 and np.shape(image)[2] == 3:
return image
else:
image = image.convert('RGB')
return image
# ---------------------------------------------------#
# 对输入图像进行resize
# ---------------------------------------------------#
def resize_image(image, size, letterbox_image):
iw, ih = image.size
w, h = size
if letterbox_image:
scale = min(w / iw, h / ih)
nw = int(iw * scale)
nh = int(ih * scale)
image = image.resize((nw, nh), Image.BICUBIC)
new_image = Image.new('RGB', size, (128, 128, 128))
new_image.paste(image, ((w - nw) // 2, (h - nh) // 2))
else:
new_image = image.resize((w, h), Image.BICUBIC) # 这里直接用了缩放,而不是加灰条的形式
return new_image
# ---------------------------------------------------#
# 获得类
# ---------------------------------------------------#
def get_classes(classes_path):
with open(classes_path, encoding='utf-8') as f:
class_names = f.readlines()
class_names = [c.strip() for c in class_names]
return class_names, len(class_names)
# ---------------------------------------------------#
# 获得先验框
# ---------------------------------------------------#
def get_anchors(anchors_path):
'''loads the anchors from a file'''
with open(anchors_path, encoding='utf-8') as f:
anchors = f.readline()
anchors = [float(x) for x in anchors.split(',')]
anchors = np.array(anchors).reshape(-1, 2)
return anchors, len(anchors)
# ---------------------------------------------------#
# 获得学习率
# ---------------------------------------------------#
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def preprocess_input(image):
image /= 255.0
return image
def show_config(**kwargs):
print('Configurations:')
print('-' * 70)
print('|%25s | %40s|' % ('keys', 'values'))
print('-' * 70)
for key, value in kwargs.items():
print('|%25s | %40s|' % (str(key), str(value)))
print('-' * 70)

232
utils/utils_bbox.py Normal file
View File

@ -0,0 +1,232 @@
import torch
import torch.nn as nn
from torchvision.ops import nms
import numpy as np
class DecodeBox():
def __init__(self, anchors, num_classes, input_shape, anchors_mask=[[6, 7, 8], [3, 4, 5], [0, 1, 2]]):
super(DecodeBox, self).__init__()
self.anchors = anchors
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.input_shape = input_shape
# -----------------------------------------------------------#
# 13x13的特征层对应的anchor是[116,90],[156,198],[373,326]
# 26x26的特征层对应的anchor是[30,61],[62,45],[59,119]
# 52x52的特征层对应的anchor是[10,13],[16,30],[33,23]
# -----------------------------------------------------------#
self.anchors_mask = anchors_mask
def decode_box(self, inputs):
outputs = []
for i, input in enumerate(inputs):
# -----------------------------------------------#
# 输入的input一共有三个他们的shape分别是
# batch_size, 255, 13, 13
# batch_size, 255, 26, 26
# batch_size, 255, 52, 52
# -----------------------------------------------#
batch_size = input.size(0)
input_height = input.size(2)
input_width = input.size(3)
# -----------------------------------------------#
# 输入为416x416时
# stride_h = stride_w = 32、16、8
# -----------------------------------------------#
stride_h = self.input_shape[0] / input_height
stride_w = self.input_shape[1] / input_width
# -------------------------------------------------#
# 此时获得的scaled_anchors大小是相对于特征层的
# -------------------------------------------------#
scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in
self.anchors[self.anchors_mask[i]]]
# -----------------------------------------------#
# 输入的input一共有三个他们的shape分别是
# batch_size, 3, 13, 13, 85
# batch_size, 3, 26, 26, 85
# batch_size, 3, 52, 52, 85
# -----------------------------------------------#
prediction = input.view(batch_size, len(self.anchors_mask[i]),
self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
# 调整为 13131325 的形状
# -----------------------------------------------#
# 先验框的中心位置的调整参数
# -----------------------------------------------#
x = torch.sigmoid(prediction[..., 0])
y = torch.sigmoid(prediction[..., 1])
# -----------------------------------------------#
# 先验框的宽高调整参数
# -----------------------------------------------#
w = prediction[..., 2]
h = prediction[..., 3]
# -----------------------------------------------#
# 获得置信度,是否有物体
# -----------------------------------------------#
conf = torch.sigmoid(prediction[..., 4])
# -----------------------------------------------#
# 种类置信度
# -----------------------------------------------#
pred_cls = torch.sigmoid(prediction[..., 5:])
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
# ----------------------------------------------------------#
# 生成网格,先验框中心,网格左上角
# batch_size,3,13,13
# ----------------------------------------------------------#
grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat(
batch_size * len(self.anchors_mask[i]), 1, 1).view(x.shape).type(FloatTensor)
grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat(
batch_size * len(self.anchors_mask[i]), 1, 1).view(y.shape).type(FloatTensor)
# ----------------------------------------------------------#
# 按照网格格式生成先验框的宽高
# batch_size,3,13,13
# ----------------------------------------------------------#
anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
# ----------------------------------------------------------#
# 利用预测结果对先验框进行调整
# 首先调整先验框的中心,从先验框中心向右下角偏移 # ?从先验框左上角向右下角偏移?
# 再调整先验框的宽高。
# ----------------------------------------------------------#
pred_boxes = FloatTensor(prediction[..., :4].shape)
pred_boxes[..., 0] = x.data + grid_x
pred_boxes[..., 1] = y.data + grid_y
pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
# ----------------------------------------------------------#
# 将输出结果归一化成小数的形式
# ----------------------------------------------------------#
_scale = torch.Tensor([input_width, input_height, input_width, input_height]).type(FloatTensor)
output = torch.cat((pred_boxes.view(batch_size, -1, 4) / _scale,
conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1)
# output的shape是 batch_size, -1, attr(25)
outputs.append(output.data)
return outputs
def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
# -----------------------------------------------------------------#
# 把y轴放前面是因为方便预测框和图像的宽高进行相乘
# -----------------------------------------------------------------#
box_yx = box_xy[..., ::-1]
box_hw = box_wh[..., ::-1]
input_shape = np.array(input_shape)
image_shape = np.array(image_shape)
if letterbox_image:
# -----------------------------------------------------------------#
# 这里求出来的offset是图像有效区域相对于图像左上角的偏移情况
# new_shape指的是宽高缩放情况
# -----------------------------------------------------------------#
new_shape = np.round(image_shape * np.min(input_shape / image_shape))
offset = (input_shape - new_shape) / 2. / input_shape
scale = input_shape / new_shape
box_yx = (box_yx - offset) * scale
box_hw *= scale
box_mins = box_yx - (box_hw / 2.)
box_maxes = box_yx + (box_hw / 2.)
boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]],
axis=-1)
boxes *= np.concatenate([image_shape, image_shape], axis=-1)
return boxes
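# Worked example (illustrative): for input_shape = (416, 416) and an original image of shape
# (h, w) = (832, 416), the letterbox scale is 0.5, new_shape = (416, 208), offset = (0, 0.25) and
# scale = (1, 2) in (y, x) order, i.e. 104 gray pixels were padded on each side horizontally; the
# code above removes that offset and rescaling before mapping boxes back to original-image pixels.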
def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5,
nms_thres=0.4):
# ----------------------------------------------------------#
# 将预测结果的格式转换成左上角右下角的格式。
# prediction [batch_size, num_anchors, 85]
# ----------------------------------------------------------#
box_corner = prediction.new(prediction.shape)
box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
prediction[:, :, :4] = box_corner[:, :, :4]
output = [None for _ in range(len(prediction))]
for i, image_pred in enumerate(prediction):
# ----------------------------------------------------------#
# 对种类预测部分取max。 # image_pred 是在prediction中以0维度迭代
# class_conf [num_anchors, 1] 种类置信度
# class_pred [num_anchors, 1] 种类 image_pred[:, 5:5 + num_classes] 是取出类别
# ----------------------------------------------------------#
class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
# ----------------------------------------------------------#
# 利用置信度进行第一轮筛选
# ----------------------------------------------------------#
conf_mask = (image_pred[:, 4] * class_conf[:, 0] >= conf_thres).squeeze()
# ----------------------------------------------------------#
# 根据置信度进行预测结果的筛选
# ----------------------------------------------------------#
image_pred = image_pred[conf_mask]
class_conf = class_conf[conf_mask]
class_pred = class_pred[conf_mask]
if not image_pred.size(0):
continue # 如果没有剩下类别,就判断下一张图片
# -------------------------------------------------------------------------#
# detections [num_anchors, 7]
# 7的内容为x1, y1, x2, y2, obj_conf, class_conf, class_pred
# -------------------------------------------------------------------------#
detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
# ------------------------------------------#
# 获得预测结果中包含的所有种类
# ------------------------------------------#
unique_labels = detections[:, -1].cpu().unique()
if prediction.is_cuda:
unique_labels = unique_labels.cuda()
detections = detections.cuda()
for c in unique_labels:
# ------------------------------------------#
# 获得某一类得分筛选后全部的预测结果
# ------------------------------------------#
detections_class = detections[detections[:, -1] == c]
# ------------------------------------------#
# 使用官方自带的非极大抑制会速度更快一些!
# ------------------------------------------#
keep = nms(
detections_class[:, :4],
detections_class[:, 4] * detections_class[:, 5],
nms_thres
)
max_detections = detections_class[keep]
# # sort by objectness confidence
# _, conf_sort_index = torch.sort(detections_class[:, 4]*detections_class[:, 5], descending=True)
# detections_class = detections_class[conf_sort_index]
# # perform non-maximum suppression
# max_detections = []
# while detections_class.size(0):
# # take the highest-confidence box of this class, then walk down the list and drop boxes whose overlap exceeds nms_thres
# max_detections.append(detections_class[0].unsqueeze(0))
# if len(detections_class) == 1:
# break
# ious = bbox_iou(max_detections[-1], detections_class[1:])
# detections_class = detections_class[1:][ious < nms_thres]
# # stack the kept boxes
# max_detections = torch.cat(max_detections).data
# Add max detections to outputs
output[i] = max_detections if output[i] is None else torch.cat((output[i], max_detections))
if output[i] is not None:
output[i] = output[i].cpu().numpy()
box_xy, box_wh = (output[i][:, 0:2] + output[i][:, 2:4]) / 2, output[i][:, 2:4] - output[i][:, 0:2]
output[i][:, :4] = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
return output
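# Layout note (a summary of the code above, not an addition to the algorithm):
# each entry of `output` is either None or a numpy array of shape [n, 7]; after
# yolo_correct_boxes the first four columns are top, left, bottom, right in
# original-image pixels, followed by obj_conf, class_conf and the class index.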

151
utils/utils_fit.py Normal file

@ -0,0 +1,151 @@
import os
import torch
from tqdm import tqdm
from utils.utils import get_lr
def fit_one_epoch(model_train, model, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step,
epoch_step_val, gen, gen_val, Epoch, cuda, fp16, scaler, save_period, save_dir, local_rank=0):
loss = 0
val_loss = 0
if local_rank == 0:
print('Start Train')
pbar = tqdm(total=epoch_step, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3)
model_train.train()  # put every module into training mode
for iteration, batch in enumerate(gen):
if iteration >= epoch_step:  # stop once epoch_step batches have been processed in this epoch
break
images, targets = batch[0], batch[1]  # the targets are already normalized
with torch.no_grad():
if cuda:
images = images.cuda(local_rank)
targets = [ann.cuda(local_rank) for ann in
targets]  # targets is a Python list of tensors; move each tensor to the GPU, the result is still a Python list
# ----------------------#
#   Zero the gradients
# ----------------------#
optimizer.zero_grad()
if not fp16:
# ----------------------#
#   Forward pass
# ----------------------#
outputs = model_train(images)
loss_value_all = 0
# ----------------------#
#   Compute the loss
# ----------------------#
for l in range(len(outputs)):  # compute the loss separately for the three output feature maps
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
# ----------------------#
#   Backward pass
# ----------------------#
loss_value.backward()
optimizer.step()
else:  # mixed-precision path, only taken when fp16 is enabled
from torch.cuda.amp import autocast
with autocast():
# ----------------------#
#   Forward pass
# ----------------------#
outputs = model_train(images)
loss_value_all = 0
# ----------------------#
#   Compute the loss
# ----------------------#
for l in range(len(outputs)):
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
# ----------------------#
#   Backward pass
# ----------------------#
scaler.scale(loss_value).backward()
scaler.step(optimizer)
scaler.update()
loss += loss_value.item()
# # for debugging only: begin
# if iteration > 2:
# break
# # for debugging only: end
if local_rank == 0:
pbar.set_postfix(**{'loss': loss / (iteration + 1),
'lr': get_lr(optimizer)})
pbar.update(1)
if local_rank == 0:
pbar.close()
print('Finish Train')
print('Start Validation')
pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3)
model_train.eval()
for iteration, batch in enumerate(gen_val):
if iteration >= epoch_step_val:
break
images, targets = batch[0], batch[1]
with torch.no_grad():
if cuda:
images = images.cuda(local_rank)
targets = [ann.cuda(local_rank) for ann in targets]
# ----------------------#
#   Zero the gradients
# ----------------------#
optimizer.zero_grad()
# ----------------------#
#   Forward pass
# ----------------------#
outputs = model_train(images)
loss_value_all = 0
# ----------------------#
#   Compute the loss
# ----------------------#
for l in range(len(outputs)):
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
val_loss += loss_value.item()
# # for debugging only: begin
# if iteration > 2:
# break
# # for debugging only: end
if local_rank == 0:
pbar.set_postfix(**{'val_loss': val_loss / (iteration + 1)})
pbar.update(1)
if local_rank == 0:
pbar.close()
print('Finish Validation')
loss_history.append_loss(epoch + 1, loss / epoch_step, val_loss / epoch_step_val)
eval_callback.on_epoch_end(epoch + 1, model_train)
print('Epoch:' + str(epoch + 1) + '/' + str(Epoch))
print('Total Loss: %.3f || Val Loss: %.3f ' % (loss / epoch_step, val_loss / epoch_step_val))
# -----------------------------------------------#
#   Save the weights
# -----------------------------------------------#
if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch:
torch.save(model.state_dict(), os.path.join(save_dir, "ep%03d-loss%.3f-val_loss%.3f.pth" % (
epoch + 1, loss / epoch_step, val_loss / epoch_step_val)))
if len(loss_history.val_loss) <= 1 or (val_loss / epoch_step_val) <= min(loss_history.val_loss):
print('Save best model to best_epoch_weights.pth')
torch.save(model.state_dict(), os.path.join(save_dir, "best_epoch_weights.pth"))
torch.save(model.state_dict(), os.path.join(save_dir, "last_epoch_weights.pth"))
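# Hedged usage sketch (the surrounding training script is not shown here, so the
# variable names below are assumptions): a typical loop would call, once per epoch,
#   fit_one_epoch(model_train, model, yolo_loss, loss_history, eval_callback,
#                 optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val,
#                 Epoch, cuda, fp16, scaler, save_period, save_dir, local_rank)
# where gen / gen_val are the train / val DataLoaders and epoch_step is roughly
# the number of training samples divided by the batch size.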

963
utils/utils_map.py Normal file

@ -0,0 +1,963 @@
import glob
import json
import math
import operator
import os
import shutil
import sys
try:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
except:
pass
import cv2
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
'''
0,0 ------> x (width)
|
| (Left,Top)
| *_________
| | |
| |
y |_________|
(height) *
(Right,Bottom)
'''
def log_average_miss_rate(precision, fp_cumsum, num_images):
"""
log-average miss rate:
Calculated by averaging miss rates at 9 evenly spaced FPPI points
between 10^-2 and 10^0, in log-space.
output:
lamr | log-average miss rate
mr | miss rate
fppi | false positives per image
references:
[1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the
State of the Art." Pattern Analysis and Machine Intelligence, IEEE
Transactions on 34.4 (2012): 743 - 761.
"""
if precision.size == 0:
lamr = 0
mr = 1
fppi = 0
return lamr, mr, fppi
fppi = fp_cumsum / float(num_images)
mr = (1 - precision)
fppi_tmp = np.insert(fppi, 0, -1.0)
mr_tmp = np.insert(mr, 0, 1.0)
ref = np.logspace(-2.0, 0.0, num=9)
for i, ref_i in enumerate(ref):
j = np.where(fppi_tmp <= ref_i)[-1][-1]
ref[i] = mr_tmp[j]
lamr = math.exp(np.mean(np.log(np.maximum(1e-10, ref))))
return lamr, mr, fppi
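# Note on the computation above: ref = np.logspace(-2, 0, 9) gives nine FPPI
# reference points from 0.01 to 1.0; for each point the miss rate at the largest
# FPPI not exceeding it is taken, and lamr is the geometric mean of those values.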
"""
throw error and exit
"""
def error(msg):
print(msg)
sys.exit(0)
"""
check if the number is a float between 0.0 and 1.0
"""
def is_float_between_0_and_1(value):
try:
val = float(value)
if val > 0.0 and val < 1.0:
return True
else:
return False
except ValueError:
return False
"""
Calculate the AP given the recall and precision array
1st) We compute a version of the measured precision/recall curve with
precision monotonically decreasing
2nd) We compute the AP as the area under this curve by numerical integration.
"""
def voc_ap(rec, prec):
"""
--- Official matlab code VOC2012---
mrec=[0 ; rec ; 1];
mpre=[0 ; prec ; 0];
for i=numel(mpre)-1:-1:1
mpre(i)=max(mpre(i),mpre(i+1));
end
i=find(mrec(2:end)~=mrec(1:end-1))+1;
ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
"""
rec.insert(0, 0.0) # insert 0.0 at begining of list
rec.append(1.0) # insert 1.0 at end of list
mrec = rec[:]
prec.insert(0, 0.0) # insert 0.0 at begining of list
prec.append(0.0) # insert 0.0 at end of list
mpre = prec[:]
"""
This part makes the precision monotonically decreasing
(goes from the end to the beginning)
matlab: for i=numel(mpre)-1:-1:1
mpre(i)=max(mpre(i),mpre(i+1));
"""
for i in range(len(mpre) - 2, -1, -1):
mpre[i] = max(mpre[i], mpre[i + 1])
"""
This part creates a list of indexes where the recall changes
matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1;
"""
i_list = []
for i in range(1, len(mrec)):
if mrec[i] != mrec[i - 1]:
i_list.append(i) # if it was matlab would be i + 1
"""
The Average Precision (AP) is the area under the curve
(numerical integration)
matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
"""
ap = 0.0
for i in i_list:
ap += ((mrec[i] - mrec[i - 1]) * mpre[i])
return ap, mrec, mpre
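# Small worked example (illustrative numbers only): for rec = [0.5, 1.0] and
# prec = [1.0, 0.5] the padded curves become mrec = [0, 0.5, 1.0, 1.0] and
# mpre = [0, 1.0, 0.5, 0]; the monotonic pass turns mpre into [1.0, 1.0, 0.5, 0],
# the recall changes at indexes 1 and 2, so
# AP = (0.5 - 0) * 1.0 + (1.0 - 0.5) * 0.5 = 0.75.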
"""
Convert the lines of a file to a list
"""
def file_lines_to_list(path):
# open txt file lines to a list
with open(path) as f:
content = f.readlines()
# remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]
return content
"""
Draws text in image
"""
def draw_text_in_image(img, text, pos, color, line_width):
font = cv2.FONT_HERSHEY_PLAIN
fontScale = 1
lineType = 1
bottomLeftCornerOfText = pos
cv2.putText(img, text,
bottomLeftCornerOfText,
font,
fontScale,
color,
lineType)
text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0]
return img, (line_width + text_width)
"""
Plot - adjust axes
"""
def adjust_axes(r, t, fig, axes):
# get text width for re-scaling
bb = t.get_window_extent(renderer=r)
text_width_inches = bb.width / fig.dpi
# get axis width in inches
current_fig_width = fig.get_figwidth()
new_fig_width = current_fig_width + text_width_inches
proportion = new_fig_width / current_fig_width
# get axis limit
x_lim = axes.get_xlim()
axes.set_xlim([x_lim[0], x_lim[1] * proportion])
"""
Draw plot using Matplotlib
"""
def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color,
true_p_bar):
# sort the dictionary by decreasing value, into a list of tuples
sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1))
# unpacking the list of tuples into two lists
sorted_keys, sorted_values = zip(*sorted_dic_by_value)
#
if true_p_bar != "":
"""
Special case to draw in:
- green -> TP: True Positives (object detected and matches ground-truth)
- red -> FP: False Positives (object detected but does not match ground-truth)
- orange -> FN: False Negatives (object not detected but present in the ground-truth)
"""
fp_sorted = []
tp_sorted = []
for key in sorted_keys:
fp_sorted.append(dictionary[key] - true_p_bar[key])
tp_sorted.append(true_p_bar[key])
plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive')
plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive',
left=fp_sorted)
# add legend
plt.legend(loc='lower right')
"""
Write number on side of bar
"""
fig = plt.gcf() # gcf - get current figure
axes = plt.gca()
r = fig.canvas.get_renderer()
for i, val in enumerate(sorted_values):
fp_val = fp_sorted[i]
tp_val = tp_sorted[i]
fp_str_val = " " + str(fp_val)
tp_str_val = fp_str_val + " " + str(tp_val)
# trick to paint multicolor with offset:
# first paint everything and then repaint the first number
t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold')
plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold')
if i == (len(sorted_values) - 1): # largest bar
adjust_axes(r, t, fig, axes)
else:
plt.barh(range(n_classes), sorted_values, color=plot_color)
"""
Write number on side of bar
"""
fig = plt.gcf() # gcf - get current figure
axes = plt.gca()
r = fig.canvas.get_renderer()
for i, val in enumerate(sorted_values):
str_val = " " + str(val) # add a space before
if val < 1.0:
str_val = " {0:.2f}".format(val)
t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold')
# re-set axes to show number inside the figure
if i == (len(sorted_values) - 1): # largest bar
adjust_axes(r, t, fig, axes)
# set window title
fig.canvas.set_window_title(window_title)
# write classes in y axis
tick_font_size = 12
plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size)
"""
Re-scale height accordingly
"""
init_height = fig.get_figheight()
# compute the matrix height in points and inches
dpi = fig.dpi
height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing)
height_in = height_pt / dpi
# compute the required figure height
top_margin = 0.15 # in percentage of the figure height
bottom_margin = 0.05 # in percentage of the figure height
figure_height = height_in / (1 - top_margin - bottom_margin)
# set new height
if figure_height > init_height:
fig.set_figheight(figure_height)
# set plot title
plt.title(plot_title, fontsize=14)
# set axis titles
# plt.xlabel('classes')
plt.xlabel(x_label, fontsize='large')
# adjust size of window
fig.tight_layout()
# save the plot
fig.savefig(output_path)
# show image
if to_show:
plt.show()
# close the plot
plt.close()
def get_map(MINOVERLAP, draw_plot, score_threhold=0.5, path='./map_out'):
GT_PATH = os.path.join(path, 'ground-truth')
DR_PATH = os.path.join(path, 'detection-results')
IMG_PATH = os.path.join(path, 'images-optional')
TEMP_FILES_PATH = os.path.join(path, '.temp_files')
RESULTS_FILES_PATH = os.path.join(path, 'results')
show_animation = True
if os.path.exists(IMG_PATH):
for dirpath, dirnames, files in os.walk(IMG_PATH):
if not files:
show_animation = False
else:
show_animation = False
if not os.path.exists(TEMP_FILES_PATH):
os.makedirs(TEMP_FILES_PATH)
if os.path.exists(RESULTS_FILES_PATH):
shutil.rmtree(RESULTS_FILES_PATH)
else:
os.makedirs(RESULTS_FILES_PATH)
if draw_plot:
try:
matplotlib.use('TkAgg')
except:
pass
os.makedirs(os.path.join(RESULTS_FILES_PATH, "AP"))
os.makedirs(os.path.join(RESULTS_FILES_PATH, "F1"))
os.makedirs(os.path.join(RESULTS_FILES_PATH, "Recall"))
os.makedirs(os.path.join(RESULTS_FILES_PATH, "Precision"))
if show_animation:
os.makedirs(os.path.join(RESULTS_FILES_PATH, "images", "detections_one_by_one"))
ground_truth_files_list = glob.glob(GT_PATH + '/*.txt')
if len(ground_truth_files_list) == 0:
error("Error: No ground-truth files found!")
ground_truth_files_list.sort()
gt_counter_per_class = {}
counter_images_per_class = {}
for txt_file in ground_truth_files_list:
file_id = txt_file.split(".txt", 1)[0]
file_id = os.path.basename(os.path.normpath(file_id))
temp_path = os.path.join(DR_PATH, (file_id + ".txt"))
if not os.path.exists(temp_path):
error_msg = "Error. File not found: {}\n".format(temp_path)
error(error_msg)
lines_list = file_lines_to_list(txt_file)
bounding_boxes = []
is_difficult = False
already_seen_classes = []
for line in lines_list:
try:
if "difficult" in line:
class_name, left, top, right, bottom, _difficult = line.split()
is_difficult = True
else:
class_name, left, top, right, bottom = line.split()
except:
if "difficult" in line:
line_split = line.split()
_difficult = line_split[-1]
bottom = line_split[-2]
right = line_split[-3]
top = line_split[-4]
left = line_split[-5]
class_name = ""
for name in line_split[:-5]:
class_name += name + " "
class_name = class_name[:-1]
is_difficult = True
else:
line_split = line.split()
bottom = line_split[-1]
right = line_split[-2]
top = line_split[-3]
left = line_split[-4]
class_name = ""
for name in line_split[:-4]:
class_name += name + " "
class_name = class_name[:-1]
bbox = left + " " + top + " " + right + " " + bottom
if is_difficult:
bounding_boxes.append({"class_name": class_name, "bbox": bbox, "used": False, "difficult": True})
is_difficult = False
else:
bounding_boxes.append({"class_name": class_name, "bbox": bbox, "used": False})
if class_name in gt_counter_per_class:
gt_counter_per_class[class_name] += 1
else:
gt_counter_per_class[class_name] = 1
if class_name not in already_seen_classes:
if class_name in counter_images_per_class:
counter_images_per_class[class_name] += 1
else:
counter_images_per_class[class_name] = 1
already_seen_classes.append(class_name)
with open(TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json", 'w') as outfile:
json.dump(bounding_boxes, outfile)
gt_classes = list(gt_counter_per_class.keys())
gt_classes = sorted(gt_classes)
n_classes = len(gt_classes)
dr_files_list = glob.glob(DR_PATH + '/*.txt')
dr_files_list.sort()
for class_index, class_name in enumerate(gt_classes):
bounding_boxes = []
for txt_file in dr_files_list:
file_id = txt_file.split(".txt", 1)[0]
file_id = os.path.basename(os.path.normpath(file_id))
temp_path = os.path.join(GT_PATH, (file_id + ".txt"))
if class_index == 0:
if not os.path.exists(temp_path):
error_msg = "Error. File not found: {}\n".format(temp_path)
error(error_msg)
lines = file_lines_to_list(txt_file)
for line in lines:
try:
tmp_class_name, confidence, left, top, right, bottom = line.split()
except:
line_split = line.split()
bottom = line_split[-1]
right = line_split[-2]
top = line_split[-3]
left = line_split[-4]
confidence = line_split[-5]
tmp_class_name = ""
for name in line_split[:-5]:
tmp_class_name += name + " "
tmp_class_name = tmp_class_name[:-1]
if tmp_class_name == class_name:
bbox = left + " " + top + " " + right + " " + bottom
bounding_boxes.append({"confidence": confidence, "file_id": file_id, "bbox": bbox})
bounding_boxes.sort(key=lambda x: float(x['confidence']), reverse=True)
with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile:
json.dump(bounding_boxes, outfile)
sum_AP = 0.0
ap_dictionary = {}
lamr_dictionary = {}
with open(RESULTS_FILES_PATH + "/results.txt", 'w') as results_file:
results_file.write("# AP and precision/recall per class\n")
count_true_positives = {}
for class_index, class_name in enumerate(gt_classes):
count_true_positives[class_name] = 0
dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json"
dr_data = json.load(open(dr_file))
nd = len(dr_data)
tp = [0] * nd
fp = [0] * nd
score = [0] * nd
score_threhold_idx = 0
for idx, detection in enumerate(dr_data):
file_id = detection["file_id"]
score[idx] = float(detection["confidence"])
if score[idx] >= score_threhold:
score_threhold_idx = idx
if show_animation:
ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*")
if len(ground_truth_img) == 0:
error("Error. Image not found with id: " + file_id)
elif len(ground_truth_img) > 1:
error("Error. Multiple image with id: " + file_id)
else:
img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0])
img_cumulative_path = RESULTS_FILES_PATH + "/images/" + ground_truth_img[0]
if os.path.isfile(img_cumulative_path):
img_cumulative = cv2.imread(img_cumulative_path)
else:
img_cumulative = img.copy()
bottom_border = 60
BLACK = [0, 0, 0]
img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK)
gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
ground_truth_data = json.load(open(gt_file))
ovmax = -1
gt_match = -1
bb = [float(x) for x in detection["bbox"].split()]
for obj in ground_truth_data:
if obj["class_name"] == class_name:
bbgt = [float(x) for x in obj["bbox"].split()]
bi = [max(bb[0], bbgt[0]), max(bb[1], bbgt[1]), min(bb[2], bbgt[2]), min(bb[3], bbgt[3])]
iw = bi[2] - bi[0] + 1
ih = bi[3] - bi[1] + 1
if iw > 0 and ih > 0:
ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0]
+ 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih
ov = iw * ih / ua
if ov > ovmax:
ovmax = ov
gt_match = obj
if show_animation:
status = "NO MATCH FOUND!"
min_overlap = MINOVERLAP
if ovmax >= min_overlap:
if "difficult" not in gt_match:
if not bool(gt_match["used"]):
tp[idx] = 1
gt_match["used"] = True
count_true_positives[class_name] += 1
with open(gt_file, 'w') as f:
f.write(json.dumps(ground_truth_data))
if show_animation:
status = "MATCH!"
else:
fp[idx] = 1
if show_animation:
status = "REPEATED MATCH!"
else:
fp[idx] = 1
if ovmax > 0:
status = "INSUFFICIENT OVERLAP"
"""
Draw image to show animation
"""
if show_animation:
height, width = img.shape[:2]
white = (255, 255, 255)
light_blue = (255, 200, 100)
green = (0, 255, 0)
light_red = (30, 30, 255)
margin = 10
# 1st line
v_pos = int(height - margin - (bottom_border / 2.0))
text = "Image: " + ground_truth_img[0] + " "
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " "
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue,
line_width)
if ovmax != -1:
color = light_red
if status == "INSUFFICIENT OVERLAP":
text = "IoU: {0:.2f}% ".format(ovmax * 100) + "< {0:.2f}% ".format(min_overlap * 100)
else:
text = "IoU: {0:.2f}% ".format(ovmax * 100) + ">= {0:.2f}% ".format(min_overlap * 100)
color = green
img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
# 2nd line
v_pos += int(bottom_border / 2.0)
rank_pos = str(idx + 1)
text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(
float(detection["confidence"]) * 100)
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
color = light_red
if status == "MATCH!":
color = green
text = "Result: " + status + " "
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
font = cv2.FONT_HERSHEY_SIMPLEX
if ovmax > 0:
bbgt = [int(round(float(x))) for x in gt_match["bbox"].split()]
cv2.rectangle(img, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), light_blue, 2)
cv2.rectangle(img_cumulative, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), light_blue, 2)
cv2.putText(img_cumulative, class_name, (bbgt[0], bbgt[1] - 5), font, 0.6, light_blue, 1,
cv2.LINE_AA)
bb = [int(i) for i in bb]
cv2.rectangle(img, (bb[0], bb[1]), (bb[2], bb[3]), color, 2)
cv2.rectangle(img_cumulative, (bb[0], bb[1]), (bb[2], bb[3]), color, 2)
cv2.putText(img_cumulative, class_name, (bb[0], bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA)
cv2.imshow("Animation", img)
cv2.waitKey(20)
output_img_path = RESULTS_FILES_PATH + "/images/detections_one_by_one/" + class_name + "_detection" + str(
idx) + ".jpg"
cv2.imwrite(output_img_path, img)
cv2.imwrite(img_cumulative_path, img_cumulative)
cumsum = 0
for idx, val in enumerate(fp):
fp[idx] += cumsum
cumsum += val
cumsum = 0
for idx, val in enumerate(tp):
tp[idx] += cumsum
cumsum += val
rec = tp[:]
for idx, val in enumerate(tp):
rec[idx] = float(tp[idx]) / np.maximum(gt_counter_per_class[class_name], 1)
prec = tp[:]
for idx, val in enumerate(tp):
prec[idx] = float(tp[idx]) / np.maximum((fp[idx] + tp[idx]), 1)
ap, mrec, mprec = voc_ap(rec[:], prec[:])
F1 = np.array(rec) * np.array(prec) * 2 / np.where((np.array(prec) + np.array(rec)) == 0, 1,
(np.array(prec) + np.array(rec)))
sum_AP += ap
text = "{0:.2f}%".format(
ap * 100) + " = " + class_name + " AP " # class_name + " AP = {0:.2f}%".format(ap*100)
if len(prec) > 0:
F1_text = "{0:.2f}".format(F1[score_threhold_idx]) + " = " + class_name + " F1 "
Recall_text = "{0:.2f}%".format(rec[score_threhold_idx] * 100) + " = " + class_name + " Recall "
Precision_text = "{0:.2f}%".format(prec[score_threhold_idx] * 100) + " = " + class_name + " Precision "
else:
F1_text = "0.00" + " = " + class_name + " F1 "
Recall_text = "0.00%" + " = " + class_name + " Recall "
Precision_text = "0.00%" + " = " + class_name + " Precision "
rounded_prec = ['%.2f' % elem for elem in prec]
rounded_rec = ['%.2f' % elem for elem in rec]
results_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n")
if len(prec) > 0:
print(text + "\t||\tscore_threhold=" + str(score_threhold) + " : " + "F1=" + "{0:.2f}".format(
F1[score_threhold_idx]) \
+ " ; Recall=" + "{0:.2f}%".format(
rec[score_threhold_idx] * 100) + " ; Precision=" + "{0:.2f}%".format(
prec[score_threhold_idx] * 100))
else:
print(text + "\t||\tscore_threhold=" + str(
score_threhold) + " : " + "F1=0.00% ; Recall=0.00% ; Precision=0.00%")
ap_dictionary[class_name] = ap
n_images = counter_images_per_class[class_name]
lamr, mr, fppi = log_average_miss_rate(np.array(rec), np.array(fp), n_images)
lamr_dictionary[class_name] = lamr
if draw_plot:
plt.plot(rec, prec, '-o')
area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]]
area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]]
plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r')
fig = plt.gcf()
fig.canvas.set_window_title('AP ' + class_name)
plt.title('class: ' + text)
plt.xlabel('Recall')
plt.ylabel('Precision')
axes = plt.gca()
axes.set_xlim([0.0, 1.0])
axes.set_ylim([0.0, 1.05])
fig.savefig(RESULTS_FILES_PATH + "/AP/" + class_name + ".png")
plt.cla()
plt.plot(score, F1, "-", color='orangered')
plt.title('class: ' + F1_text + "\nscore_threhold=" + str(score_threhold))
plt.xlabel('Score_Threhold')
plt.ylabel('F1')
axes = plt.gca()
axes.set_xlim([0.0, 1.0])
axes.set_ylim([0.0, 1.05])
fig.savefig(RESULTS_FILES_PATH + "/F1/" + class_name + ".png")
plt.cla()
plt.plot(score, rec, "-H", color='gold')
plt.title('class: ' + Recall_text + "\nscore_threhold=" + str(score_threhold))
plt.xlabel('Score_Threhold')
plt.ylabel('Recall')
axes = plt.gca()
axes.set_xlim([0.0, 1.0])
axes.set_ylim([0.0, 1.05])
fig.savefig(RESULTS_FILES_PATH + "/Recall/" + class_name + ".png")
plt.cla()
plt.plot(score, prec, "-s", color='palevioletred')
plt.title('class: ' + Precision_text + "\nscore_threhold=" + str(score_threhold))
plt.xlabel('Score_Threhold')
plt.ylabel('Precision')
axes = plt.gca()
axes.set_xlim([0.0, 1.0])
axes.set_ylim([0.0, 1.05])
fig.savefig(RESULTS_FILES_PATH + "/Precision/" + class_name + ".png")
plt.cla()
if show_animation:
cv2.destroyAllWindows()
if n_classes == 0:
print("No classes were detected. Please check the label files and make sure classes_path in get_map.py has been modified.")
return 0
results_file.write("\n# mAP of all classes\n")
mAP = sum_AP / n_classes
text = "mAP = {0:.2f}%".format(mAP * 100)
results_file.write(text + "\n")
print(text)
shutil.rmtree(TEMP_FILES_PATH)
"""
Count total of detection-results
"""
det_counter_per_class = {}
for txt_file in dr_files_list:
lines_list = file_lines_to_list(txt_file)
for line in lines_list:
class_name = line.split()[0]
if class_name in det_counter_per_class:
det_counter_per_class[class_name] += 1
else:
det_counter_per_class[class_name] = 1
dr_classes = list(det_counter_per_class.keys())
"""
Write number of ground-truth objects per class to results.txt
"""
with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file:
results_file.write("\n# Number of ground-truth objects per class\n")
for class_name in sorted(gt_counter_per_class):
results_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n")
"""
Finish counting true positives
"""
for class_name in dr_classes:
if class_name not in gt_classes:
count_true_positives[class_name] = 0
"""
Write number of detected objects per class to results.txt
"""
with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file:
results_file.write("\n# Number of detected objects per class\n")
for class_name in sorted(dr_classes):
n_det = det_counter_per_class[class_name]
text = class_name + ": " + str(n_det)
text += " (tp:" + str(count_true_positives[class_name]) + ""
text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n"
results_file.write(text)
"""
Plot the total number of occurrences of each class in the ground-truth
"""
if draw_plot:
window_title = "ground-truth-info"
plot_title = "ground-truth\n"
plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)"
x_label = "Number of objects per class"
output_path = RESULTS_FILES_PATH + "/ground-truth-info.png"
to_show = False
plot_color = 'forestgreen'
draw_plot_func(
gt_counter_per_class,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
'',
)
# """
# Plot the total number of occurences of each class in the "detection-results" folder
# """
# if draw_plot:
# window_title = "detection-results-info"
# # Plot title
# plot_title = "detection-results\n"
# plot_title += "(" + str(len(dr_files_list)) + " files and "
# count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values()))
# plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)"
# # end Plot title
# x_label = "Number of objects per class"
# output_path = RESULTS_FILES_PATH + "/detection-results-info.png"
# to_show = False
# plot_color = 'forestgreen'
# true_p_bar = count_true_positives
# draw_plot_func(
# det_counter_per_class,
# len(det_counter_per_class),
# window_title,
# plot_title,
# x_label,
# output_path,
# to_show,
# plot_color,
# true_p_bar
# )
"""
Draw log-average miss rate plot (Show lamr of all classes in decreasing order)
"""
if draw_plot:
window_title = "lamr"
plot_title = "log-average miss rate"
x_label = "log-average miss rate"
output_path = RESULTS_FILES_PATH + "/lamr.png"
to_show = False
plot_color = 'royalblue'
draw_plot_func(
lamr_dictionary,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
""
)
"""
Draw mAP plot (Show AP's of all classes in decreasing order)
"""
if draw_plot:
window_title = "mAP"
plot_title = "mAP = {0:.2f}%".format(mAP * 100)
x_label = "Average Precision"
output_path = RESULTS_FILES_PATH + "/mAP.png"
to_show = True
plot_color = 'royalblue'
draw_plot_func(
ap_dictionary,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
""
)
return mAP
def preprocess_gt(gt_path, class_names):
image_ids = os.listdir(gt_path)
results = {}
images = []
bboxes = []
for i, image_id in enumerate(image_ids):
lines_list = file_lines_to_list(os.path.join(gt_path, image_id))
boxes_per_image = []
image = {}
image_id = os.path.splitext(image_id)[0]
image['file_name'] = image_id + '.jpg'
image['width'] = 1
image['height'] = 1
# -----------------------------------------------------------------#
#   Thanks to the user 多学学英语吧 for the reminder;
#   this resolves the 'Results do not correspond to current coco set' issue
# -----------------------------------------------------------------#
image['id'] = str(image_id)
for line in lines_list:
difficult = 0
if "difficult" in line:
line_split = line.split()
left, top, right, bottom, _difficult = line_split[-5:]
class_name = ""
for name in line_split[:-5]:
class_name += name + " "
class_name = class_name[:-1]
difficult = 1
else:
line_split = line.split()
left, top, right, bottom = line_split[-4:]
class_name = ""
for name in line_split[:-4]:
class_name += name + " "
class_name = class_name[:-1]
left, top, right, bottom = float(left), float(top), float(right), float(bottom)
if class_name not in class_names:
continue
cls_id = class_names.index(class_name) + 1
bbox = [left, top, right - left, bottom - top, difficult, str(image_id), cls_id,
(right - left) * (bottom - top) - 10.0]
boxes_per_image.append(bbox)
images.append(image)
bboxes.extend(boxes_per_image)
results['images'] = images
categories = []
for i, cls in enumerate(class_names):
category = {}
category['supercategory'] = cls
category['name'] = cls
category['id'] = i + 1
categories.append(category)
results['categories'] = categories
annotations = []
for i, box in enumerate(bboxes):
annotation = {}
annotation['area'] = box[-1]
annotation['category_id'] = box[-2]
annotation['image_id'] = box[-3]
annotation['iscrowd'] = box[-4]
annotation['bbox'] = box[:4]
annotation['id'] = i
annotations.append(annotation)
results['annotations'] = annotations
return results
def preprocess_dr(dr_path, class_names):
image_ids = os.listdir(dr_path)
results = []
for image_id in image_ids:
lines_list = file_lines_to_list(os.path.join(dr_path, image_id))
image_id = os.path.splitext(image_id)[0]
for line in lines_list:
line_split = line.split()
confidence, left, top, right, bottom = line_split[-5:]
class_name = ""
for name in line_split[:-5]:
class_name += name + " "
class_name = class_name[:-1]
left, top, right, bottom = float(left), float(top), float(right), float(bottom)
result = {}
result["image_id"] = str(image_id)
if class_name not in class_names:
continue
result["category_id"] = class_names.index(class_name) + 1
result["bbox"] = [left, top, right - left, bottom - top]
result["score"] = float(confidence)
results.append(result)
return results
def get_coco_map(class_names, path):
GT_PATH = os.path.join(path, 'ground-truth')
DR_PATH = os.path.join(path, 'detection-results')
COCO_PATH = os.path.join(path, 'coco_eval')
if not os.path.exists(COCO_PATH):
os.makedirs(COCO_PATH)
GT_JSON_PATH = os.path.join(COCO_PATH, 'instances_gt.json')
DR_JSON_PATH = os.path.join(COCO_PATH, 'instances_dr.json')
with open(GT_JSON_PATH, "w") as f:
results_gt = preprocess_gt(GT_PATH, class_names)
json.dump(results_gt, f, indent=4)
with open(DR_JSON_PATH, "w") as f:
results_dr = preprocess_dr(DR_PATH, class_names)
json.dump(results_dr, f, indent=4)
if len(results_dr) == 0:
print("No objects were detected.")
return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
cocoGt = COCO(GT_JSON_PATH)
cocoDt = cocoGt.loadRes(DR_JSON_PATH)
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
return cocoEval.stats

117
utils_coco/coco_annotation.py Normal file

@ -0,0 +1,117 @@
# -------------------------------------------------------#
#   Process the COCO dataset: generate the txt files used for training from the json annotations
# -------------------------------------------------------#
import json
import os
from collections import defaultdict
# -------------------------------------------------------#
#   Paths to the COCO training and validation images
# -------------------------------------------------------#
train_datasets_path = "coco_dataset/train2017"
val_datasets_path = "coco_dataset/val2017"
# -------------------------------------------------------#
#   Paths to the COCO training and validation annotation files
# -------------------------------------------------------#
train_annotation_path = "coco_dataset/annotations/instances_train2017.json"
val_annotation_path = "coco_dataset/annotations/instances_val2017.json"
# -------------------------------------------------------#
#   Paths of the generated txt files
# -------------------------------------------------------#
train_output_path = "coco_train.txt"
val_output_path = "coco_val.txt"
if __name__ == "__main__":
name_box_id = defaultdict(list)
id_name = dict()
f = open(train_annotation_path, encoding='utf-8')
data = json.load(f)
annotations = data['annotations']
for ant in annotations:
id = ant['image_id']
name = os.path.join(train_datasets_path, '%012d.jpg' % id)
cat = ant['category_id']
if cat >= 1 and cat <= 11:
cat = cat - 1
elif cat >= 13 and cat <= 25:
cat = cat - 2
elif cat >= 27 and cat <= 28:
cat = cat - 3
elif cat >= 31 and cat <= 44:
cat = cat - 5
elif cat >= 46 and cat <= 65:
cat = cat - 6
elif cat == 67:
cat = cat - 7
elif cat == 70:
cat = cat - 9
elif cat >= 72 and cat <= 82:
cat = cat - 10
elif cat >= 84 and cat <= 90:
cat = cat - 11
name_box_id[name].append([ant['bbox'], cat])
f = open(train_output_path, 'w')
for key in name_box_id.keys():
f.write(key)
box_infos = name_box_id[key]
for info in box_infos:
x_min = int(info[0][0])
y_min = int(info[0][1])
x_max = x_min + int(info[0][2])
y_max = y_min + int(info[0][3])
box_info = " %d,%d,%d,%d,%d" % (
x_min, y_min, x_max, y_max, int(info[1]))
f.write(box_info)
f.write('\n')
f.close()
name_box_id = defaultdict(list)
id_name = dict()
f = open(val_annotation_path, encoding='utf-8')
data = json.load(f)
annotations = data['annotations']
for ant in annotations:
id = ant['image_id']
name = os.path.join(val_datasets_path, '%012d.jpg' % id)
cat = ant['category_id']
if cat >= 1 and cat <= 11:
cat = cat - 1
elif cat >= 13 and cat <= 25:
cat = cat - 2
elif cat >= 27 and cat <= 28:
cat = cat - 3
elif cat >= 31 and cat <= 44:
cat = cat - 5
elif cat >= 46 and cat <= 65:
cat = cat - 6
elif cat == 67:
cat = cat - 7
elif cat == 70:
cat = cat - 9
elif cat >= 72 and cat <= 82:
cat = cat - 10
elif cat >= 84 and cat <= 90:
cat = cat - 11
name_box_id[name].append([ant['bbox'], cat])
f = open(val_output_path, 'w')
for key in name_box_id.keys():
f.write(key)
box_infos = name_box_id[key]
for info in box_infos:
x_min = int(info[0][0])
y_min = int(info[0][1])
x_max = x_min + int(info[0][2])
y_max = y_min + int(info[0][3])
box_info = " %d,%d,%d,%d,%d" % (
x_min, y_min, x_max, y_max, int(info[1]))
f.write(box_info)
f.write('\n')
f.close()
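# Illustrative output format (made-up numbers, not real annotations): each line of
# coco_train.txt / coco_val.txt is the image path followed by space-separated boxes,
#   coco_dataset/train2017/000000000009.jpg 1,187,313,470,45 7,211,185,339,50
# where each box is x_min,y_min,x_max,y_max,class_id.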

116
utils_coco/get_map_coco.py Normal file

@ -0,0 +1,116 @@
import json
import os
import numpy as np
import torch
from PIL import Image
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from tqdm import tqdm
from utils.utils import cvtColor, preprocess_input, resize_image
from yolo import YOLO
# ---------------------------------------------------------------------------#
#   map_mode controls what this script computes when it runs
#   map_mode = 0: the whole mAP pipeline, i.e. obtain the predictions and compute the mAP.
#   map_mode = 1: only obtain the predictions.
#   map_mode = 2: only compute the mAP.
# ---------------------------------------------------------------------------#
map_mode = 0
# -------------------------------------------------------#
#   Paths to the validation annotations and images
# -------------------------------------------------------#
cocoGt_path = 'coco_dataset/annotations/instances_val2017.json'
dataset_img_path = 'coco_dataset/val2017'
# -------------------------------------------------------#
#   Output folder for the results, map_out by default
# -------------------------------------------------------#
temp_save_path = 'map_out/coco_eval'
class mAP_YOLO(YOLO):
# ---------------------------------------------------#
#   Detect objects in an image
# ---------------------------------------------------#
def detect_image(self, image_id, image, results):
# ---------------------------------------------------#
#   Compute the height and width of the input image
# ---------------------------------------------------#
image_shape = np.array(np.shape(image)[0:2])
# ---------------------------------------------------------#
#   Convert the image to RGB here to avoid errors when predicting on grayscale images.
#   The code only supports prediction on RGB images; every other type is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
#   Add gray padding bars to resize the image without distortion.
#   A plain resize can also be used for detection.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
#   Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
#   Feed the image into the network for prediction!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
#   Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
outputs = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
if outputs[0] is None:
return results
top_label = np.array(outputs[0][:, 6], dtype='int32')
top_conf = outputs[0][:, 4] * outputs[0][:, 5]
top_boxes = outputs[0][:, :4]
for i, c in enumerate(top_label):
result = {}
top, left, bottom, right = top_boxes[i]
result["image_id"] = int(image_id)
result["category_id"] = clsid2catid[c]
result["bbox"] = [float(left), float(top), float(right - left), float(bottom - top)]
result["score"] = float(top_conf[i])
results.append(result)
return results
if __name__ == "__main__":
if not os.path.exists(temp_save_path):
os.makedirs(temp_save_path)
cocoGt = COCO(cocoGt_path)
ids = list(cocoGt.imgToAnns.keys())
clsid2catid = cocoGt.getCatIds()
if map_mode == 0 or map_mode == 1:
yolo = mAP_YOLO(confidence=0.001, nms_iou=0.65)
with open(os.path.join(temp_save_path, 'eval_results.json'), "w") as f:
results = []
for image_id in tqdm(ids):
image_path = os.path.join(dataset_img_path, cocoGt.loadImgs(image_id)[0]['file_name'])
image = Image.open(image_path)
results = yolo.detect_image(image_id, image, results)
json.dump(results, f)
if map_mode == 0 or map_mode == 2:
cocoDt = cocoGt.loadRes(os.path.join(temp_save_path, 'eval_results.json'))
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
print("Get map done.")

158
voc_annotation.py Normal file

@ -0,0 +1,158 @@
import os
import random
import xml.etree.ElementTree as ET
import numpy as np
from utils.utils import get_classes
# --------------------------------------------------------------------------------------------------------------------------------#
#   annotation_mode specifies what this script computes when it runs
#   annotation_mode = 0: the whole annotation pipeline, i.e. the txt files in VOCdevkit/VOC2007/ImageSets plus the 2007_train.txt and 2007_val.txt used for training
#   annotation_mode = 1: only the txt files in VOCdevkit/VOC2007/ImageSets
#   annotation_mode = 2: only the 2007_train.txt and 2007_val.txt used for training
# --------------------------------------------------------------------------------------------------------------------------------#
annotation_mode = 0
# -------------------------------------------------------------------#
#   Must be modified: target information used to generate 2007_train.txt and 2007_val.txt
#   It only has to match the classes_path used for training and prediction
#   If the generated 2007_train.txt contains no target information,
#   it is because the classes were not set correctly
#   Only effective when annotation_mode is 0 or 2
# -------------------------------------------------------------------#
classes_path = 'model_data/voc_classes.txt'  # the names defined here are the object names in the xml files; their order is the one-hot order used during training.
# --------------------------------------------------------------------------------------------------------------------------------#
#   trainval_percent sets the ratio of (train + val) to test; by default (train + val) : test = 9 : 1
#   train_percent sets the ratio of train to val inside (train + val); by default train : val = 9 : 1
#   Only effective when annotation_mode is 0 or 1
# --------------------------------------------------------------------------------------------------------------------------------#
trainval_percent = 0.9
train_percent = 0.9
# -------------------------------------------------------#
#   Path to the folder containing the VOC dataset
#   Defaults to the VOC dataset in the root directory
# -------------------------------------------------------#
VOCdevkit_path = 'VOCdevkit'
VOCdevkit_sets = [('2007', 'train'), ('2007', 'val')]
classes, _ = get_classes(classes_path)
# -------------------------------------------------------#
#   Count the number of targets
# -------------------------------------------------------#
photo_nums = np.zeros(len(VOCdevkit_sets))  # number of train images and number of val images
nums = np.zeros(len(classes))  # number of boxes per class
def convert_annotation(year, image_id, list_file):
in_file = open(os.path.join(VOCdevkit_path, 'VOC%s/Annotations/%s.xml' % (year, image_id)), encoding='utf-8') # 'VOCdevkit\\VOC2007/Annotations/000001.xml'
tree = ET.parse(in_file)
root = tree.getroot()
for obj in root.iter('object'):
difficult = 0
if obj.find('difficult') != None:
difficult = obj.find('difficult').text
cls = obj.find('name').text
if cls not in classes or int(difficult) == 1:  # skip objects whose class is not in classes or that are marked difficult
continue
cls_id = classes.index(cls)  # the class id is the index of the class in the classes file
xmlbox = obj.find('bndbox')
b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)),
int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text)))
list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
# each line of list_file starts with the full image path, then, separated by spaces, the comma-separated coordinates and id of every object
nums[classes.index(cls)] = nums[classes.index(cls)] + 1  # count the number of objects per class
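# Illustrative result (made-up path and boxes): after convert_annotation runs, a line
# of 2007_train.txt looks like
#   /home/user/VOCdevkit/VOC2007/JPEGImages/000001.jpg 48,240,195,371,11 8,12,352,498,14
# i.e. the absolute image path followed by x_min,y_min,x_max,y_max,class_id for every object.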
if __name__ == "__main__":
random.seed(0)
if " " in os.path.abspath(VOCdevkit_path):
raise ValueError("The dataset folder path and the image names must not contain spaces, otherwise training will not work properly. Please fix them.")
if annotation_mode == 0 or annotation_mode == 1:
print("Generate txt in ImageSets.")
xmlfilepath = os.path.join(VOCdevkit_path, 'VOC2007/Annotations')
saveBasePath = os.path.join(VOCdevkit_path, 'VOC2007/ImageSets/Main')
temp_xml = os.listdir(xmlfilepath)
total_xml = [xml for xml in temp_xml if xml.endswith(".xml")]
num = len(total_xml)  # total number of samples in the original dataset, used to split the dataset
list = range(num)
tv = int(num * trainval_percent)  # size of the train + val set
tr = int(tv * train_percent)  # size of the train set inside train + val
trainval = random.sample(list, tv)  # sample tv indices from the whole set
train = random.sample(trainval, tr)  # sample tr indices from trainval
print("train and val size", tv)
print("train size", tr)
ftrainval = open(os.path.join(saveBasePath, 'trainval.txt'), 'w')
ftest = open(os.path.join(saveBasePath, 'test.txt'), 'w')
ftrain = open(os.path.join(saveBasePath, 'train.txt'), 'w')
fval = open(os.path.join(saveBasePath, 'val.txt'), 'w')
for i in list:
name = total_xml[i][:-4] + '\n'  # file name without the extension
if i in trainval:
ftrainval.write(name)
if i in train:
ftrain.write(name)
else:
fval.write(name)
else:
ftest.write(name)
ftrainval.close()
ftrain.close()
fval.close()
ftest.close()
print("Generate txt in ImageSets done.")
if annotation_mode == 0 or annotation_mode == 2:
print("Generate 2007_train.txt and 2007_val.txt for train.")
type_index = 0
for year, image_set in VOCdevkit_sets:
image_ids = open(os.path.join(VOCdevkit_path, 'VOC%s/ImageSets/Main/%s.txt' % (year, image_set)), # 'VOCdevkit\\VOC2007/ImageSets/Main/train.txt'
encoding='utf-8').read().strip().split()
list_file = open('%s_%s.txt' % (year, image_set), 'w', encoding='utf-8') # '2007_train.txt'
for image_id in image_ids:
list_file.write( # 'C:\\my_code\\a_python\\YOLO_all\\yolo_v3\\VOCdevkit/VOC2007/JPEGImages/000001.jpg'
'%s/VOC%s/JPEGImages/%s.jpg' % (os.path.abspath(VOCdevkit_path), year, image_id))  # the full file path is assembled here
convert_annotation(year, image_id, list_file)
list_file.write('\n')
photo_nums[type_index] = len(image_ids)  # record the number of train and val images
type_index += 1  # marks whether the train or the val set is being processed
list_file.close()
print("Generate 2007_train.txt and 2007_val.txt for train done.")
def printTable(List1, List2):
# for i in range(len(List1[0])):
for i, _ in enumerate(List1[0]):
print("|", end=' ')
for j in range(len(List1)):  # len(List1) is 2
print(List1[j][i].rjust(int(List2[j])), end=' ')
print("|", end=' ')
print()
str_nums = [str(int(x)) for x in nums]  # number of objects per class
tableData = [
classes, str_nums  # classes paired with their counts
]
colWidths = [0] * len(tableData)  # column widths; there are len(tableData) columns, here 2
len1 = 0
for i in range(len(tableData)):
for j in range(len(tableData[i])):
if len(tableData[i][j]) > colWidths[i]:
colWidths[i] = len(tableData[i][j])  # colWidths[i] holds the longest element in column i
printTable(tableData, colWidths)
if photo_nums[0] <= 500:
print("The training set has fewer than 500 images, which is a rather small amount of data. Please set a larger number of training epochs so that there are enough gradient-descent steps.")
if np.sum(nums) == 0:
print("No targets were found in the dataset. Please make sure classes_path matches your own dataset and that the label names are correct, otherwise training will have no effect!")
print("No targets were found in the dataset. Please make sure classes_path matches your own dataset and that the label names are correct, otherwise training will have no effect!")
print("No targets were found in the dataset. Please make sure classes_path matches your own dataset and that the label names are correct, otherwise training will have no effect!")
print("(Important things are worth saying three times.)")

41
webcam.py Normal file

@ -0,0 +1,41 @@
import time
import cv2
import numpy as np
from PIL import Image
from yolo import YOLO
yolo = YOLO()
capture = cv2.VideoCapture(0)
# 1 selects an external camera, 0 the built-in camera
ref, frame = capture.read()
fps = 0.0
while (True):
t1 = time.time()
# read one frame
ref, frame = capture.read()
if not ref:
break
# convert the format from BGR to RGB
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# convert to a PIL Image
frame = Image.fromarray(np.uint8(frame))
# run detection
frame = np.array(yolo.detect_image(frame))
# convert RGB back to BGR to match OpenCV's display format
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
fps = (fps + (1. / (time.time() - t1))) / 2
# print("fps= %.2f" % (fps))
frame = cv2.putText(frame, "fps= %.2f" % (fps), (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow("video", frame)
c = cv2.waitKey(1) & 0xff
# print(c)
if c == 113:
capture.release()
break
capture.release()
cv2.destroyAllWindows()

425
yolo.py Normal file

@ -0,0 +1,425 @@
import colorsys
import os
import time
import numpy as np
import torch
import torch.nn as nn
from PIL import ImageDraw, ImageFont
from nets.yolo import YoloBody
from utils.utils import (cvtColor, get_anchors, get_classes, preprocess_input,
resize_image, show_config)
from utils.utils_bbox import DecodeBox
'''
Notes you must read before training on your own dataset
'''
class YOLO(object):
_defaults = {
# --------------------------------------------------------------------------#
#   To predict with your own trained model you must modify model_path and classes_path!
#   model_path points to the weight file under the logs folder, classes_path to the txt under model_data
#
#   After training there are several weight files in the logs folder; pick one with a low validation loss.
#   A low validation loss does not guarantee a high mAP, only that the weights generalize well on the validation set.
#   If a shape mismatch occurs, also check that model_path and classes_path were set correctly during training
# --------------------------------------------------------------------------#
# "model_path": 'model_data/yolo_weights.pth',
# "classes_path": 'model_data/coco_classes.txt',
"model_path": 'logs/best_epoch_weights.pth',
"classes_path": 'model_data/cctsdb_classes.txt',
# ---------------------------------------------------------------------#
#   anchors_path is the txt file with the anchor boxes; usually left unchanged.
#   anchors_mask helps the code find the corresponding anchors; usually left unchanged.
# ---------------------------------------------------------------------#
"anchors_path": 'model_data/yolo_anchors.txt',
"anchors_mask": [[6, 7, 8], [3, 4, 5], [0, 1, 2]],
# ---------------------------------------------------------------------#
#   Input image size; must be a multiple of 32.
# ---------------------------------------------------------------------#
"input_shape": [416, 416],
# ---------------------------------------------------------------------#
#   Only prediction boxes with a score above the confidence threshold are kept
# ---------------------------------------------------------------------#
"confidence": 0.5,
# ---------------------------------------------------------------------#
#   The nms_iou threshold used for non-maximum suppression
# ---------------------------------------------------------------------#
"nms_iou": 0.3,
# ---------------------------------------------------------------------#
#   Controls whether letterbox_image is used to resize the input image without distortion
#   Repeated tests showed that turning letterbox_image off and resizing directly works better
# ---------------------------------------------------------------------#
"letterbox_image": False,
# -------------------------------#
#   Whether to use CUDA
#   Set it to False if there is no GPU
# -------------------------------#
"cuda": True
}
@classmethod
def get_defaults(cls, n):
if n in cls._defaults:
return cls._defaults[n]
else:
return "Unrecognized attribute name '" + n + "'"
# ---------------------------------------------------#
#   Initialize YOLO
# ---------------------------------------------------#
def __init__(self, **kwargs):
self.__dict__.update(self._defaults)  # update the instance attribute dict with the class _defaults
for name, value in kwargs.items():
setattr(self, name, value)
# ---------------------------------------------------#
#   Get the classes and the number of anchors
# ---------------------------------------------------#
self.class_names, self.num_classes = get_classes(self.classes_path)
self.anchors, self.num_anchors = get_anchors(self.anchors_path)
self.bbox_util = DecodeBox(self.anchors, self.num_classes, (self.input_shape[0], self.input_shape[1]),
self.anchors_mask)
# ---------------------------------------------------#
#   Set a different color for each class when drawing boxes
# ---------------------------------------------------#
hsv_tuples = [(x / self.num_classes, 1., 1.) for x in range(self.num_classes)]
self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
self.colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), self.colors))
self.generate()
show_config(**self._defaults)
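# Hedged usage note: because __init__ first copies _defaults and then applies the
# keyword arguments, any default can be overridden per instance, e.g. (values are
# only illustrative):
#   yolo = YOLO(confidence=0.3, nms_iou=0.45, cuda=False)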
# ---------------------------------------------------#
#   Build the model
# ---------------------------------------------------#
def generate(self, onnx=False):
# ---------------------------------------------------#
#   Build the yolov3 model and load its weights
# ---------------------------------------------------#
self.net = YoloBody(self.anchors_mask, self.num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.net.load_state_dict(torch.load(self.model_path, map_location=device))
self.net = self.net.eval()
print('{} model, anchors, and classes loaded.'.format(self.model_path))
# if not onnx:
# if self.cuda:
# self.net = nn.DataParallel(self.net)
# self.net = self.net.cuda()
if not onnx:
if self.cuda:
self.net = self.net.cuda()
# ---------------------------------------------------#
# 检测图片
#   Detect objects in an image
def detect_image(self, image, crop=False, count=False):
image_shape = np.array(np.shape(image)[0:2])  # np.shape(image) is h, w, c
# ---------------------------------------------------------#
#   Convert the image to RGB here to avoid errors when predicting on grayscale images.
#   The code only supports prediction on RGB images; every other type is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
#   Add gray padding bars to resize the image without distortion.
#   A plain resize can also be used for detection.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
#   Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
# after this transform image_data has shape 1, 3, 416, 416
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
#   Feed the image into the network for prediction!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
#   Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
if results[0] is None:
return image
top_label = np.array(results[0][:, 6], dtype='int32')
top_conf = results[0][:, 4] * results[0][:, 5]
top_boxes = results[0][:, :4]
# ---------------------------------------------------------#
#   Set the font and the box line thickness
# ---------------------------------------------------------#
font = ImageFont.truetype(font='model_data/simhei.ttf',
size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
thickness = int(max((image.size[0] + image.size[1]) // np.mean(self.input_shape), 1))
# ---------------------------------------------------------#
#   Count detections per class
# ---------------------------------------------------------#
if count:
print("top_label:", top_label)
classes_nums = np.zeros([self.num_classes])
for i in range(self.num_classes):
num = np.sum(top_label == i)
if num > 0:
print(self.class_names[i], " : ", num)
classes_nums[i] = num
print("classes_nums:", classes_nums)
# ---------------------------------------------------------#
#   Whether to crop the detected targets
# ---------------------------------------------------------#
if crop:
for i, c in list(enumerate(top_label)):
top, left, bottom, right = top_boxes[i]
top = max(0, np.floor(top).astype('int32'))
left = max(0, np.floor(left).astype('int32'))
bottom = min(image.size[1], np.floor(bottom).astype('int32'))
right = min(image.size[0], np.floor(right).astype('int32'))
dir_save_path = "img_crop"
if not os.path.exists(dir_save_path):
os.makedirs(dir_save_path)
crop_image = image.crop([left, top, right, bottom])
crop_image.save(os.path.join(dir_save_path, "crop_" + str(i) + ".png"), quality=95, subsampling=0)
print("save crop_" + str(i) + ".png to " + dir_save_path)
# ---------------------------------------------------------#
#   Draw the results on the image
# ---------------------------------------------------------#
for i, c in list(enumerate(top_label)):
predicted_class = self.class_names[int(c)]
box = top_boxes[i]
score = top_conf[i]
top, left, bottom, right = box
top = max(0, np.floor(top).astype('int32'))
left = max(0, np.floor(left).astype('int32'))
bottom = min(image.size[1], np.floor(bottom).astype('int32'))
right = min(image.size[0], np.floor(right).astype('int32'))
label = '{} {:.2f}'.format(predicted_class, score)
draw = ImageDraw.Draw(image)
label_size = draw.textsize(label, font)
label = label.encode('utf-8')
# print(label, top, left, bottom, right)
if top - label_size[1] >= 0:  # there is room above the box, so put the label on top
text_origin = np.array([left, top - label_size[1]])
else:  # otherwise put the label inside the box
text_origin = np.array([left, top + 1])
for i in range(thickness):  # line thickness is achieved by drawing the rectangle several times
draw.rectangle([left + i, top + i, right - i, bottom - i], outline=self.colors[c])
draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=self.colors[c])
draw.text(text_origin, str(label, 'UTF-8'), fill=(0, 0, 0), font=font)
del draw
return image
def get_FPS(self, image, test_interval):
image_shape = np.array(np.shape(image)[0:2])
# ---------------------------------------------------------#
#   Convert the image to RGB here to avoid errors when predicting on grayscale images.
#   The code only supports prediction on RGB images; every other type is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
#   Add gray padding bars to resize the image without distortion.
#   A plain resize can also be used for detection.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
#   Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
#   Feed the image into the network for prediction!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
#   Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
t1 = time.time()
for _ in range(test_interval):
with torch.no_grad():
# ---------------------------------------------------------#
#   Feed the image into the network for prediction!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
#   Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image,
conf_thres=self.confidence, nms_thres=self.nms_iou)
t2 = time.time()
tact_time = (t2 - t1) / test_interval
return tact_time
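# ---------------------------------------------------------#
# Usage sketch for get_FPS (a minimal sketch; the class/instance names
# and the image path are assumptions):
#     yolo = YOLO()
#     image = Image.open("img/street.jpg")
#     tact_time = yolo.get_FPS(image, test_interval=100)
#     print(str(tact_time) + ' seconds per image, ' + str(1 / tact_time) + ' FPS @batch_size 1')
# ---------------------------------------------------------#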
def detect_heatmap(self, image, heatmap_save_path):
import cv2
import matplotlib.pyplot as plt
def sigmoid(x):
y = 1.0 / (1.0 + np.exp(-x))
return y
# ---------------------------------------------------------#
# Convert the image to RGB here to avoid errors on grayscale inputs.
# The code only supports prediction on RGB images; any other format is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
# Add gray bars to the image so the resize does not distort it.
# A plain resize can also be used for detection instead.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
# Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
# Feed the image into the network for prediction
# ---------------------------------------------------------#
outputs = self.net(images)
plt.imshow(image, alpha=1)
plt.axis('off')
mask = np.zeros((image.size[1], image.size[0]))
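# for each output scale: reshape to (h, w, anchors, channels), apply a sigmoid to the
# objectness channel, keep the maximum over the anchors, resize to the input image size,
# and merge the scales into the mask with a per-pixel maximum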
for sub_output in outputs:
sub_output = sub_output.cpu().numpy()
b, c, h, w = np.shape(sub_output)
sub_output = np.transpose(np.reshape(sub_output, [b, 3, -1, h, w]), [0, 3, 4, 1, 2])[0]
score = np.max(sigmoid(sub_output[..., 4]), -1)
score = cv2.resize(score, (image.size[0], image.size[1]))
normed_score = (score * 255).astype('uint8')
mask = np.maximum(mask, normed_score)
plt.imshow(mask, alpha=0.5, interpolation='nearest', cmap="jet")
plt.axis('off')
plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
plt.margins(0, 0)
plt.savefig(heatmap_save_path, dpi=200, bbox_inches='tight', pad_inches=-0.1)
print("Save to the " + heatmap_save_path)
plt.show()
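# ---------------------------------------------------------#
# Usage sketch for detect_heatmap (a minimal sketch; the class/instance
# names, image path and output path are assumptions):
#     yolo = YOLO()
#     image = Image.open("img/street.jpg")
#     yolo.detect_heatmap(image, "model_data/heatmap_vision.png")
# ---------------------------------------------------------#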
def convert_to_onnx(self, simplify, model_path):
import onnx
self.generate(onnx=True)
im = torch.zeros(1, 3, *self.input_shape).to('cpu') # dummy input in BCHW order, shape (1, 3, H, W) taken from self.input_shape
input_layer_names = ["images"]
output_layer_names = ["output"]
# Export the model
print(f'Starting export with onnx {onnx.__version__}.')
torch.onnx.export(self.net,
im,
f=model_path,
verbose=False,
opset_version=12,
training=torch.onnx.TrainingMode.EVAL,
do_constant_folding=True,
input_names=input_layer_names,
output_names=output_layer_names,
dynamic_axes=None)
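# dynamic_axes=None exports a fixed input shape; a mapping such as
# {"images": {0: "batch"}} could be passed instead to allow a variable batch size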
# Checks
model_onnx = onnx.load(model_path) # load onnx model
onnx.checker.check_model(model_onnx) # check onnx model
# Simplify onnx
if simplify:
import onnxsim
print(f'Simplifying with onnx-simplifier {onnxsim.__version__}.')
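# note: the dynamic_input_shape / input_shapes keyword arguments below assume an
# older onnx-simplifier (0.3.x); newer releases drop them and accept only the model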
model_onnx, check = onnxsim.simplify(
model_onnx,
dynamic_input_shape=False,
input_shapes=None)
assert check, 'onnx-simplifier check failed'
onnx.save(model_onnx, model_path)
print('ONNX model saved as {}'.format(model_path))
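# ---------------------------------------------------------#
# Usage sketch for convert_to_onnx (a minimal sketch; the class/instance
# names and the output path are assumptions):
#     yolo = YOLO()
#     yolo.convert_to_onnx(simplify=True, model_path="model_data/models.onnx")
# ---------------------------------------------------------#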
def get_map_txt(self, image_id, image, class_names, map_out_path):
f = open(os.path.join(map_out_path, "detection-results/" + image_id + ".txt"), "w")
image_shape = np.array(np.shape(image)[0:2])
# ---------------------------------------------------------#
# Convert the image to RGB here to avoid errors on grayscale inputs.
# The code only supports prediction on RGB images; any other format is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
# Add gray bars to the image so the resize does not distort it.
# A plain resize can also be used for detection instead.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
# Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
# Feed the image into the network for prediction
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
# Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
if results[0] is None:
f.close() # close the results file before the early return so the handle is not leaked
return
top_label = np.array(results[0][:, 6], dtype='int32')
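# final score = objectness confidence * class probability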
top_conf = results[0][:, 4] * results[0][:, 5]
top_boxes = results[0][:, :4]
for i, c in list(enumerate(top_label)):
predicted_class = self.class_names[int(c)]
box = top_boxes[i]
score = str(top_conf[i])
top, left, bottom, right = box
if predicted_class not in class_names:
continue
f.write("%s %s %s %s %s %s\n" % (
predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)), str(int(bottom))))
f.close()
return
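# ---------------------------------------------------------#
# Usage sketch for get_map_txt (a minimal sketch of how a mAP script would
# call it; the class/instance names, image id, paths and class list are
# assumptions, and map_out/detection-results is expected to exist beforehand):
#     yolo = YOLO()
#     class_names, _ = get_classes("model_data/voc_classes.txt")
#     image = Image.open("VOCdevkit/VOC2007/JPEGImages/000001.jpg")
#     yolo.get_map_txt("000001", image, class_names, "map_out")
# ---------------------------------------------------------#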