main
朱瑞 2024-07-04 17:03:29 +08:00
commit 4a2986bde5
30 changed files with 5342 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,140 @@
# ignore map, miou, datasets
map_out/
miou_out/
VOCdevkit/
datasets/
Medical_Datasets/
lfw/
logs/
model_data/
.temp_map_out/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

Dataset_Partition.py Normal file

@@ -0,0 +1,161 @@
import os
import random
import xml.etree.ElementTree as ET
import numpy as np
from utils.utils import get_classes
# --------------------------------------------------------------------------------------------------------------------------------#
# annotation_mode selects what this script computes when it is run.
# annotation_mode = 0 runs the whole label-processing pipeline: it creates the txt files in VOCdevkit/VOC2007/ImageSets as well as the 2007_train.txt and 2007_val.txt used for training.
# annotation_mode = 1 only creates the txt files in VOCdevkit/VOC2007/ImageSets.
# annotation_mode = 2 only creates the 2007_train.txt and 2007_val.txt used for training.
# --------------------------------------------------------------------------------------------------------------------------------#
annotation_mode = 0
# -------------------------------------------------------------------#
# classes_path must be modified: it provides the class information used to generate 2007_train.txt and 2007_val.txt.
# It should match the classes_path used for training and prediction.
# If the generated 2007_train.txt contains no object information,
# the classes were not set correctly.
# Only used when annotation_mode is 0 or 2.
# -------------------------------------------------------------------#
classes_path = 'model_data/voc_classes.txt'
# --------------------------------------------------------------------------------------------------------------------------------#
# trainval_percent specifies the ratio of (train + val) to test; by default (train + val) : test = 9 : 1.
# train_percent specifies the ratio of train to val within (train + val); by default train : val = 9 : 1.
# Only used when annotation_mode is 0 or 1.
# --------------------------------------------------------------------------------------------------------------------------------#
trainval_percent = 0.9
train_percent = 0.9
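# Worked example of the split sizes implied by the two ratios above (the image count is an assumed
# illustration, not taken from any dataset): with num = 1000 annotated images,
#   tv = int(1000 * 0.9) = 900 images in train + val
#   tr = int(900 * 0.9)  = 810 images in train
# leaving 90 images for val and 100 for test.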
# -------------------------------------------------------#
# Path of the folder containing the VOC dataset.
# Defaults to the VOC dataset in the repository root.
# -------------------------------------------------------#
VOCdevkit_path = 'VOCdevkit'
VOCdevkit_sets = [('2007', 'train'), ('2007', 'val')]
classes, _ = get_classes(classes_path)
# -------------------------------------------------------#
# Count the number of objects.
# -------------------------------------------------------#
photo_nums = np.zeros(len(VOCdevkit_sets)) # number of images in train and in val
nums = np.zeros(len(classes))
def convert_annotation(year, image_id, list_file):
in_file = open(os.path.join(VOCdevkit_path, 'VOC%s/Annotations/%s.xml' % (year, image_id)), encoding='utf-8')
tree = ET.parse(in_file)
root = tree.getroot()
for obj in root.iter('object'):
difficult = 0
if obj.find('difficult') != None:
difficult = obj.find('difficult').text
cls = obj.find('name').text
if cls not in classes or int(difficult) == 1:
continue
cls_id = classes.index(cls)
xmlbox = obj.find('bndbox')
b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)),
int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text)))
list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
nums[classes.index(cls)] = nums[classes.index(cls)] + 1 # count the instances of each class
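# Sketch of the line format that convert_annotation and the loop below write into 2007_train.txt /
# 2007_val.txt (path and numbers are hypothetical, shown only to illustrate the format):
#   /abs/path/VOCdevkit/VOC2007/JPEGImages/000001.jpg 48,240,195,371,11 8,12,352,498,14
# i.e. the absolute image path followed by one "xmin,ymin,xmax,ymax,class_id" group per object.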
if __name__ == "__main__":
random.seed(0)
if " " in os.path.abspath(VOCdevkit_path):
raise ValueError("The dataset folder path and the image file names must not contain spaces, otherwise training will not work correctly. Please rename them.")
if annotation_mode == 0 or annotation_mode == 1:
print("Generate txt in ImageSets.")
xmlfilepath = os.path.join(VOCdevkit_path, 'VOC2007/Annotations')
saveBasePath = os.path.join(VOCdevkit_path, 'VOC2007/ImageSets')
temp_xml = os.listdir(xmlfilepath)
total_xml = []
for xml in temp_xml:
if xml.endswith(".xml"):
total_xml.append(xml)
num = len(total_xml)
list = range(num)
tv = int(num * trainval_percent) # total number of train + val samples
tr = int(tv * train_percent) # number of training samples within train + val
trainval = random.sample(list, tv) # sample the train + val indices from all indices
train = random.sample(trainval, tr) # sample the train indices from trainval
print("train and val size", tv)
print("train size", tr)
ftrainval = open(os.path.join(saveBasePath, 'trainval.txt'), 'w')
ftest = open(os.path.join(saveBasePath, 'test.txt'), 'w')
ftrain = open(os.path.join(saveBasePath, 'train.txt'), 'w')
fval = open(os.path.join(saveBasePath, 'val.txt'), 'w')
for i in list:
name = total_xml[i][:-4] + '\n'
if i in trainval:
ftrainval.write(name)
if i in train:
ftrain.write(name)
else:
fval.write(name)
else:
ftest.write(name)
ftrainval.close()
ftrain.close()
fval.close()
ftest.close()
print("Generate txt in ImageSets done.")
if annotation_mode == 0 or annotation_mode == 2:
print("Generate 2007_train.txt and 2007_val.txt for train.")
type_index = 0
for year, image_set in VOCdevkit_sets:
image_ids = open(os.path.join(VOCdevkit_path, 'VOC%s/ImageSets/Main/%s.txt' % (year, image_set)),
encoding='utf-8').read().strip().split()
list_file = open('%s_%s.txt' % (year, image_set), 'w', encoding='utf-8')
for image_id in image_ids:
list_file.write(
'%s/VOC%s/JPEGImages/%s.jpg' % (os.path.abspath(VOCdevkit_path), year, image_id)) # the image path is assembled from its parts
convert_annotation(year, image_id, list_file)
list_file.write('\n')
photo_nums[type_index] = len(image_ids)
type_index += 1
list_file.close()
print("Generate 2007_train.txt and 2007_val.txt for train done.")
def printTable(List1, List2):
for i in range(len(List1[0])):
print("|", end=' ')
for j in range(len(List1)):
print(List1[j][i].rjust(int(List2[j])), end=' ')
print("|", end=' ')
print()
str_nums = [str(int(x)) for x in nums]
tableData = [
classes, str_nums
]
colWidths = [0] * len(tableData)
len1 = 0
for i in range(len(tableData)):
for j in range(len(tableData[i])):
if len(tableData[i][j]) > colWidths[i]:
colWidths[i] = len(tableData[i][j])
printTable(tableData, colWidths)
if photo_nums[0] <= 500:
print("训练集数量小于500属于较小的数据量请注意设置较大的训练世代Epoch以满足足够的梯度下降次数Step")
if np.sum(nums) == 0:
print("在数据集中并未获得任何目标请注意修改classes_path对应自己的数据集并且保证标签名字正确否则训练将会没有任何效果")
print("在数据集中并未获得任何目标请注意修改classes_path对应自己的数据集并且保证标签名字正确否则训练将会没有任何效果")
print("在数据集中并未获得任何目标请注意修改classes_path对应自己的数据集并且保证标签名字正确否则训练将会没有任何效果")
print("(重要的事情说三遍)。")

LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 JiaQi Xu
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

ad_train.py Normal file

@@ -0,0 +1,46 @@
from torch import optim
class BaseConfig(object):
"""
Default parameters for all config files.
"""
def __init__(self):
"""
Set the defaults.
"""
self.img_dir = "inria/Train/pos"
self.lab_dir = "inria/Train/pos/yolo-labels"
self.cfgfile = "cfg/yolo.cfg"
self.weightfile = "weights/yolo.weights"
self.printfile = "non_printability/30values.txt"
self.patch_size = 300
self.start_learning_rate = 0.03
self.patch_name = 'base'
self.scheduler_factory = lambda x: optim.lr_scheduler.ReduceLROnPlateau(x, 'min', patience=50)
self.max_tv = 0
self.batch_size = 20
self.loss_target = lambda obj, cls: obj * cls
class ReproducePaperObj(BaseConfig):
"""
Reproduce the results from the paper: Generate a patch that minimises object score.
"""
def __init__(self):
super().__init__()
self.batch_size = 8
self.patch_size = 300
self.patch_name = 'ObjectOnlyPaper'
self.max_tv = 0.165
self.loss_target = lambda obj, cls: obj
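# Minimal usage sketch (an assumption about how these configs are consumed by a training loop; the
# nn.Conv2d below is only a stand-in for the parameters being optimised, not part of this repo):
if __name__ == "__main__":
    import torch.nn as nn

    cfg = ReproducePaperObj()
    model = nn.Conv2d(3, 3, 1)  # placeholder module
    optimizer = optim.Adam(model.parameters(), lr=cfg.start_learning_rate)
    scheduler = cfg.scheduler_factory(optimizer)  # ReduceLROnPlateau(optimizer, 'min', patience=50)
    print(cfg.patch_name, cfg.loss_target(0.9, 0.7))  # ObjectOnlyPaper keeps only the objectness term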

get_map.py Normal file

@@ -0,0 +1,138 @@
import os
import xml.etree.ElementTree as ET
from PIL import Image
from tqdm import tqdm
from utils.utils import get_classes
from utils.utils_map import get_coco_map, get_map
from yolo import YOLO
if __name__ == "__main__":
'''
Unlike AP, Recall and Precision are not area-based metrics, so their values differ at different confidence thresholds.
By default, the Recall and Precision computed by this script correspond to a confidence threshold of 0.5.
Because of how mAP is computed, the network has to output nearly all of its predicted boxes so that Recall and Precision can be evaluated at every threshold.
The txt files in map_out/detection-results/ therefore usually contain more boxes than a direct predict run; the goal is to list every possible prediction.
'''
# ------------------------------------------------------------------------------------------------------------------#
# map_mode selects what this script computes when it is run.
# map_mode = 0 runs the whole mAP pipeline: obtain the predictions, obtain the ground-truth boxes, and compute the VOC mAP.
# map_mode = 1 only obtains the predictions.
# map_mode = 2 only obtains the ground-truth boxes.
# map_mode = 3 only computes the VOC mAP.
# map_mode = 4 uses the COCO toolbox to compute mAP 0.50:0.95 for the current dataset. The predictions and ground truth must already have been generated, and pycocotools must be installed.
# -------------------------------------------------------------------------------------------------------------------#
map_mode = 0
# --------------------------------------------------------------------------------------#
# classes_path here specifies the classes for which the VOC mAP is measured.
# It should normally match the classes_path used for training and prediction.
# --------------------------------------------------------------------------------------#
classes_path = 'model_data/voc_classes.txt'
# --------------------------------------------------------------------------------------#
# MINOVERLAP specifies which mAP0.x to compute; look up the meaning of mAP0.x if it is unfamiliar.
# For example, to compute mAP0.75, set MINOVERLAP = 0.75.
#
# A predicted box whose overlap with a ground-truth box exceeds MINOVERLAP is counted as a positive sample, otherwise as a negative sample.
# The larger MINOVERLAP is, the more accurately a box must be predicted to count as positive, and the lower the resulting mAP.
# --------------------------------------------------------------------------------------#
MINOVERLAP = 0.5
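# Illustrative example (the IoU value is assumed, not computed by this script): a predicted box with
# IoU = 0.6 against its matched ground-truth box counts as a true positive at MINOVERLAP = 0.5,
# but as a false positive at MINOVERLAP = 0.75.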
# --------------------------------------------------------------------------------------#
# Because of how mAP is computed, the network has to output nearly all of its predicted boxes,
# so confidence should be set as small as possible to collect every candidate box.
#
# This value should normally not be changed: computing mAP needs almost all predicted boxes, so this confidence must not be altered casually.
# To obtain Recall and Precision at a different threshold, change score_threhold below instead.
# --------------------------------------------------------------------------------------#
confidence = 0.001
# --------------------------------------------------------------------------------------#
# Non-maximum-suppression IoU used at prediction time; the larger it is, the less strict NMS becomes.
#
# This value should normally not be changed.
# --------------------------------------------------------------------------------------#
nms_iou = 0.5
# ---------------------------------------------------------------------------------------------------------------#
# Unlike AP, Recall and Precision are not area-based metrics, so their values differ at different thresholds.
#
# By default, the Recall and Precision computed here correspond to a threshold of 0.5 (defined here as score_threhold).
# Because computing mAP needs almost all predicted boxes, the confidence defined above must not be changed.
# A separate score_threhold is therefore defined as the threshold at which the matching Recall and Precision are reported.
# ---------------------------------------------------------------------------------------------------------------#
score_threhold = 0.5
# -------------------------------------------------------#
# map_vis enables visualisation of the VOC mAP computation.
# -------------------------------------------------------#
map_vis = False
# -------------------------------------------------------#
# Path of the folder containing the VOC dataset.
# Defaults to the VOC dataset in the repository root.
# -------------------------------------------------------#
VOCdevkit_path = 'VOCdevkit'
# -------------------------------------------------------#
# Output folder for the results, map_out by default.
# -------------------------------------------------------#
map_out_path = 'map_out'
image_ids = open(os.path.join(VOCdevkit_path, "VOC2007/ImageSets/Main/test.txt")).read().strip().split()
if not os.path.exists(map_out_path):
os.makedirs(map_out_path)
if not os.path.exists(os.path.join(map_out_path, 'ground-truth')):
os.makedirs(os.path.join(map_out_path, 'ground-truth'))
if not os.path.exists(os.path.join(map_out_path, 'detection-results')):
os.makedirs(os.path.join(map_out_path, 'detection-results'))
if not os.path.exists(os.path.join(map_out_path, 'images-optional')):
os.makedirs(os.path.join(map_out_path, 'images-optional'))
class_names, _ = get_classes(classes_path)
if map_mode == 0 or map_mode == 1:
print("Load model.")
yolo = YOLO(confidence=confidence, nms_iou=nms_iou)
print("Load model done.")
print("Get predict result.")
for image_id in tqdm(image_ids):
image_path = os.path.join(VOCdevkit_path, "VOC2007/JPEGImages/" + image_id + ".jpg")
image = Image.open(image_path)
if map_vis:
image.save(os.path.join(map_out_path, "images-optional/" + image_id + ".jpg"))
yolo.get_map_txt(image_id, image, class_names, map_out_path)
print("Get predict result done.")
if map_mode == 0 or map_mode == 2:
print("Get ground truth result.")
for image_id in tqdm(image_ids):
with open(os.path.join(map_out_path, "ground-truth/" + image_id + ".txt"), "w") as new_f:
root = ET.parse(os.path.join(VOCdevkit_path, "VOC2007/Annotations/" + image_id + ".xml")).getroot()
for obj in root.findall('object'):
difficult_flag = False
if obj.find('difficult') != None:
difficult = obj.find('difficult').text
if int(difficult) == 1:
difficult_flag = True
obj_name = obj.find('name').text
if obj_name not in class_names:
continue
bndbox = obj.find('bndbox')
left = bndbox.find('xmin').text
top = bndbox.find('ymin').text
right = bndbox.find('xmax').text
bottom = bndbox.find('ymax').text
if difficult_flag:
new_f.write("%s %s %s %s %s difficult\n" % (obj_name, left, top, right, bottom))
else:
new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
print("Get ground truth result done.")
if map_mode == 0 or map_mode == 3:
print("Get map.")
get_map(MINOVERLAP, True, score_threhold=score_threhold, path=map_out_path)
print("Get map done.")
if map_mode == 4:
print("Get map.")
get_coco_map(class_names=class_names, path=map_out_path)
print("Get map done.")

kmeans_for_anchors.py Normal file

@@ -0,0 +1,167 @@
# -------------------------------------------------------------------------------------------------------#
# Although k-means clusters the boxes in the dataset, many datasets contain boxes of similar size, so the nine clustered anchors end up close to each other.
# Such anchors are actually unfavourable for training, because different feature levels suit different anchor sizes: the smaller the feature map, the larger the anchors it should handle.
# The original network's anchors are already split into large/medium/small, so very good results are obtained even without clustering.
# -------------------------------------------------------------------------------------------------------#
import glob
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
def cas_iou(box, cluster):
x = np.minimum(cluster[:, 0], box[0])
y = np.minimum(cluster[:, 1], box[1])
intersection = x * y
area1 = box[0] * box[1]
area2 = cluster[:, 0] * cluster[:, 1]
iou = intersection / (area1 + area2 - intersection)
return iou
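# Worked example for cas_iou (numbers are assumed, for illustration only): for box = (0.2, 0.6) and a
# single cluster (0.4, 0.3), the overlap is min(0.2, 0.4) * min(0.6, 0.3) = 0.06, the union is
# 0.12 + 0.12 - 0.06 = 0.18, so the width/height IoU is 0.06 / 0.18 = 1/3.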
def avg_iou(box, cluster):
return np.mean([np.max(cas_iou(box[i], cluster)) for i in range(box.shape[0])])
def kmeans(box, k):
# -------------------------------------------------------------#
# number of boxes in total
# -------------------------------------------------------------#
row = box.shape[0]
# -------------------------------------------------------------#
# distance from every box to every cluster centre
# -------------------------------------------------------------#
distance = np.empty((row, k))
# -------------------------------------------------------------#
# cluster assignment from the previous iteration
# -------------------------------------------------------------#
last_clu = np.zeros((row,))
np.random.seed()
# -------------------------------------------------------------#
# randomly choose k boxes as the initial cluster centres
# -------------------------------------------------------------#
cluster = box[np.random.choice(row, k, replace=False)]
iter = 0
while True:
# -------------------------------------------------------------#
# compute the (1 - IoU) distance between every box and the current cluster centres
# -------------------------------------------------------------#
for i in range(row):
distance[i] = 1 - cas_iou(box[i], cluster)
# -------------------------------------------------------------#
# assign each box to its closest cluster centre
# -------------------------------------------------------------#
near = np.argmin(distance, axis=1)
if (last_clu == near).all():
break
# -------------------------------------------------------------#
# take the median of each cluster as its new centre
# -------------------------------------------------------------#
for j in range(k):
cluster[j] = np.median(
box[near == j], axis=0)
last_clu = near
if iter % 5 == 0:
print('iter: {:d}. avg_iou:{:.2f}'.format(iter, avg_iou(box, cluster)))
iter += 1
return cluster, near
def load_data(path):
data = []
# -------------------------------------------------------------#
# collect the boxes from every xml file
# -------------------------------------------------------------#
for xml_file in tqdm(glob.glob('{}/*xml'.format(path))):
tree = ET.parse(xml_file)
height = int(tree.findtext('./size/height'))
width = int(tree.findtext('./size/width'))
if height <= 0 or width <= 0:
continue
# -------------------------------------------------------------#
# get the width and height of every object
# -------------------------------------------------------------#
for obj in tree.iter('object'):
xmin = int(float(obj.findtext('bndbox/xmin'))) / width
ymin = int(float(obj.findtext('bndbox/ymin'))) / height
xmax = int(float(obj.findtext('bndbox/xmax'))) / width
ymax = int(float(obj.findtext('bndbox/ymax'))) / height
xmin = np.float64(xmin)
ymin = np.float64(ymin)
xmax = np.float64(xmax)
ymax = np.float64(ymax)
# width and height
data.append([xmax - xmin, ymax - ymin])
return np.array(data)
if __name__ == '__main__':
np.random.seed(0)
# -------------------------------------------------------------#
# Running this script processes the xml files in './VOCdevkit/VOC2007/Annotations'
# and generates yolo_anchors.txt.
# -------------------------------------------------------------#
input_shape = [416, 416]
anchors_num = 9
# -------------------------------------------------------------#
# Load the dataset; VOC-style xml annotations can be used.
# -------------------------------------------------------------#
path = 'VOCdevkit/VOC2007/Annotations'
# -------------------------------------------------------------#
# Load all xml files.
# Boxes are stored as width,height normalised by the image size.
# -------------------------------------------------------------#
print('Load xmls.')
data = load_data(path)
print('Load xmls done.')
# -------------------------------------------------------------#
# Run the k-means clustering.
# -------------------------------------------------------------#
print('K-means boxes.')
cluster, near = kmeans(data, anchors_num)
print('K-means boxes done.')
data = data * np.array([input_shape[1], input_shape[0]])
cluster = cluster * np.array([input_shape[1], input_shape[0]])
# -------------------------------------------------------------#
# Plot the result.
# -------------------------------------------------------------#
for j in range(anchors_num):
plt.scatter(data[near == j][:, 0], data[near == j][:, 1])
plt.scatter(cluster[j][0], cluster[j][1], marker='x', c='black')
plt.savefig("kmeans_for_anchors.jpg")
plt.show()
print('Save kmeans_for_anchors.jpg in root dir.')
cluster = cluster[np.argsort(cluster[:, 0] * cluster[:, 1])]
print('avg_ratio:{:.2f}'.format(avg_iou(data, cluster)))
print(cluster)
f = open("yolo_anchors.txt", 'w')
row = np.shape(cluster)[0]
for i in range(row):
if i == 0:
x_y = "%d,%d" % (cluster[i][0], cluster[i][1])
else:
x_y = ", %d,%d" % (cluster[i][0], cluster[i][1])
f.write(x_y)
f.close()
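# The resulting yolo_anchors.txt holds the nine widths and heights on a single line, sorted by area.
# For illustration only (the values below are the classic YOLOv3 COCO anchors, not output of this script):
# 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326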

load_data.py Normal file

@@ -0,0 +1,531 @@
import fnmatch
import math
import os
import sys
import time
from operator import itemgetter
import gc
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
# from darknet import Darknet
from median_pool import MedianPool2d
# print('starting test read')
# im = Image.open('data/horse.jpg').convert('RGB')
# print('img read!')
class MaxProbExtractor(nn.Module):
"""MaxProbExtractor: extracts max class probability for class from YOLO output.
Module providing the functionality necessary to extract the max class probability for one class from YOLO output.
"""
def __init__(self, cls_id, num_cls, config):
super(MaxProbExtractor, self).__init__()
self.cls_id = cls_id
self.num_cls = num_cls
self.config = config
self.anchor_num = 3
def forward(self, YOLOoutput):
# get values neccesary for transformation
if YOLOoutput.dim() == 3:
YOLOoutput = YOLOoutput.unsqueeze(0)
batch = YOLOoutput.size(0)
assert (YOLOoutput.size(1) == (5 + self.num_cls) * self.anchor_num)
h = YOLOoutput.size(2)
w = YOLOoutput.size(3)
# transform the output tensor from [batch, 425, 19, 19] to [batch, 80, 1805]
output = YOLOoutput.view(batch, self.anchor_num, 5 + self.num_cls, h * w) # [batch, 5, 85, 361]
output = output.transpose(1, 2).contiguous() # [batch, 85, 5, 361]
output = output.view(batch, 5 + self.num_cls, self.anchor_num * h * w) # [batch, 85, 1805]
output_objectness = torch.sigmoid(output[:, 4, :]) # [batch, 1805] # objectness: is there an object
output = output[:, 5:5 + self.num_cls, :] # [batch, 80, 1805]
# perform softmax to normalize probabilities for object classes to [0,1]
normal_confs = torch.nn.Softmax(dim=1)(output) # class probabilities
# we only care for probabilities of the class of interest (person)
confs_for_class = normal_confs[:, self.cls_id, :] # the class index cls_id corresponds to "person"
confs_if_object = output_objectness # confs_for_class * output_objectness
confs_if_object = confs_for_class * output_objectness
confs_if_object = self.config.loss_target(output_objectness, confs_for_class)
# find the max probability for person
max_conf, max_conf_idx = torch.max(confs_if_object, dim=1)
return max_conf
class NPSCalculator(nn.Module):
"""NMSCalculator: calculates the non-printability score of a patch.
Module providing the functionality necessary to calculate the non-printability score (NMS) of an adversarial patch.
"""
def __init__(self, printability_file, patch_side):
super(NPSCalculator, self).__init__()
self.printability_array = nn.Parameter(self.get_printability_array(printability_file, patch_side),
requires_grad=False)
def forward(self, adv_patch):
# calculate euclidian distance between colors in patch and colors in printability_array
# square root of sum of squared difference
color_dist = (adv_patch - self.printability_array + 0.000001)
color_dist = color_dist ** 2
color_dist = torch.sum(color_dist, 1) + 0.000001
color_dist = torch.sqrt(color_dist)
# only work with the min distance
color_dist_prod = torch.min(color_dist, 0)[0] # test: change prod for min (find distance to closest color)
# calculate the nps by summing over all pixels
nps_score = torch.sum(color_dist_prod, 0)
nps_score = torch.sum(nps_score, 0)
return nps_score / torch.numel(adv_patch)
def get_printability_array(self, printability_file, side):
printability_list = []
# read in printability triplets and put them in a list
with open(printability_file) as f:
for line in f:
printability_list.append(line.split(","))
printability_array = []
for printability_triplet in printability_list:
printability_imgs = []
red, green, blue = printability_triplet
printability_imgs.append(np.full((side, side), red))
printability_imgs.append(np.full((side, side), green))
printability_imgs.append(np.full((side, side), blue))
printability_array.append(printability_imgs)
printability_array = np.asarray(printability_array)
printability_array = np.float32(printability_array)
pa = torch.from_numpy(printability_array)
return pa
class TotalVariation(nn.Module):
"""TotalVariation: calculates the total variation of a patch.
Module providing the functionality necessary to calculate the total Variation (TV) of an adversarial patch.
"""
def __init__(self):
super(TotalVariation, self).__init__()
def forward(self, adv_patch):
# compute the total variation of the adv_patch
tvcomp1 = torch.sum(torch.abs(adv_patch[:, :, 1:] - adv_patch[:, :, :-1] + 0.000001), 0)
tvcomp1 = torch.sum(torch.sum(tvcomp1, 0), 0)
tvcomp2 = torch.sum(torch.abs(adv_patch[:, 1:, :] - adv_patch[:, :-1, :] + 0.000001), 0)
tvcomp2 = torch.sum(torch.sum(tvcomp2, 0), 0)
tv = tvcomp1 + tvcomp2
return tv / torch.numel(adv_patch)
class PatchTransformer(nn.Module):
"""PatchTransformer: transforms batch of patches
Module providing the functionality necessary to transform a batch of patches, randomly adjusting brightness and
contrast, adding random amount of noise, and rotating randomly. Resizes-patches according to as size based on the
batch of labels, and pads them to the dimension of an image.
"""
def __init__(self):
super(PatchTransformer, self).__init__()
self.min_contrast = 0.8
self.max_contrast = 1.2
self.min_brightness = -0.1
self.max_brightness = 0.1
self.noise_factor = 0.10
self.minangle = -20 / 180 * math.pi
self.maxangle = 20 / 180 * math.pi
self.medianpooler = MedianPool2d(7, same=True) # median pooling
'''
kernel = torch.cuda.FloatTensor([[0.003765, 0.015019, 0.023792, 0.015019, 0.003765],
[0.015019, 0.059912, 0.094907, 0.059912, 0.015019],
[0.023792, 0.094907, 0.150342, 0.094907, 0.023792],
[0.015019, 0.059912, 0.094907, 0.059912, 0.015019],
[0.003765, 0.015019, 0.023792, 0.015019, 0.003765]])
self.kernel = kernel.unsqueeze(0).unsqueeze(0).expand(3,3,-1,-1)
'''
def forward(self, adv_patch, lab_batch, img_size, do_rotate=True, rand_loc=True):
# adv_patch = F.conv2d(adv_patch.unsqueeze(0),self.kernel,padding=(2,2))
adv_patch = self.medianpooler(adv_patch.unsqueeze(0))
# Determine size of padding
pad = (img_size - adv_patch.size(-1)) / 2
# Make a batch of patches
adv_patch = adv_patch.unsqueeze(0) # .unsqueeze(0) # add another dimension here, giving a 5-D tensor: 1, 1, 3, 300, 300
adv_batch = adv_patch.expand(lab_batch.size(0), lab_batch.size(1), -1, -1, -1) # note: adv_batch, not adv_patch; shape 8, 14, 3, 300, 300
batch_size = torch.Size((lab_batch.size(0), lab_batch.size(1))) # 8, 14
# Contrast, brightness and noise transforms
# Create random contrast tensor
contrast = torch.cuda.FloatTensor(batch_size).uniform_(self.min_contrast, self.max_contrast)
contrast = contrast.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
contrast = contrast.expand(-1, -1, adv_batch.size(-3), adv_batch.size(-2), adv_batch.size(-1))
contrast = contrast.cuda()
# Create random brightness tensor
brightness = torch.cuda.FloatTensor(batch_size).uniform_(self.min_brightness, self.max_brightness)
brightness = brightness.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
brightness = brightness.expand(-1, -1, adv_batch.size(-3), adv_batch.size(-2), adv_batch.size(-1))
brightness = brightness.cuda()
# Create random noise tensor
noise = torch.cuda.FloatTensor(adv_batch.size()).uniform_(-1, 1) * self.noise_factor
# Apply contrast/brightness/noise, clamp
adv_batch = adv_batch * contrast + brightness + noise
adv_batch = torch.clamp(adv_batch, 0.000001, 0.99999) # clamp to the (0, 1) range
# Where the label class_id is 1 we don't want a patch (padding) --> fill mask with zero's
cls_ids = torch.narrow(lab_batch, 2, 0, 1) # torch.narrow(input, dim, start, length) returns a view (shared memory) of indices start..start+length-1 along dim
cls_mask = cls_ids.expand(-1, -1, 3) # continuing: this takes the class-id column of lab_batch, i.e. lab_batch[..., 0]
cls_mask = cls_mask.unsqueeze(-1)
cls_mask = cls_mask.expand(-1, -1, -1, adv_batch.size(3))
cls_mask = cls_mask.unsqueeze(-1)
cls_mask = cls_mask.expand(-1, -1, -1, -1, adv_batch.size(4)) # cls_mask has shape 8, 14, 3, 300, 300 and holds the class ids
msk_batch = torch.cuda.FloatTensor(cls_mask.size()).fill_(1) - cls_mask # build the mask that selects the person labels
# Pad patch and mask to image dimensions
mypad = nn.ConstantPad2d((int(pad + 0.5), int(pad), int(pad + 0.5), int(pad)), 0) # (padding_left, padding_right, padding_top, padding_bottom), filled with zeros
adv_batch = mypad(adv_batch) # zero-pad up to 416
msk_batch = mypad(msk_batch)
# Rotation and rescaling transforms
anglesize = (lab_batch.size(0) * lab_batch.size(1)) # number of rotation angles (one per label slot)
if do_rotate:
angle = torch.cuda.FloatTensor(anglesize).uniform_(self.minangle, self.maxangle)
else:
angle = torch.cuda.FloatTensor(anglesize).fill_(0)
# Resizes and rotates
current_patch_size = adv_patch.size(-1)
lab_batch_scaled = torch.cuda.FloatTensor(lab_batch.size()).fill_(0) # lab_batch_scaled holds the sizes in full-image pixels
lab_batch_scaled[:, :, 1] = lab_batch[:, :, 1] * img_size
lab_batch_scaled[:, :, 2] = lab_batch[:, :, 2] * img_size
lab_batch_scaled[:, :, 3] = lab_batch[:, :, 3] * img_size
lab_batch_scaled[:, :, 4] = lab_batch[:, :, 4] * img_size
target_size = torch.sqrt(
((lab_batch_scaled[:, :, 3].mul(0.2)) ** 2) + ((lab_batch_scaled[:, :, 4].mul(0.2)) ** 2))
target_x = lab_batch[:, :, 1].view(np.prod(batch_size))
target_y = lab_batch[:, :, 2].view(np.prod(batch_size))
targetoff_x = lab_batch[:, :, 3].view(np.prod(batch_size))
targetoff_y = lab_batch[:, :, 4].view(np.prod(batch_size))
if (rand_loc):
off_x = targetoff_x * (torch.cuda.FloatTensor(targetoff_x.size()).uniform_(-0.4, 0.4))
target_x = target_x + off_x
off_y = targetoff_y * (torch.cuda.FloatTensor(targetoff_y.size()).uniform_(-0.4, 0.4))
target_y = target_y + off_y
target_y = target_y - 0.05
scale = target_size / current_patch_size # scale factor of the target size relative to the patch size?
scale = scale.view(anglesize)
s = adv_batch.size()
adv_batch = adv_batch.view(s[0] * s[1], s[2], s[3], s[4])
msk_batch = msk_batch.view(s[0] * s[1], s[2], s[3], s[4])
tx = (-target_x + 0.5) * 2
ty = (-target_y + 0.5) * 2
sin = torch.sin(angle)
cos = torch.cos(angle)
# Theta = rotation,rescale matrix
theta = torch.cuda.FloatTensor(anglesize, 2, 3).fill_(0)
theta[:, 0, 0] = cos / scale
theta[:, 0, 1] = sin / scale
theta[:, 0, 2] = tx * cos / scale + ty * sin / scale
theta[:, 1, 0] = -sin / scale
theta[:, 1, 1] = cos / scale
theta[:, 1, 2] = -tx * sin / scale + ty * cos / scale
b_sh = adv_batch.shape
grid = F.affine_grid(theta, adv_batch.shape)
adv_batch_t = F.grid_sample(adv_batch, grid)
msk_batch_t = F.grid_sample(msk_batch, grid)
'''
# Theta2 = translation matrix
theta2 = torch.cuda.FloatTensor(anglesize, 2, 3).fill_(0)
theta2[:, 0, 0] = 1
theta2[:, 0, 1] = 0
theta2[:, 0, 2] = (-target_x + 0.5) * 2
theta2[:, 1, 0] = 0
theta2[:, 1, 1] = 1
theta2[:, 1, 2] = (-target_y + 0.5) * 2
grid2 = F.affine_grid(theta2, adv_batch.shape)
adv_batch_t = F.grid_sample(adv_batch_t, grid2)
msk_batch_t = F.grid_sample(msk_batch_t, grid2)
'''
adv_batch_t = adv_batch_t.view(s[0], s[1], s[2], s[3], s[4])
msk_batch_t = msk_batch_t.view(s[0], s[1], s[2], s[3], s[4])
adv_batch_t = torch.clamp(adv_batch_t, 0.000001, 0.999999)
# img = msk_batch_t[0, 0, :, :, :].detach().cpu()
# img = transforms.ToPILImage()(img)
# img.show()
# exit()
return adv_batch_t * msk_batch_t
class PatchApplier(nn.Module):
"""PatchApplier: applies adversarial patches to images.
Module providing the functionality necessary to apply a patch to all detections in all images in the batch.
"""
def __init__(self):
super(PatchApplier, self).__init__()
def forward(self, img_batch, adv_batch):
advs = torch.unbind(adv_batch, 1) # unbind along dim 1
for adv in advs:
img_batch = torch.where((adv == 0), img_batch, adv) # replace the image pixels at the corresponding positions? (the image itself may not be involved yet at this point)
return img_batch
'''
class PatchGenerator(nn.Module):
"""PatchGenerator: network module that generates adversarial patches.
Module representing the neural network that will generate adversarial patches.
"""
def __init__(self, cfgfile, weightfile, img_dir, lab_dir):
super(PatchGenerator, self).__init__()
self.yolo = Darknet(cfgfile).load_weights(weightfile)
self.dataloader = torch.utils.data.DataLoader(InriaDataset(img_dir, lab_dir, shuffle=True),
batch_size=5,
shuffle=True)
self.patchapplier = PatchApplier()
self.nmscalculator = NMSCalculator()
self.totalvariation = TotalVariation()
def forward(self, *input):
pass
'''
class InriaDataset(Dataset):
"""InriaDataset: representation of the INRIA person dataset.
Internal representation of the commonly used INRIA person dataset.
Available at: http://pascal.inrialpes.fr/data/human/
Attributes:
len: An integer number of elements in the dataset.
img_dir: Directory containing the images of the INRIA dataset.
lab_dir: Directory containing the labels of the INRIA dataset.
img_names: List of all image file names in img_dir.
shuffle: Whether or not to shuffle the dataset.
"""
def __init__(self, img_dir, lab_dir, max_lab, imgsize, shuffle=True):
n_png_images = len(fnmatch.filter(os.listdir(img_dir), '*.png')) # 614; fnmatch.filter returns a list
n_jpg_images = len(fnmatch.filter(os.listdir(img_dir), '*.jpg')) # 0
n_images = n_png_images + n_jpg_images # total number of images
n_labels = len(fnmatch.filter(os.listdir(lab_dir), '*.txt'))
assert n_images == n_labels, "Number of images and number of labels don't match"
self.len = n_images
self.img_dir = img_dir
self.lab_dir = lab_dir
self.imgsize = imgsize
self.img_names = fnmatch.filter(os.listdir(img_dir), '*.png') + fnmatch.filter(os.listdir(img_dir), '*.jpg')
self.shuffle = shuffle
self.img_paths = []
for img_name in self.img_names:
self.img_paths.append(os.path.join(self.img_dir, img_name))
self.lab_paths = []
for img_name in self.img_names:
lab_path = os.path.join(self.lab_dir, img_name).replace('.jpg', '.txt').replace('.png', '.txt')
self.lab_paths.append(lab_path)
self.max_n_labels = max_lab # padded label length
def __len__(self):
return self.len
def __getitem__(self, idx):
assert idx <= len(self), 'index range error'
img_path = os.path.join(self.img_dir, self.img_names[idx])
lab_path = os.path.join(self.lab_dir, self.img_names[idx]).replace('.jpg', '.txt').replace('.png', '.txt')
image = Image.open(img_path).convert('RGB')
if os.path.getsize(lab_path): # check to see if label file contains data.
label = np.loadtxt(lab_path)
else:
label = np.ones([5])
label = torch.from_numpy(label).float()
if label.dim() == 1:
label = label.unsqueeze(0)
image, label = self.pad_and_scale(image, label)
transform = transforms.ToTensor()
image = transform(image)
label = self.pad_lab(label)
# print("image size :", image.shape)
# print("label size :", label.shape)
return image, label
def pad_and_scale(self, img, lab):
"""
Args:
img:
Returns:
"""
w, h = img.size
if w == h:
padded_img = img
else:
dim_to_pad = 1 if w < h else 2
if dim_to_pad == 1:
padding = (h - w) / 2
padded_img = Image.new('RGB', (h, h), color=(127, 127, 127))
padded_img.paste(img, (int(padding), 0))
lab[:, [1]] = (lab[:, [1]] * w + padding) / h
lab[:, [3]] = (lab[:, [3]] * w / h)
else:
padding = (w - h) / 2
padded_img = Image.new('RGB', (w, w), color=(127, 127, 127))
padded_img.paste(img, (0, int(padding)))
lab[:, [2]] = (lab[:, [2]] * h + padding) / w
lab[:, [4]] = (lab[:, [4]] * h / w)
resize = transforms.Resize((self.imgsize, self.imgsize))
padded_img = resize(padded_img) # choose here
return padded_img, lab
def pad_lab(self, lab):
pad_size = self.max_n_labels - lab.shape[0]
if (pad_size > 0):
padded_lab = F.pad(lab, (0, 0, 0, pad_size), value=1) # (pad left, pad right, pad top, pad bottom)
else:
padded_lab = lab
return padded_lab
if __name__ == '__main__':
if len(sys.argv) == 3:
img_dir = sys.argv[1]
lab_dir = sys.argv[2]
else:
print('Usage: ')
print(' python load_data.py img_dir lab_dir')
sys.exit()
test_loader = torch.utils.data.DataLoader(InriaDataset(img_dir, lab_dir, shuffle=True),
batch_size=3, shuffle=True)
cfgfile = "cfg/yolov2.cfg"
weightfile = "weights/yolov2.weights"
printfile = "non_printability/30values.txt"
patch_size = 400
darknet_model = Darknet(cfgfile)
darknet_model.load_weights(weightfile)
darknet_model = darknet_model.cuda()
patch_applier = PatchApplier().cuda()
patch_transformer = PatchTransformer().cuda()
prob_extractor = MaxProbExtractor(0, 80).cuda()
nms_calculator = NPSCalculator(printfile, patch_size)
total_variation = TotalVariation()
'''
img = Image.open('data/horse.jpg').convert('RGB')
img = img.resize((darknet_model.width, darknet_model.height))
width = img.width
height = img.height
img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes()))
img = img.view(height, width, 3).transpose(0, 1).transpose(0, 2).contiguous()
img = img.view(1, 3, height, width)
img = img.float().div(255.0)
img = torch.autograd.Variable(img)
output = darknet_model(img)
'''
optimizer = torch.optim.Adam(darknet_model.parameters(), lr=0.0001)
tl0 = time.time()
tl1 = time.time()
for i_batch, (img_batch, lab_batch) in enumerate(test_loader):
tl1 = time.time()
print('time to fetch items: ', tl1 - tl0)
img_batch = img_batch.cuda()
lab_batch = lab_batch.cuda()
adv_patch = Image.open('data/horse.jpg').convert('RGB')
adv_patch = adv_patch.resize((patch_size, patch_size))
transform = transforms.ToTensor()
adv_patch = transform(adv_patch).cuda()
img_size = img_batch.size(-1)
print('transforming patches')
t0 = time.time()
adv_batch_t = patch_transformer.forward(adv_patch, lab_batch, img_size)
print('applying patches')
t1 = time.time()
img_batch = patch_applier.forward(img_batch, adv_batch_t)
img_batch = torch.autograd.Variable(img_batch)
img_batch = F.interpolate(img_batch, (darknet_model.height, darknet_model.width))
print('running patched images through model')
t2 = time.time()
for obj in gc.get_objects():
try:
if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
try:
print(type(obj), obj.size())
except:
pass
except:
pass
print(torch.cuda.memory_allocated())
output = darknet_model(img_batch)
print('extracting max probs')
t3 = time.time()
max_prob = prob_extractor(output)
t4 = time.time()
nms = nms_calculator.forward(adv_patch)
tv = total_variation(adv_patch)
print('---------------------------------')
print(' patch transformation : %f' % (t1 - t0))
print(' patch application : %f' % (t2 - t1))
print(' darknet forward : %f' % (t3 - t2))
print(' probability extraction : %f' % (t4 - t3))
print('---------------------------------')
print(' total forward pass : %f' % (t4 - t0))
del img_batch, lab_batch, adv_patch, adv_batch_t, output, max_prob
torch.cuda.empty_cache()
tl0 = time.time()

median_pool.py Normal file

@@ -0,0 +1,50 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.utils import _pair, _quadruple
class MedianPool2d(nn.Module):
""" Median pool (usable as median filter when stride=1) module.
Args:
kernel_size: size of pooling kernel, int or 2-tuple
stride: pool stride, int or 2-tuple
padding: pool padding, int or 4-tuple (l, r, t, b) as in pytorch F.pad
same: override padding and enforce same padding, boolean
"""
def __init__(self, kernel_size=3, stride=1, padding=0, same=False):
super(MedianPool2d, self).__init__()
self.k = _pair(kernel_size)
self.stride = _pair(stride)
self.padding = _quadruple(padding) # convert to l, r, t, b
self.same = same
def _padding(self, x):
if self.same:
ih, iw = x.size()[2:]
if ih % self.stride[0] == 0:
ph = max(self.k[0] - self.stride[0], 0)
else:
ph = max(self.k[0] - (ih % self.stride[0]), 0)
if iw % self.stride[1] == 0:
pw = max(self.k[1] - self.stride[1], 0)
else:
pw = max(self.k[1] - (iw % self.stride[1]), 0)
pl = pw // 2
pr = pw - pl
pt = ph // 2
pb = ph - pt
padding = (pl, pr, pt, pb)
else:
padding = self.padding
return padding
def forward(self, x):
# using existing pytorch functions and tensor ops so that we get autograd,
# would likely be more efficient to implement from scratch at C/Cuda level
x = F.pad(x, self._padding(x), mode='reflect')
x = x.unfold(2, self.k[0], self.stride[0]).unfold(3, self.k[1], self.stride[1])
x = x.contiguous().view(x.size()[:4] + (-1,)).median(dim=-1)[0]
return x
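# Minimal usage sketch (shapes are assumed for illustration): MedianPool2d with same=True keeps the
# spatial size, which is how load_data.py applies it to a 3x300x300 adversarial patch.
if __name__ == "__main__":
    x = torch.rand(1, 3, 300, 300)          # a random patch-like tensor
    pooled = MedianPool2d(7, same=True)(x)  # median filter with a 7x7 window
    print(pooled.shape)                     # torch.Size([1, 3, 300, 300])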

nets/__init__.py Normal file

@@ -0,0 +1 @@
#

nets/darknet.py Normal file

@@ -0,0 +1,101 @@
import math
from collections import OrderedDict
import torch.nn as nn
# ---------------------------------------------------------------------#
# Residual block:
# a 1x1 convolution reduces the number of channels, then a 3x3 convolution extracts features and restores the channel count,
# and finally a residual connection is added.
# ---------------------------------------------------------------------#
class BasicBlock(nn.Module):
def __init__(self, inplanes, planes):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes[0], kernel_size=1, stride=1, padding=0, bias=False) # go from many channels to few, then back from few to many
self.bn1 = nn.BatchNorm2d(planes[0])
self.relu1 = nn.LeakyReLU(0.1)
self.conv2 = nn.Conv2d(planes[0], planes[1], kernel_size=3, stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes[1])
self.relu2 = nn.LeakyReLU(0.1)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu1(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu2(out)
out += residual
return out
class DarkNet(nn.Module):
def __init__(self, layers):
super(DarkNet, self).__init__()
self.inplanes = 32 # the first convolution outputs 32 channels
# 416,416,3 -> 416,416,32
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(self.inplanes)
self.relu1 = nn.LeakyReLU(0.1)
# 416,416,32 -> 208,208,64
self.layer1 = self._make_layer([32, 64], layers[0]) # layers holds how many times each block is repeated
# 208,208,64 -> 104,104,128
self.layer2 = self._make_layer([64, 128], layers[1])
# 104,104,128 -> 52,52,256
self.layer3 = self._make_layer([128, 256], layers[2])
# 52,52,256 -> 26,26,512
self.layer4 = self._make_layer([256, 512], layers[3])
# 26,26,512 -> 13,13,1024
self.layer5 = self._make_layer([512, 1024], layers[4])
self.layers_out_filters = [64, 128, 256, 512, 1024]
# weight initialisation
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
# ---------------------------------------------------------------------#
# Each layer first downsamples with a 3x3 convolution of stride 2,
# then stacks the residual blocks.
# ---------------------------------------------------------------------#
def _make_layer(self, planes, blocks):
layers = []
# downsampling: stride 2, kernel size 3. _make_layer first builds one downsampling layer and then the repeated blocks
layers.append(("ds_conv", nn.Conv2d(self.inplanes, planes[1], kernel_size=3, stride=2, padding=1, bias=False)))
layers.append(("ds_bn", nn.BatchNorm2d(planes[1])))
layers.append(("ds_relu", nn.LeakyReLU(0.1)))
# add the residual blocks
self.inplanes = planes[1] # record this layer's output channels, which are also the next layer's input channels
for i in range(0, blocks):
layers.append(("residual_{}".format(i), BasicBlock(self.inplanes, planes)))
return nn.Sequential(OrderedDict(layers))
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu1(x)
x = self.layer1(x)
x = self.layer2(x)
out3 = self.layer3(x)
out4 = self.layer4(out3)
out5 = self.layer5(out4)
return out3, out4, out5
def darknet53():
model = DarkNet([1, 2, 8, 8, 4])
return model
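# Minimal sanity-check sketch (the 416x416 input size is assumed, matching the rest of this repo):
# it prints the shapes of the three feature maps consumed by the YOLO head.
if __name__ == "__main__":
    import torch
    model = darknet53()
    out3, out4, out5 = model(torch.rand(1, 3, 416, 416))
    print(out3.shape, out4.shape, out5.shape)  # (1,256,52,52), (1,512,26,26), (1,1024,13,13)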

nets/yolo.py Normal file

@@ -0,0 +1,111 @@
from collections import OrderedDict
import torch
import torch.nn as nn
from nets.darknet import darknet53
def conv2d(filter_in, filter_out, kernel_size):
pad = (kernel_size - 1) // 2 if kernel_size else 0
return nn.Sequential(OrderedDict([
("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=1, padding=pad, bias=False)),
("bn", nn.BatchNorm2d(filter_out)),
("relu", nn.LeakyReLU(0.1)),
]))
# ------------------------------------------------------------------------#
# make_last_layers contains seven convolutions in total: the first five extract features,
# the last two produce the yolo prediction for this head.
# ------------------------------------------------------------------------#
def make_last_layers(filters_list, in_filters, out_filter):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 1), # 1x1 convolutions repeatedly adjust the channel count and fuse information across channels
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
nn.Conv2d(filters_list[1], out_filter, kernel_size=1, stride=1, padding=0, bias=True)
)
return m
class YoloBody(nn.Module):
def __init__(self, anchors_mask, num_classes, pretrained=False):
super(YoloBody, self).__init__()
self.width = 416 # temporary addition
self.height = 416 # temporary addition
# ---------------------------------------------------#
# Build the darknet53 backbone.
# It returns three feature maps with the following shapes:
# 52,52,256
# 26,26,512
# 13,13,1024
# ---------------------------------------------------#
self.backbone = darknet53()
if pretrained: # load pretrained weights; darknet53 is a classification network
self.backbone.load_state_dict(torch.load("model_data/darknet53_backbone_weights.pth"))
# ---------------------------------------------------#
# out_filters : [64, 128, 256, 512, 1024]
# ---------------------------------------------------#
out_filters = self.backbone.layers_out_filters
# ------------------------------------------------------------------------#
# Compute the number of output channels of each yolo head. For the VOC dataset:
# final_out_filter0 = final_out_filter1 = final_out_filter2 = 75
# ------------------------------------------------------------------------# len(anchors_mask[0]) is 3
self.last_layer0 = make_last_layers([512, 1024], out_filters[-1], len(anchors_mask[0]) * (num_classes + 5))
self.last_layer1_conv = conv2d(512, 256, 1)
self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.last_layer1 = make_last_layers([256, 512], out_filters[-2] + 256, len(anchors_mask[1]) * (num_classes + 5))
self.last_layer2_conv = conv2d(256, 128, 1)
self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.last_layer2 = make_last_layers([128, 256], out_filters[-3] + 128, len(anchors_mask[2]) * (num_classes + 5))
def forward(self, x):
# ---------------------------------------------------#
# Obtain the three feature maps; their shapes are
# 52,52,256; 26,26,512; 13,13,1024
# ---------------------------------------------------#
x2, x1, x0 = self.backbone(x)
# ---------------------------------------------------#
# first feature level
# out0 = (batch_size,255,13,13)
# ---------------------------------------------------#
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
out0_branch = self.last_layer0[:5](x0)
out0 = self.last_layer0[5:](out0_branch) # 8, 75, 13, 13 (the leading 2 seen initially was test data, not real data)
# 13,13,512 -> 13,13,256 -> 26,26,256
x1_in = self.last_layer1_conv(out0_branch) # branch used for feature fusion
x1_in = self.last_layer1_upsample(x1_in)
# 26,26,256 + 26,26,512 -> 26,26,768
x1_in = torch.cat([x1_in, x1], 1)
# ---------------------------------------------------#
# second feature level
# out1 = (batch_size,255,26,26)
# ---------------------------------------------------#
# 26,26,768 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
out1_branch = self.last_layer1[:5](x1_in)
out1 = self.last_layer1[5:](out1_branch)
# 26,26,256 -> 26,26,128 -> 52,52,128
x2_in = self.last_layer2_conv(out1_branch) # fusion
x2_in = self.last_layer2_upsample(x2_in)
# 52,52,128 + 52,52,256 -> 52,52,384
x2_in = torch.cat([x2_in, x2], 1)
# ---------------------------------------------------#
# third feature level
# out2 = (batch_size,255,52,52)
# ---------------------------------------------------#
# 52,52,384 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
out2 = self.last_layer2(x2_in)
return out0, out1, out2
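# Minimal sanity-check sketch (the anchors_mask and the 20-class VOC setting are assumptions used
# only for illustration): each head outputs 3 * (num_classes + 5) channels, i.e. 75 for VOC.
if __name__ == "__main__":
    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    net = YoloBody(anchors_mask, num_classes=20, pretrained=False)
    out0, out1, out2 = net(torch.rand(1, 3, 416, 416))
    print(out0.shape, out1.shape, out2.shape)  # (1,75,13,13), (1,75,26,26), (1,75,52,52)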

nets/yolo_training.py Normal file

@@ -0,0 +1,488 @@
import math
from functools import partial
import numpy as np
import torch
import torch.nn as nn
class YOLOLoss(nn.Module):
def __init__(self, anchors, num_classes, input_shape, cuda, anchors_mask=[[6, 7, 8], [3, 4, 5], [0, 1, 2]]):
super(YOLOLoss, self).__init__()
# -----------------------------------------------------------#
# The anchors for the 13x13 feature map are [116,90],[156,198],[373,326]
# The anchors for the 26x26 feature map are [30,61],[62,45],[59,119]
# The anchors for the 52x52 feature map are [10,13],[16,30],[33,23]
# -----------------------------------------------------------#
self.anchors = anchors
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.input_shape = input_shape
self.anchors_mask = anchors_mask
self.giou = True
self.balance = [0.4, 1.0, 4]
self.box_ratio = 0.05
self.obj_ratio = 5 * (input_shape[0] * input_shape[1]) / (416 ** 2)
self.cls_ratio = 1 * (num_classes / 80)
self.ignore_threshold = 0.5
self.cuda = cuda
def clip_by_tensor(self, t, t_min, t_max):
t = t.float()
result = (t >= t_min).float() * t + (t < t_min).float() * t_min # either t or t_min
result = (result <= t_max).float() * result + (result > t_max).float() * t_max
return result
def MSELoss(self, pred, target):
return torch.pow(pred - target, 2)
def BCELoss(self, pred, target):
epsilon = 1e-7
pred = self.clip_by_tensor(pred, epsilon, 1.0 - epsilon) # keep the tensor between epsilon and 1.0 - epsilon
output = - target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred)
return output
def box_giou(self, b1, b2):
"""
Inputs
----------
b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
Returns
-------
giou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1)
"""
# ----------------------------------------------------#
# compute the top-left and bottom-right corners of the predicted boxes
# ----------------------------------------------------#
b1_xy = b1[..., :2]
b1_wh = b1[..., 2:4]
b1_wh_half = b1_wh / 2.
b1_mins = b1_xy - b1_wh_half
b1_maxes = b1_xy + b1_wh_half
# ----------------------------------------------------#
# compute the top-left and bottom-right corners of the ground-truth boxes
# ----------------------------------------------------#
b2_xy = b2[..., :2]
b2_wh = b2[..., 2:4]
b2_wh_half = b2_wh / 2.
b2_mins = b2_xy - b2_wh_half
b2_maxes = b2_xy + b2_wh_half
# ----------------------------------------------------#
# compute the IoU of every predicted / ground-truth box pair
# ----------------------------------------------------#
intersect_mins = torch.max(b1_mins, b2_mins)
intersect_maxes = torch.min(b1_maxes, b2_maxes)
intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes))
intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
b1_area = b1_wh[..., 0] * b1_wh[..., 1]
b2_area = b2_wh[..., 0] * b2_wh[..., 1]
union_area = b1_area + b2_area - intersect_area
iou = intersect_area / union_area
# ----------------------------------------------------#
# find the top-left and bottom-right corners of the smallest box enclosing both
# ----------------------------------------------------#
enclose_mins = torch.min(b1_mins, b2_mins)
enclose_maxes = torch.max(b1_maxes, b2_maxes)
enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes))
# ----------------------------------------------------#
# compute the area of the enclosing box
# ----------------------------------------------------#
enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
giou = iou - (enclose_area - union_area) / enclose_area
return giou
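# Worked example (numbers are assumed, for illustration only): with b1 = (0.5, 0.5, 0.2, 0.2) and
# b2 = (0.6, 0.5, 0.2, 0.2) in xywh form, the intersection is 0.1 * 0.2 = 0.02, the union is
# 0.04 + 0.04 - 0.02 = 0.06 (IoU = 1/3), and the enclosing box has area 0.3 * 0.2 = 0.06,
# so giou = 1/3 - (0.06 - 0.06) / 0.06 = 1/3.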
def forward(self, l, input, targets=None):
# ----------------------------------------------------#
# l is the index of the feature level currently being processed.
# input has shape bs, 3*(5+num_classes), 13, 13
#                 bs, 3*(5+num_classes), 26, 26
#                 bs, 3*(5+num_classes), 52, 52
# targets holds the ground-truth boxes.
# ----------------------------------------------------#
# --------------------------------#
# Get the number of images and the feature-map height and width,
# e.g. 13 and 13.
# --------------------------------#
bs = input.size(0)
in_h = input.size(2)
in_w = input.size(3)
# -----------------------------------------------------------------------#
# Compute the stride:
# how many pixels of the original image each feature point corresponds to.
# For a 13x13 feature map, one feature point corresponds to 32 pixels of the original image;
# for 26x26 it is 16 pixels, and for 52x52 it is 8 pixels.
# stride_h = stride_w = 32, 16 or 8.
# Here stride_h and stride_w are both 32.
# -----------------------------------------------------------------------#
stride_h = self.input_shape[0] / in_h
stride_w = self.input_shape[1] / in_w
# -------------------------------------------------#
# Rescale the anchors; the resulting scaled_anchors are relative to the feature map.
# -------------------------------------------------#
scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors] # scale the anchors to the output feature-map resolution
# -----------------------------------------------#
# There are three inputs in total; their shapes are
# bs, 3*(5+num_classes), 13, 13 => batch_size, 3, 13, 13, 5 + num_classes
# batch_size, 3, 26, 26, 5 + num_classes
# batch_size, 3, 52, 52, 5 + num_classes
# -----------------------------------------------#
prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(
0, 1, 3, 4, 2).contiguous() # batch_size, 3 anchors, h, w, 25 outputs per anchor
# -----------------------------------------------#
# Adjustment parameters for the anchor centre position
# -----------------------------------------------#
x = torch.sigmoid(prediction[..., 0]) # prediction[..., 0] has shape 8, 3, 13, 13; tx
y = torch.sigmoid(prediction[..., 1]) # ty
# -----------------------------------------------#
# Adjustment parameters for the anchor width and height
# -----------------------------------------------#
w = prediction[..., 2] # tw
h = prediction[..., 3] # th
# -----------------------------------------------#
# Objectness confidence: is there an object
# -----------------------------------------------#
conf = torch.sigmoid(prediction[..., 4]) # prediction[..., 4]: objectness
# -----------------------------------------------#
# Class confidence
# -----------------------------------------------#
pred_cls = torch.sigmoid(prediction[..., 5:])
# -----------------------------------------------#
# Build the targets the network should predict. y_true is the rebuilt ground-truth label of shape 8, 3, 13, 13, 25. noobj_mask is 0 where there is an object and 1 elsewhere. box_loss_scale records the box area.
# -----------------------------------------------#
y_true, noobj_mask, box_loss_scale = self.get_target(l, targets, scaled_anchors, in_h, in_w)
# y_true converts the ground-truth boxes into the same format as the network output, e.g. coordinates at the output feature resolution and the class of the cell containing the ground-truth box.
# ---------------------------------------------------------------#
# Decode the predictions and measure how well they overlap the ground truth.
# Predictions that overlap too much are ignored, because those feature points already predict fairly accurately
# and are unsuitable as negative samples. # l is the index of the multi-scale feature map; pred_boxes are the decoded network predictions
# ----------------------------------------------------------------#
noobj_mask, pred_boxes = self.get_ignore(l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask)
if self.cuda:
y_true = y_true.type_as(x)
noobj_mask = noobj_mask.type_as(x)
box_loss_scale = box_loss_scale.type_as(x)
# --------------------------------------------------------------------------#
# box_loss_scale is the product of the ground-truth width and height; both lie in 0-1, so the product does too.
# 2 - (width * height) means larger ground-truth boxes get smaller weights and small boxes get larger weights.
# --------------------------------------------------------------------------#
box_loss_scale = 2 - box_loss_scale
loss = 0
obj_mask = y_true[..., 4] == 1
n = torch.sum(obj_mask)
if n != 0:
if self.giou:
# ---------------------------------------------------------------#
# Compute the GIoU between the predictions and the ground truth
# ----------------------------------------------------------------#
giou = self.box_giou(pred_boxes, y_true[..., :4]).type_as(x)
loss_loc = torch.mean((1 - giou)[obj_mask]) # GIoU is used as the localisation loss here, not the MSE from the paper
else:
# -----------------------------------------------------------#
# Loss for the centre offsets; BCELoss works a bit better here
# -----------------------------------------------------------#
loss_x = torch.mean(self.BCELoss(x[obj_mask], y_true[..., 0][obj_mask]) * box_loss_scale[obj_mask])
loss_y = torch.mean(self.BCELoss(y[obj_mask], y_true[..., 1][obj_mask]) * box_loss_scale[obj_mask])
# -----------------------------------------------------------#
# Loss for the width/height adjustments
# -----------------------------------------------------------#
loss_w = torch.mean(self.MSELoss(w[obj_mask], y_true[..., 2][obj_mask]) * box_loss_scale[obj_mask])
loss_h = torch.mean(self.MSELoss(h[obj_mask], y_true[..., 3][obj_mask]) * box_loss_scale[obj_mask])
loss_loc = (loss_x + loss_y + loss_h + loss_w) * 0.1
# pred_cls[obj_mask]: number of boxes containing objects * 20 class scores (20 classes)
loss_cls = torch.mean(self.BCELoss(pred_cls[obj_mask], y_true[..., 5:][obj_mask])) # classification loss for the objects
loss += loss_loc * self.box_ratio + loss_cls * self.cls_ratio
loss_conf = torch.mean(self.BCELoss(conf, obj_mask.type_as(conf))[noobj_mask.bool() | obj_mask]) # the objectness loss ignores predictions that overlap well but are not the best match
loss += loss_conf * self.balance[l] * self.obj_ratio # self.balance[l]: each layer has a different weight; [0.4, 1.0, 4] gives small objects a larger loss weight
# if n != 0:
# print(loss_loc * self.box_ratio, loss_cls * self.cls_ratio, loss_conf * self.balance[l] * self.obj_ratio)
return loss
def calculate_iou(self, _box_a, _box_b):
# -----------------------------------------------------------#
# Compute the top-left and bottom-right corners of the ground-truth boxes, treating (0, 0) as the centre
# -----------------------------------------------------------#
b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2
b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2
# -----------------------------------------------------------#
# Compute the top-left and bottom-right corners of the anchor boxes
# -----------------------------------------------------------#
b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2
b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2
# -----------------------------------------------------------#
# Convert both ground-truth and anchor boxes to top-left / bottom-right form
# -----------------------------------------------------------#
box_a = torch.zeros_like(_box_a)
box_b = torch.zeros_like(_box_b)
box_a[:, 0], box_a[:, 1], box_a[:, 2], box_a[:, 3] = b1_x1, b1_y1, b1_x2, b1_y2
box_b[:, 0], box_b[:, 1], box_b[:, 2], box_b[:, 3] = b2_x1, b2_y1, b2_x2, b2_y2
# ----------------------------------------------------------- #
# A is the number of ground-truth boxes, B the number of anchor boxes
# ----------------------------------------------------------- #
A = box_a.size(0)
B = box_b.size(0)
# ----------------------------------------------------------- #
# Compute the intersection area. box_a holds ground-truth corners, box_b anchor corners.
# box_a[:, 2:].unsqueeze(1).expand(A, B, 2) expands (A, 1, 2) to (A, B, 2): each ground-truth box repeated B times.
# box_b[:, 2:].unsqueeze(0).expand(A, B, 2) expands (1, B, 2) to (A, B, 2): the B anchors repeated for every ground-truth box.
# ----------------------------------------------------------- #
# In effect every ground-truth box is compared against every anchor.
max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),  # element-wise minimum of the bottom-right corners
box_b[:, 2:].unsqueeze(0).expand(A, B, 2))  # shape (A, B, 2)
min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),  # element-wise maximum of the top-left corners
box_b[:, :2].unsqueeze(0).expand(A, B, 2))  # shape (A, B, 2)
inter = torch.clamp((max_xy - min_xy),  # clamping at 0 handles non-overlapping pairs: their width/height would be negative, so their intersection becomes 0
min=0)  # min=0 with no upper bound; the subtraction yields the intersection width and height
inter = inter[:, :, 0] * inter[:, :, 1]  # intersection area of every ground-truth box with every anchor
# ----------------------------------------------------------- #
# Compute the individual areas of the ground-truth and anchor boxes
# ----------------------------------------------------------- #
area_a = ((box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(
inter)  # [A, B]: each ground-truth area repeated B times
area_b = ((box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(
inter)  # [A, B]: each anchor area repeated A times
# ----------------------------------------------------------- #
# IoU = intersection / union
# ----------------------------------------------------------- #
union = area_a + area_b - inter
return inter / union  # [A, B]
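# Worked example (illustrative): a ground-truth box (cx, cy, w, h) = (0, 0, 4, 4) and an anchor
# (0, 0, 2, 2) give corners (-2, -2, 2, 2) and (-1, -1, 1, 1); the intersection is 2 * 2 = 4, the union
# is 16 + 4 - 4 = 16, so IoU = 0.25. Non-overlapping pairs end up with intersection 0 because of the
# clamp above, hence IoU = 0.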
def get_target(self, l, targets, anchors, in_h, in_w):
# -----------------------------------------------------#
# 计算一共有多少张图片
# -----------------------------------------------------#
bs = len(targets)
# -----------------------------------------------------#
# 对每一个grid cell都需要标记。用于选取哪些先验框不包含物体
# -----------------------------------------------------#
noobj_mask = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad=False)
# -----------------------------------------------------#
# 让网络更加去关注小目标
# -----------------------------------------------------#
box_loss_scale = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad=False)
# -----------------------------------------------------#
# batch_size, 3, 13, 13, 5 + num_classes
# -----------------------------------------------------#
y_true = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad=False)
for b in range(bs): # 每张图片单独计算
if len(targets[b]) == 0: # targets是真实框
continue
batch_target = torch.zeros_like(targets[b]) # 把0~1之间的targets转换到 特征图大小的 targets
# -------------------------------------------------------#
# 计算出正样本在特征层上的中心点 # box第01维记录中心点 box第23维记录宽高 # 这里不知道为何这样做,但结果一样的
# -------------------------------------------------------#
batch_target[:, [0, 2]] = targets[b][:, [0, 2]] * in_w # 从归一化的box中反解出在 13*13 分辨率下的大小 两个 x 坐标
batch_target[:, [1, 3]] = targets[b][:, [1, 3]] * in_h
batch_target[:, 4] = targets[b][:, 4]
batch_target = batch_target.cpu() # 因为是从targets放在cuda上中复制过来的所以需要执行一次cpu()
# -------------------------------------------------------#
# 将真实框转换一个形式 相当于都放到0, 0, w, h 进行比较
# num_true_box, 4 # 把23 维也就是宽和高取出前面拼两个0
# -------------------------------------------------------#
gt_box = torch.FloatTensor(torch.cat((torch.zeros((batch_target.size(0), 2)), batch_target[:, 2:4]), 1))
# -------------------------------------------------------#
# 将先验框转换一个形式
# 9, 4 在先验框大小前面加了两个0
# -------------------------------------------------------#
anchor_shapes = torch.FloatTensor(
torch.cat((torch.zeros((len(anchors), 2)), torch.FloatTensor(anchors)), 1))
# -------------------------------------------------------#
# Compute the IoU of every ground-truth box with every anchor.
# self.calculate_iou(gt_box, anchor_shapes) has shape [num_true_box, 9].
# best_ns holds, for each ground-truth box, the index of the anchor with the highest IoU
# (argmax only returns the index, not the IoU value itself).
# -------------------------------------------------------#
best_ns = torch.argmax(self.calculate_iou(gt_box, anchor_shapes), dim=-1)
# Iterate over the best anchor index of every ground-truth box and map it to one of the
# three anchors used by the current feature layer, if it belongs to this layer at all.
for t, best_n in enumerate(best_ns):  # l is the index of the output feature layer
if best_n not in self.anchors_mask[l]:  # anchors_mask specifies which 3 anchors the current feature map uses
continue
# ----------------------------------------#
# 判断这个先验框是当前特征点的哪一个先验框 l是第几号最后的输出特征图
# ----------------------------------------#
k = self.anchors_mask[l].index(best_n) # 使用当前层对应anchors的第几号anchor
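# Illustrative example (assumes the default anchors_mask [[6, 7, 8], [3, 4, 5], [0, 1, 2]]): a ground-truth
# box whose best match is global anchor 7 is only assigned on the layer whose mask contains 7 (l = 0,
# the 13x13 map), where k = 1; on the other two layers the `continue` above skips it.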
# ----------------------------------------#
# Find which grid cell the ground-truth centre falls into (coordinates are already at the
# 13x13 feature resolution); floor gives the cell index, i.e. the cell's top-left corner
# ----------------------------------------#
i = torch.floor(batch_target[t, 0]).long() # t 表示当前是第几个真实框
j = torch.floor(batch_target[t, 1]).long()
# ----------------------------------------#
# 取出真实框的种类
# ----------------------------------------#
c = batch_target[t, 4].long()
# ----------------------------------------#
# noobj_mask代表无目标的特征点 b是几号batchk是几号anchor
# ----------------------------------------#
noobj_mask[b, k, j, i] = 0
# ----------------------------------------#
# Fill in the ground-truth regression targets for this anchor and cell
# ----------------------------------------#
if not self.giou:  # per the author's note, this branch is not taken (self.giou is True here)
# ----------------------------------------#
# tx, ty are the true centre offsets inside the cell; tw, th are the log ratios
# between the ground-truth size and the matched anchor size
# ----------------------------------------#
y_true[b, k, j, i, 0] = batch_target[t, 0] - i.float()
y_true[b, k, j, i, 1] = batch_target[t, 1] - j.float()
y_true[b, k, j, i, 2] = math.log(batch_target[t, 2] / anchors[best_n][0])
y_true[b, k, j, i, 3] = math.log(batch_target[t, 3] / anchors[best_n][1])
y_true[b, k, j, i, 4] = 1
y_true[b, k, j, i, c + 5] = 1  # mark the ground-truth class
else:
# ----------------------------------------#
# In GIoU mode y_true keeps the raw box (cx, cy, w, h) at feature-map scale;
# t indexes the t-th ground-truth box of the current image
# ----------------------------------------#
y_true[b, k, j, i, 0] = batch_target[t, 0]
y_true[b, k, j, i, 1] = batch_target[t, 1]
y_true[b, k, j, i, 2] = batch_target[t, 2]
y_true[b, k, j, i, 3] = batch_target[t, 3]
y_true[b, k, j, i, 4] = 1  # an object is present
y_true[b, k, j, i, c + 5] = 1  # c is the class index
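# Worked example (illustrative, non-GIoU encoding): a ground-truth centre at (6.4, 6.7) on a 13x13
# map with true size (4.0, 5.5) and matched anchor size (3.6, 4.9) gives i, j = 6, 6,
# tx, ty = 0.4, 0.7 and tw, th = log(4.0/3.6), log(5.5/4.9).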
# ----------------------------------------#
# Record the normalized area of the ground-truth box; it is used in the forward pass to
# weight the localization loss so that small boxes count more than large ones
# ----------------------------------------#
box_loss_scale[b, k, j, i] = batch_target[t, 2] * batch_target[t, 3] / in_w / in_h  # w*h at feature scale, normalized back to 0-1
return y_true, noobj_mask, box_loss_scale
def get_ignore(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask):
# -----------------------------------------------------#
# 计算一共有多少张图片
# -----------------------------------------------------#
bs = len(targets)
# -----------------------------------------------------#
# Generate the grid of cell top-left corners (the anchor centres before the offset).
# torch.linspace(0, in_w - 1, in_w) produces in_w evenly spaced values from 0 to in_w - 1;
# .repeat(in_h, 1) tiles that row in_h times.
# -----------------------------------------------------#
grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type_as(x)  # reading the repeats right to left keeps the shape bookkeeping clear: the last two dims tile the row/column pattern
grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type_as(x)
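# Illustrative: for in_w = in_h = 3, torch.linspace(0, 2, 3).repeat(3, 1) gives
# [[0, 1, 2], [0, 1, 2], [0, 1, 2]] (the column index of every cell), while the transposed version
# used for grid_y gives the row index. The outer repeat then copies this grid bs * 3 times so it
# broadcasts over every anchor of every image.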
# Generate per-cell anchor widths and heights
scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]]  # pick the 3 anchors assigned to this feature layer
anchor_w = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([0])).type_as(x)  # column 0 along dim 1: the anchor widths
anchor_h = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([1])).type_as(x)  # column 1: the anchor heights
anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)  # each of the 3 anchor widths is tiled over its 13x13 map, then repeated for every image in the batch
anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
# -------------------------------------------------------#
# Decode the adjusted box centres and sizes; x here is the sigmoid of the raw centre-x
# prediction (attribute 0 of the network output), and likewise for y
# -------------------------------------------------------#
pred_boxes_x = torch.unsqueeze(x + grid_x, -1)
pred_boxes_y = torch.unsqueeze(y + grid_y, -1)
pred_boxes_w = torch.unsqueeze(torch.exp(w) * anchor_w, -1)
pred_boxes_h = torch.unsqueeze(torch.exp(h) * anchor_h, -1)
pred_boxes = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim=-1)
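# Note (added for clarity): this is the standard YOLOv3 decode at feature-map scale,
#   bx = sigmoid(tx) + cx,   by = sigmoid(ty) + cy,   bw = pw * exp(tw),   bh = ph * exp(th),
# where (cx, cy) comes from grid_x/grid_y and (pw, ph) from anchor_w/anchor_h; per the comment above,
# x and y are already sigmoid outputs when they reach this function.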
for b in range(bs): # 对一个 batch 里的数据 一张张图像 分别进行操作
# -------------------------------------------------------#
# 将预测结果转换一个形式
# pred_boxes_for_ignore num_anchors, 4
# -------------------------------------------------------#
pred_boxes_for_ignore = pred_boxes[b].view(-1, 4)
# -------------------------------------------------------#
# 计算真实框,并把真实框转换成相对于特征层的大小
# gt_box num_true_box, 4
# -------------------------------------------------------#
if len(targets[b]) > 0: # 如果有目标,进行下面的操作。否则 跳到下一张图片。
batch_target = torch.zeros_like(targets[b])
# -------------------------------------------------------#
# Scale the ground-truth boxes to feature-map coordinates. They are already in
# (cx, cy, w, h) form (the conversion happens in the dataloader), so only the scale changes here.
# -------------------------------------------------------#
batch_target[:, [0, 2]] = targets[b][:, [0, 2]] * in_w
batch_target[:, [1, 3]] = targets[b][:, [1, 3]] * in_h
batch_target = batch_target[:, :4].type_as(x)
# -------------------------------------------------------#
# Compute the IoU between every ground-truth box and every decoded prediction
# anch_ious: num_true_box, num_anchors
# -------------------------------------------------------#
anch_ious = self.calculate_iou(batch_target, pred_boxes_for_ignore)  # IoU of ground truth vs. predictions
# -------------------------------------------------------#
# For each predicted box, take its maximum IoU over all ground-truth boxes
# anch_ious_max: num_anchors
# -------------------------------------------------------#
anch_ious_max, _ = torch.max(anch_ious, dim=0)  # max over the ground-truth dimension, one value per predicted box
anch_ious_max = anch_ious_max.view(pred_boxes[b].size()[:3])
noobj_mask[b][anch_ious_max > self.ignore_threshold] = 0  # predictions that overlap any ground truth above the threshold are not used as negatives, even if they are not the best match
return noobj_mask, pred_boxes
def weights_init(net, init_type='normal', init_gain=0.02):
def init_func(m):
classname = m.__class__.__name__
if hasattr(m, 'weight') and classname.find('Conv') != -1:
if init_type == 'normal':
torch.nn.init.normal_(m.weight.data, 0.0, init_gain)
elif init_type == 'xavier':
torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain)
elif init_type == 'kaiming':
torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
elif init_type == 'orthogonal':
torch.nn.init.orthogonal_(m.weight.data, gain=init_gain)
else:
raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
elif classname.find('BatchNorm2d') != -1:
torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
torch.nn.init.constant_(m.bias.data, 0.0)
print('initialize network with %s type' % init_type)
net.apply(init_func)
def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio=0.05, warmup_lr_ratio=0.1,
no_aug_iter_ratio=0.05, step_num=10):
def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters):
if iters <= warmup_total_iters:
# lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start
lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2) + warmup_lr_start
elif iters >= total_iters - no_aug_iter:
lr = min_lr
else:
lr = min_lr + 0.5 * (lr - min_lr) * (
1.0 + math.cos(
math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters - no_aug_iter))
)
return lr
def step_lr(lr, decay_rate, step_size, iters):
if step_size < 1:
raise ValueError("step_size must be at least 1.")
n = iters // step_size
out_lr = lr * decay_rate ** n
return out_lr
if lr_decay_type == "cos":
warmup_total_iters = min(max(warmup_iters_ratio * total_iters, 1), 3)
warmup_lr_start = max(warmup_lr_ratio * lr, 1e-6)
no_aug_iter = min(max(no_aug_iter_ratio * total_iters, 1), 15)
func = partial(yolox_warm_cos_lr, lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter)
else:
decay_rate = (min_lr / lr) ** (1 / (step_num - 1))
step_size = total_iters / step_num
func = partial(step_lr, lr, decay_rate, step_size)
return func
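# Usage sketch (illustrative; the argument values are made up): the returned callable maps an epoch
# index to a learning rate, e.g.
#   lr_func = get_lr_scheduler("cos", lr=1e-2, min_lr=1e-4, total_iters=100)
#   lr_epoch_50 = lr_func(50)
# set_optimizer_lr below then writes that value into every parameter group of the optimizer.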
def set_optimizer_lr(optimizer, lr_scheduler_func, epoch):
lr = lr_scheduler_func(epoch)
for param_group in optimizer.param_groups:
param_group['lr'] = lr

135
patch_config.py Normal file
View File

@ -0,0 +1,135 @@
from torch import optim
class BaseConfig(object):
"""
Default parameters for all config files.
"""
def __init__(self):
"""
Set the defaults.
"""
# self.img_dir = "inria/Train/pos"
# self.lab_dir = "inria/Train/pos/yolo-labels"
self.img_dir = "cctsdb/Train/pos"
self.lab_dir = "cctsdb/Train/labels"
self.cfgfile = "cfg/yolo.cfg"
self.weightfile = "weights/yolo.weights"
self.printfile = "non_printability/30values.txt"
self.patch_size = 300
self.start_learning_rate = 0.03
self.patch_name = 'base'
self.scheduler_factory = lambda x: optim.lr_scheduler.ReduceLROnPlateau(x, 'min', patience=50)
self.max_tv = 0
self.batch_size = 20
self.loss_target = lambda obj, cls: obj * cls
class Experiment1(BaseConfig):
"""
Model that uses a maximum total variation, tv cannot go below this point.
"""
def __init__(self):
"""
Change stuff...
"""
super().__init__()
self.patch_name = 'Experiment1'
self.max_tv = 0.165
class Experiment2HighRes(Experiment1):
"""
Higher res
"""
def __init__(self):
"""
Change stuff...
"""
super().__init__()
self.max_tv = 0.165
self.patch_size = 400
self.patch_name = 'Exp2HighRes'
class Experiment3LowRes(Experiment1):
"""
Lower res
"""
def __init__(self):
"""
Change stuff...
"""
super().__init__()
self.max_tv = 0.165
self.patch_size = 100
self.patch_name = "Exp3LowRes"
class Experiment4ClassOnly(Experiment1):
"""
Only minimise class score.
"""
def __init__(self):
"""
Change stuff...
"""
super().__init__()
self.patch_name = 'Experiment4ClassOnly'
self.loss_target = lambda obj, cls: cls
class Experiment1Desktop(Experiment1):
"""
"""
def __init__(self):
"""
Change batch size.
"""
super().__init__()
self.batch_size = 8
self.patch_size = 400
class ReproducePaperObj(BaseConfig):
"""
Reproduce the results from the paper: Generate a patch that minimises object score.
"""
def __init__(self):
super().__init__()
self.batch_size = 8
self.patch_size = 300
self.patch_name = 'ObjectOnlyPaper'
self.max_tv = 0.165
self.loss_target = lambda obj, cls: obj
patch_configs = {
"base": BaseConfig,
"exp1": Experiment1,
"exp1_des": Experiment1Desktop,
"exp2_high_res": Experiment2HighRes,
"exp3_low_res": Experiment3LowRes,
"exp4_class_only": Experiment4ClassOnly,
"paper_obj": ReproducePaperObj
}
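# Usage sketch (illustrative): a config is selected by key and instantiated, as train_patch.py does with
#   config = patch_configs['paper_obj']()
# after which config.start_learning_rate, config.scheduler_factory and config.loss_target drive training.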

181
predict.py Normal file
View File

@ -0,0 +1,181 @@
# -----------------------------------------------------------------------#
# predict.py将单张图片预测、摄像头检测、FPS测试和目录遍历检测等功能
# 整合到了一个py文件中通过指定mode进行模式的修改。
# -----------------------------------------------------------------------#
import time
import cv2
import numpy as np
from PIL import Image
from yolo import YOLO
if __name__ == "__main__":
yolo = YOLO()
# ----------------------------------------------------------------------------------------------------------#
# mode用于指定测试的模式
# 'predict' 表示单张图片预测,如果想对预测过程进行修改,如保存图片,截取对象等,可以先看下方详细的注释
# 'video' 表示视频检测,可调用摄像头或者视频进行检测,详情查看下方注释。
# 'fps' 表示测试fps使用的图片是img里面的street.jpg详情查看下方注释。
# 'dir_predict' 表示遍历文件夹进行检测并保存。默认遍历img文件夹保存img_out文件夹详情查看下方注释。
# 'heatmap' 表示进行预测结果的热力图可视化,详情查看下方注释。
# 'export_onnx' 表示将模型导出为onnx需要pytorch1.7.1以上。
# ----------------------------------------------------------------------------------------------------------#
mode = "predict"
# -------------------------------------------------------------------------#
# crop 指定了是否在单张图片预测后对目标进行截取
# count 指定了是否进行目标的计数
# crop、count仅在mode='predict'时有效
# -------------------------------------------------------------------------#
crop = False
count = False
# ----------------------------------------------------------------------------------------------------------#
# video_path 用于指定视频的路径当video_path=0时表示检测摄像头
# 想要检测视频则设置如video_path = "xxx.mp4"即可代表读取出根目录下的xxx.mp4文件。
# video_save_path 表示视频保存的路径当video_save_path=""时表示不保存
# 想要保存视频则设置如video_save_path = "yyy.mp4"即可代表保存为根目录下的yyy.mp4文件。
# video_fps 用于保存的视频的fps
#
# video_path、video_save_path和video_fps仅在mode='video'时有效
# 保存视频时需要ctrl+c退出或者运行到最后一帧才会完成完整的保存步骤。
# ----------------------------------------------------------------------------------------------------------#
video_path = 0
video_save_path = ""
video_fps = 25.0
# ----------------------------------------------------------------------------------------------------------#
# test_interval 用于指定测量fps的时候图片检测的次数。理论上test_interval越大fps越准确。
# fps_image_path 用于指定测试的fps图片
#
# test_interval和fps_image_path仅在mode='fps'有效
# ----------------------------------------------------------------------------------------------------------#
test_interval = 100
fps_image_path = "img/street.jpg"
# -------------------------------------------------------------------------#
# dir_origin_path 指定了用于检测的图片的文件夹路径
# dir_save_path 指定了检测完图片的保存路径
#
# dir_origin_path和dir_save_path仅在mode='dir_predict'时有效
# -------------------------------------------------------------------------#
dir_origin_path = "img/"
dir_save_path = "img_out/"
# -------------------------------------------------------------------------#
# heatmap_save_path 热力图的保存路径默认保存在model_data下
#
# heatmap_save_path仅在mode='heatmap'有效
# -------------------------------------------------------------------------#
heatmap_save_path = "model_data/heatmap_vision.png"
# -------------------------------------------------------------------------#
# simplify 使用Simplify onnx
# onnx_save_path 指定了onnx的保存路径
# -------------------------------------------------------------------------#
simplify = True
onnx_save_path = "model_data/models.onnx"
if mode == "predict":
'''
1如果想要进行检测完的图片的保存利用r_image.save("img.jpg")即可保存直接在predict.py里进行修改即可
2如果想要获得预测框的坐标可以进入yolo.detect_image函数在绘图部分读取topleftbottomright这四个值
3如果想要利用预测框截取下目标可以进入yolo.detect_image函数在绘图部分利用获取到的topleftbottomright这四个值
在原图上利用矩阵的方式进行截取
4如果想要在预测图上写额外的字比如检测到的特定目标的数量可以进入yolo.detect_image函数在绘图部分对predicted_class进行判断
比如判断if predicted_class == 'car': 即可判断当前目标是否为车然后记录数量即可利用draw.text即可写字
'''
while True:
img = input('Input image filename:')
# img/street.jpg
# img/street_a3.jpg
try:
image = Image.open(img)
except:
print('Open Error! Try again!')
continue
else:
r_image = yolo.detect_image(image, crop=crop, count=count)
# r_image.show()
r_image.save("diffusion.png")
elif mode == "video":
capture = cv2.VideoCapture(video_path)
if video_save_path != "":
fourcc = cv2.VideoWriter_fourcc(*'XVID')
size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
out = cv2.VideoWriter(video_save_path, fourcc, video_fps, size)
ref, frame = capture.read()
if not ref:
raise ValueError("未能正确读取摄像头(视频),请注意是否正确安装摄像头(是否正确填写视频路径)。")
fps = 0.0
while (True):
t1 = time.time()
# 读取某一帧
ref, frame = capture.read()
if not ref:
break
# 格式转变BGRtoRGB
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# 转变成Image
frame = Image.fromarray(np.uint8(frame))
# 进行检测
frame = np.array(yolo.detect_image(frame))
# RGBtoBGR满足opencv显示格式
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
fps = (fps + (1. / (time.time() - t1))) / 2
print("fps= %.2f" % (fps))
frame = cv2.putText(frame, "fps= %.2f" % (fps), (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow("video", frame)
c = cv2.waitKey(1) & 0xff
if video_save_path != "":
out.write(frame)
if c == 27:
capture.release()
break
print("Video Detection Done!")
capture.release()
if video_save_path != "":
print("Save processed video to the path :" + video_save_path)
out.release()
cv2.destroyAllWindows()
elif mode == "fps":
img = Image.open(fps_image_path)
tact_time = yolo.get_FPS(img, test_interval)
print(str(tact_time) + ' seconds, ' + str(1 / tact_time) + 'FPS, @batch_size 1')
elif mode == "dir_predict":
import os
from tqdm import tqdm
img_names = os.listdir(dir_origin_path)
for img_name in tqdm(img_names):
if img_name.lower().endswith(
('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
image_path = os.path.join(dir_origin_path, img_name)
image = Image.open(image_path)
r_image = yolo.detect_image(image)
if not os.path.exists(dir_save_path):
os.makedirs(dir_save_path)
r_image.save(os.path.join(dir_save_path, img_name.replace(".jpg", ".png")), quality=95, subsampling=0)
elif mode == "heatmap":
while True:
img = input('Input image filename:')
try:
image = Image.open(img)
except:
print('Open Error! Try again!')
continue
else:
yolo.detect_heatmap(image, heatmap_save_path)
elif mode == "export_onnx":
yolo.convert_to_onnx(simplify, onnx_save_path)
else:
raise AssertionError(
"Please specify the correct mode: 'predict', 'video', 'fps', 'heatmap', 'export_onnx', 'dir_predict'.")

109
predict_with_windows.py Normal file
View File

@ -0,0 +1,109 @@
import time
import pyautogui
import cv2
import numpy as np
from PIL import Image
from yolo import YOLO
if __name__ == "__main__":
yolo = YOLO()
# ----------------------------------------------------------------------------------------------------------#
# mode用于指定测试的模式
# 'predict' 表示单张图片预测,如果想对预测过程进行修改,如保存图片,截取对象等,可以先看下方详细的注释
# 'video' 表示视频检测,可调用摄像头或者视频进行检测,详情查看下方注释。
# 'fps' 表示测试fps使用的图片是img里面的street.jpg详情查看下方注释。
# 'dir_predict' 表示遍历文件夹进行检测并保存。默认遍历img文件夹保存img_out文件夹详情查看下方注释。
# 'heatmap' 表示进行预测结果的热力图可视化,详情查看下方注释。
# 'export_onnx' 表示将模型导出为onnx需要pytorch1.7.1以上。
# ----------------------------------------------------------------------------------------------------------#
mode = "predict"
# -------------------------------------------------------------------------#
# crop 指定了是否在单张图片预测后对目标进行截取
# count 指定了是否进行目标的计数
# crop、count仅在mode='predict'时有效
# -------------------------------------------------------------------------#
crop = False
count = False
# ----------------------------------------------------------------------------------------------------------#
# video_path 用于指定视频的路径当video_path=0时表示检测摄像头
# 想要检测视频则设置如video_path = "xxx.mp4"即可代表读取出根目录下的xxx.mp4文件。
# video_save_path 表示视频保存的路径当video_save_path=""时表示不保存
# 想要保存视频则设置如video_save_path = "yyy.mp4"即可代表保存为根目录下的yyy.mp4文件。
# video_fps 用于保存的视频的fps
#
# video_path、video_save_path和video_fps仅在mode='video'时有效
# 保存视频时需要ctrl+c退出或者运行到最后一帧才会完成完整的保存步骤。
# ----------------------------------------------------------------------------------------------------------#
video_path = 0
video_save_path = ""
video_fps = 25.0
# ----------------------------------------------------------------------------------------------------------#
# test_interval 用于指定测量fps的时候图片检测的次数。理论上test_interval越大fps越准确。
# fps_image_path 用于指定测试的fps图片
#
# test_interval和fps_image_path仅在mode='fps'有效
# ----------------------------------------------------------------------------------------------------------#
test_interval = 100
fps_image_path = "img/street.jpg"
# -------------------------------------------------------------------------#
# dir_origin_path 指定了用于检测的图片的文件夹路径
# dir_save_path 指定了检测完图片的保存路径
#
# dir_origin_path和dir_save_path仅在mode='dir_predict'时有效
# -------------------------------------------------------------------------#
dir_origin_path = "img/"
dir_save_path = "img_out/"
# -------------------------------------------------------------------------#
# heatmap_save_path 热力图的保存路径默认保存在model_data下
#
# heatmap_save_path仅在mode='heatmap'有效
# -------------------------------------------------------------------------#
heatmap_save_path = "model_data/heatmap_vision.png"
# -------------------------------------------------------------------------#
# simplify 使用Simplify onnx
# onnx_save_path 指定了onnx的保存路径
# -------------------------------------------------------------------------#
simplify = True
onnx_save_path = "model_data/models.onnx"
if mode == "predict":
'''
1如果想要进行检测完的图片的保存利用r_image.save("img.jpg")即可保存直接在predict.py里进行修改即可
2如果想要获得预测框的坐标可以进入yolo.detect_image函数在绘图部分读取topleftbottomright这四个值
3如果想要利用预测框截取下目标可以进入yolo.detect_image函数在绘图部分利用获取到的topleftbottomright这四个值
在原图上利用矩阵的方式进行截取
4如果想要在预测图上写额外的字比如检测到的特定目标的数量可以进入yolo.detect_image函数在绘图部分对predicted_class进行判断
比如判断if predicted_class == 'car': 即可判断当前目标是否为车然后记录数量即可利用draw.text即可写字
'''
while True:
# img = pyautogui.screenshot(region=[300, 50, 200, 100]) # 分别代表:左上角坐标,宽高
# img = pyautogui.screenshot() # 分别代表:左上角坐标,宽高
# 对获取的图片转换成二维矩阵形式后再将RGB转成BGR
# 因为imshow,默认通道顺序是BGR而pyautogui默认是RGB所以要转换一下不然会有点问题
# img = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR)
# img/street.jpg
# img/street_a3.jpg
try:
time.sleep(0.3)
# image = Image.fromarray(np.asarray(pyautogui.screenshot(region=[1920/2, 300, 1920/2, 1080])))
image = Image.fromarray(np.asarray(pyautogui.screenshot()))
except:
print('Open Error! Try again!')
continue
else:
r_image = yolo.detect_image(image, crop=crop, count=count)
img = cv2.cvtColor(np.asarray(r_image), cv2.COLOR_RGB2BGR)
# img = cv2.resize(img, dsize=(1600, 860)) # (宽度,高度)
img = cv2.resize(img, dsize=(1920, 1080)) # (宽度,高度)
cv2.imshow("screen", img)
# time.sleep(1)
cv2.waitKey(1)
c = cv2.waitKey(1) & 0xff
# print(c)
if c == 113:
break

9
requirements.txt Normal file
View File

@ -0,0 +1,9 @@
scipy==1.2.1
numpy==1.17.0
matplotlib==3.1.2
opencv_python==4.1.2.30
torch==1.2.0
torchvision==0.4.0
tqdm==4.60.0
Pillow==8.2.0
h5py==2.10.0

34
summary.py Normal file
View File

@ -0,0 +1,34 @@
# --------------------------------------------#
# 该部分代码用于看网络结构
# --------------------------------------------#
import torch
# from thop import clever_format, profile
from torchsummary import summary
from nets.yolo import YoloBody
if __name__ == "__main__":
input_shape = [416, 416]
anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
num_classes = 80
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
m = YoloBody(anchors_mask, num_classes)
print(m)
print('-' * 80)
m = m.to(device)
summary(m, (3, input_shape[0], input_shape[1]))
# dummy_input = torch.randn(1, 3, input_shape[0], input_shape[1]).to(device)
# flops, params = profile(m.to(device), (dummy_input,), verbose=False)
# --------------------------------------------------------#
# flops * 2是因为profile没有将卷积作为两个operations
# 有些论文将卷积算乘法、加法两个operations。此时乘2
# 有些论文只考虑乘法的运算次数忽略加法。此时不乘2
# 本代码选择乘2参考YOLOX。
# --------------------------------------------------------#
# flops = flops * 2
# flops, params = clever_format([flops, params], "%.3f")
# print('Total GFLOPS: %s' % (flops))
# print('Total params: %s' % (params))

225
train_patch.py Normal file
View File

@ -0,0 +1,225 @@
"""
Training code for Adversarial patch training
"""
import PIL
from torch.utils.tensorboard import SummaryWriter
# import load_data
from tqdm import tqdm
from load_data import *  # author's note: the star import may cause duplicate-import problems
import gc
import matplotlib.pyplot as plt
from torch import autograd
from torchvision import transforms
import subprocess
import patch_config
import sys
import time
from yolo import YOLO
class PatchTrainer(object):
def __init__(self, mode):
self.config = patch_config.patch_configs[mode]() # 获取对应的配置类
# self.darknet_model = Darknet(self.config.cfgfile) # 加载yolo模型
# self.darknet_model.load_weights(self.config.weightfile) # 默认 YOLOv2 MS COCO weights person编号是0
self.darknet_model = YOLO().net
self.darknet_model = self.darknet_model.eval().cuda() # TODO: Why eval?
self.patch_applier = PatchApplier().cuda() # 对图像应用对抗补丁
self.patch_transformer = PatchTransformer().cuda() # 变换补丁到指定大小并产生抖动
# self.prob_extractor = MaxProbExtractor(0, 80, self.config).cuda() # 提取最大类别概率
self.prob_extractor = MaxProbExtractor(0, 1, self.config).cuda() # 提取最大类别概率
self.nps_calculator = NPSCalculator(self.config.printfile, self.config.patch_size).cuda() # 不可打印分数
self.total_variation = TotalVariation().cuda() # 计算补丁的所有变化程度
self.writer = self.init_tensorboard(mode)
def init_tensorboard(self, name=None):
subprocess.Popen(['tensorboard', '--logdir=runs'])
if name is not None:
time_str = time.strftime("%Y%m%d-%H%M%S")
return SummaryWriter(f'runs/{time_str}_{name}')
else:
return SummaryWriter()
def train(self):
"""
Optimize a patch to generate an adversarial example.
:return: Nothing
"""
img_size = self.darknet_model.height # 416
# print('batch_size:',batch_size)
batch_size = self.config.batch_size # 8
n_epochs = 200
# n_epochs = 5
# max_lab = 20 # label的最大长度
max_lab = 8
time_str = time.strftime("%Y%m%d-%H%M%S")
# Generate stating point
# adv_patch_cpu = self.generate_patch("gray") # 生成一个灰图初始化为0.5
adv_patch_cpu = self.read_image("saved_patches/patchnew0.jpg")
adv_patch_cpu.requires_grad_(True)
train_loader = torch.utils.data.DataLoader(
InriaDataset(self.config.img_dir, self.config.lab_dir, max_lab, img_size,
shuffle=True),
batch_size=batch_size,
shuffle=True,
num_workers=0)  # author's note: together with `from load_data import *` above this may cause duplicate-import issues
self.epoch_length = len(train_loader)
print(f'One epoch is {len(train_loader)}')
optimizer = optim.Adam([adv_patch_cpu], lr=self.config.start_learning_rate, amsgrad=True) # 更新的是那个补丁
scheduler = self.config.scheduler_factory(optimizer) # ICLR-2018年最佳论文提出的Adam改进版Amsgrad
et0 = time.time()
for epoch in range(n_epochs):
ep_det_loss = 0
ep_nps_loss = 0
ep_tv_loss = 0
ep_loss = 0
bt0 = time.time()
for i_batch, (img_batch, lab_batch) in tqdm(enumerate(train_loader), desc=f'Running epoch {epoch}',
total=self.epoch_length):
with autograd.detect_anomaly(): # 1.运行前向时开启异常检测功能,则在反向时会打印引起反向失败的前向操作堆栈 2.反向计算出现“nan”时引发异常
img_batch = img_batch.cuda() # 8, 3, 416, 416
lab_batch = lab_batch.cuda() # 8, 14, 5 为什么要把人数的标签补到14?
# print('TRAINING EPOCH %i, BATCH %i'%(epoch, i_batch))
adv_patch = adv_patch_cpu.cuda() # 3, 300, 300
adv_batch_t = self.patch_transformer(adv_patch, lab_batch, img_size, do_rotate=True, rand_loc=False)
p_img_batch = self.patch_applier(img_batch, adv_batch_t)
p_img_batch = F.interpolate(p_img_batch,
(self.darknet_model.height, self.darknet_model.width)) # 确保和图片大小一致
# print('++++++++++++p_img_batch:+++++++++++++',p_img_batch.shape)
img = p_img_batch[1, :, :, ]
img = transforms.ToPILImage()(img.detach().cpu())
# img.show()
outputs = self.darknet_model(p_img_batch) # 输入83416416 输出8425 13 13 ,其中425是5*(5+80)
max_prob = 0
nps = 0
tv = 0
for l in range(len(outputs)): # 三组不同分辨率大小的输出特征分别计算
output = outputs[l]
max_prob += self.prob_extractor(output)
nps += self.nps_calculator(adv_patch)
tv += self.total_variation(adv_patch)
nps_loss = nps * 0.01
tv_loss = tv * 2.5
det_loss = torch.mean(max_prob) # 把人的置值度当成损失
loss = det_loss + nps_loss + torch.max(tv_loss, torch.tensor(0.1).cuda())
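# Note (added for clarity): the total loss is det_loss (mean objectness the detector still assigns to
# patched images, the quantity being minimised), plus nps_loss (non-printability score, weighted 0.01)
# and tv_loss (total variation, weighted 2.5). torch.max(tv_loss, 0.1) means total variation is only
# penalised while it stays above 0.1; below that the term is constant and contributes no gradient.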
ep_det_loss += det_loss.detach().cpu().numpy()
ep_nps_loss += nps_loss.detach().cpu().numpy()
ep_tv_loss += tv_loss.detach().cpu().numpy()
ep_loss += loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
adv_patch_cpu.data.clamp_(0, 1) # keep patch in image range
bt1 = time.time()
if i_batch % 5 == 0:
iteration = self.epoch_length * epoch + i_batch
self.writer.add_scalar('total_loss', loss.detach().cpu().numpy(), iteration)
self.writer.add_scalar('loss/det_loss', det_loss.detach().cpu().numpy(), iteration)
self.writer.add_scalar('loss/nps_loss', nps_loss.detach().cpu().numpy(), iteration)
self.writer.add_scalar('loss/tv_loss', tv_loss.detach().cpu().numpy(), iteration)
self.writer.add_scalar('misc/epoch', epoch, iteration)
self.writer.add_scalar('misc/learning_rate', optimizer.param_groups[0]["lr"], iteration)
self.writer.add_image('patch', adv_patch_cpu, iteration)
if i_batch + 1 >= len(train_loader):
print('\n')
else:
del adv_batch_t, output, max_prob, det_loss, p_img_batch, nps_loss, tv_loss, loss
torch.cuda.empty_cache()
bt0 = time.time()
et1 = time.time()
ep_det_loss = ep_det_loss / len(train_loader)
ep_nps_loss = ep_nps_loss / len(train_loader)
ep_tv_loss = ep_tv_loss / len(train_loader)
ep_loss = ep_loss / len(train_loader)
# im = transforms.ToPILImage('RGB')(adv_patch_cpu)
# plt.imshow(im)
# plt.savefig(f'pics/{time_str}_{self.config.patch_name}_{epoch}.png')
scheduler.step(ep_loss)
if True:
print(' EPOCH NR: ', epoch),
print('EPOCH LOSS: ', ep_loss)
print(' DET LOSS: ', ep_det_loss)
print(' NPS LOSS: ', ep_nps_loss)
print(' TV LOSS: ', ep_tv_loss)
print('EPOCH TIME: ', et1 - et0)
# im = transforms.ToPILImage('RGB')(adv_patch_cpu)
# plt.imshow(im)
# plt.show()
# im.save("saved_patches/patchnew1.jpg")
im = transforms.ToPILImage('RGB')(adv_patch_cpu)
if epoch >= 3:
im.save(f"saved_patches/patchnew1_t1_{epoch}_{time_str}.jpg")
del adv_batch_t, output, max_prob, det_loss, p_img_batch, nps_loss, tv_loss, loss
torch.cuda.empty_cache()
et0 = time.time()
def generate_patch(self, type):
"""
Generate a random patch as a starting point for optimization.
:param type: Can be 'gray' or 'random'. Whether to generate a gray or a random patch.
:return:
"""
if type == 'gray':
adv_patch_cpu = torch.full((3, self.config.patch_size, self.config.patch_size), 0.5)
elif type == 'random':
adv_patch_cpu = torch.rand((3, self.config.patch_size, self.config.patch_size))
return adv_patch_cpu
def read_image(self, path):
"""
Read an input image to be used as a patch
:param path: Path to the image to be read.
:return: Returns the transformed patch as a pytorch Tensor.
"""
patch_img = Image.open(path).convert('RGB')
tf = transforms.Resize((self.config.patch_size, self.config.patch_size))
patch_img = tf(patch_img)
tf = transforms.ToTensor()
adv_patch_cpu = tf(patch_img)
return adv_patch_cpu
def main():
if len(sys.argv) != 2:
print('You need to supply (only) a configuration mode.')
print('Possible modes are:')
print(patch_config.patch_configs) # 一般传入paper_obj
# print('sys.argv:',sys.argv)
trainer = PatchTrainer(sys.argv[1])
trainer.train()
if __name__ == '__main__':
main()

1
utils/__init__.py Normal file
View File

@ -0,0 +1 @@
#

241
utils/callbacks.py Normal file
View File

@ -0,0 +1,241 @@
import datetime
import os
import torch
import matplotlib
import scipy.signal
from matplotlib import pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import shutil
import numpy as np
from PIL import Image
from tqdm import tqdm
from .utils import cvtColor, preprocess_input, resize_image
from .utils_bbox import DecodeBox
from .utils_map import get_coco_map, get_map
matplotlib.use('Agg')
class LossHistory():
def __init__(self, log_dir, model, input_shape):
self.log_dir = log_dir
self.losses = []
self.val_loss = []
os.makedirs(self.log_dir)
self.writer = SummaryWriter(self.log_dir)
try:
dummy_input = torch.randn(2, 3, input_shape[0], input_shape[1])
self.writer.add_graph(model, dummy_input)
except:
pass
def append_loss(self, epoch, loss, val_loss):
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.losses.append(loss)
self.val_loss.append(val_loss)
with open(os.path.join(self.log_dir, "epoch_loss.txt"), 'a') as f:
f.write(str(loss))
f.write("\n")
with open(os.path.join(self.log_dir, "epoch_val_loss.txt"), 'a') as f:
f.write(str(val_loss))
f.write("\n")
self.writer.add_scalar('loss', loss, epoch)
self.writer.add_scalar('val_loss', val_loss, epoch)
self.loss_plot()
def loss_plot(self):
iters = range(len(self.losses))
plt.figure()
plt.plot(iters, self.losses, 'red', linewidth=2, label='train loss')
plt.plot(iters, self.val_loss, 'coral', linewidth=2, label='val loss')
try:
if len(self.losses) < 25:
num = 5
else:
num = 15
plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle='--', linewidth=2,
label='smooth train loss')
plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle='--', linewidth=2,
label='smooth val loss')
except:
pass
plt.grid(True)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc="upper right")
plt.savefig(os.path.join(self.log_dir, "epoch_loss.png"))
plt.cla()
plt.close("all")
class EvalCallback():
def __init__(self, net, input_shape, anchors, anchors_mask, class_names, num_classes, val_lines, log_dir, cuda, \
map_out_path=".temp_map_out", max_boxes=100, confidence=0.05, nms_iou=0.5, letterbox_image=True,
MINOVERLAP=0.5, eval_flag=True, period=1):
super(EvalCallback, self).__init__()
self.net = net
self.input_shape = input_shape
self.anchors = anchors
self.anchors_mask = anchors_mask
self.class_names = class_names
self.num_classes = num_classes
self.val_lines = val_lines
self.log_dir = log_dir
self.cuda = cuda
self.map_out_path = map_out_path
self.max_boxes = max_boxes
self.confidence = confidence
self.nms_iou = nms_iou
self.letterbox_image = letterbox_image
self.MINOVERLAP = MINOVERLAP
self.eval_flag = eval_flag
self.period = period
self.bbox_util = DecodeBox(self.anchors, self.num_classes, (self.input_shape[0], self.input_shape[1]),
self.anchors_mask)
self.maps = [0]
self.epoches = [0]
if self.eval_flag:
with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
f.write(str(0))
f.write("\n")
def get_map_txt(self, image_id, image, class_names, map_out_path):
f = open(os.path.join(map_out_path, "detection-results/" + image_id + ".txt"), "w", encoding='utf-8')
image_shape = np.array(np.shape(image)[0:2])
# ---------------------------------------------------------#
# 在这里将图像转换成RGB图像防止灰度图在预测时报错。
# 代码仅仅支持RGB图像的预测所有其它类型的图像都会转化成RGB
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
# 给图像增加灰条实现不失真的resize
# 也可以直接resize进行识别
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
# 添加上batch_size维度
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
# 将图像输入网络当中进行预测!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
# 将预测框进行堆叠,然后进行非极大抑制
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
if results[0] is None:
return
top_label = np.array(results[0][:, 6], dtype='int32')
top_conf = results[0][:, 4] * results[0][:, 5]
top_boxes = results[0][:, :4]
top_100 = np.argsort(top_conf)[::-1][:self.max_boxes]  # keep at most max_boxes detections, ranked by confidence
top_boxes = top_boxes[top_100]
top_conf = top_conf[top_100]
top_label = top_label[top_100]
for i, c in list(enumerate(top_label)):
predicted_class = self.class_names[int(c)]
box = top_boxes[i]
score = str(top_conf[i])
top, left, bottom, right = box
if predicted_class not in class_names:
continue
f.write("%s %s %s %s %s %s\n" % (
predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)), str(int(bottom))))
f.close()
return
def on_epoch_end(self, epoch, model_eval):
if epoch % self.period == 0 and self.eval_flag:
self.net = model_eval
if not os.path.exists(self.map_out_path):
os.makedirs(self.map_out_path)
if not os.path.exists(os.path.join(self.map_out_path, "ground-truth")):
os.makedirs(os.path.join(self.map_out_path, "ground-truth"))
if not os.path.exists(os.path.join(self.map_out_path, "detection-results")):
os.makedirs(os.path.join(self.map_out_path, "detection-results"))
print("Get map.")
for annotation_line in tqdm(self.val_lines):
line = annotation_line.split()
image_id = os.path.basename(line[0]).split('.')[0]
# ------------------------------#
# 读取图像并转换成RGB图像
# ------------------------------#
image = Image.open(line[0])
# ------------------------------#
# 获得预测框
# ------------------------------#
gt_boxes = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
# ------------------------------#
# 获得预测txt
# ------------------------------#
self.get_map_txt(image_id, image, self.class_names, self.map_out_path)
# ------------------------------#
# 获得真实框txt
# ------------------------------#
with open(os.path.join(self.map_out_path, "ground-truth/" + image_id + ".txt"), "w") as new_f:
for box in gt_boxes:
left, top, right, bottom, obj = box
obj_name = self.class_names[obj]
new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
print("Calculate Map.")
try:
temp_map = get_coco_map(class_names=self.class_names, path=self.map_out_path)[1]
except:
temp_map = get_map(self.MINOVERLAP, False, path=self.map_out_path)
self.maps.append(temp_map)
self.epoches.append(epoch)
with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
f.write(str(temp_map))
f.write("\n")
plt.figure()
plt.plot(self.epoches, self.maps, 'red', linewidth=2, label='train map')
plt.grid(True)
plt.xlabel('Epoch')
plt.ylabel('Map %s' % str(self.MINOVERLAP))
plt.title('A Map Curve')
plt.legend(loc="upper right")
plt.savefig(os.path.join(self.log_dir, "epoch_map.png"))
plt.cla()
plt.close("all")
print("Get map done.")
shutil.rmtree(self.map_out_path)

170
utils/dataloader.py Normal file
View File

@ -0,0 +1,170 @@
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data.dataset import Dataset
from utils.utils import cvtColor, preprocess_input
class YoloDataset(Dataset):
def __init__(self, annotation_lines, input_shape, num_classes, train):
super(YoloDataset, self).__init__()
self.annotation_lines = annotation_lines # 记录训练集或测试集的文件的路径,这个是可以全部载入的
self.input_shape = input_shape # 这里是 [416, 416]
self.num_classes = num_classes # 这里是20
self.length = len(self.annotation_lines) # 数据的数量
self.train = train # 是否是训练集的标记
def __len__(self):
return self.length
def __getitem__(self, index):
index = index % self.length
# ---------------------------------------------------#
# 训练时进行数据的随机增强
# 验证时不进行数据的随机增强
# ---------------------------------------------------#
image, box = self.get_random_data(self.annotation_lines[index], self.input_shape[0:2],
random=self.train) # 自定义的数据增强
image = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1)) # 像素值归到0~1之间然后变换坐标轴
box = np.array(box, dtype=np.float32) # 转为numpy。np中常用的是创建新类型的array。
if len(box) != 0:
box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1] # 把框的坐标归一化
box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]
box[:, 2:4] = box[:, 2:4] - box[:, 0:2] # box第01维记录中心点 box第23维记录宽高
box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2 # box第01维记录中心点
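# Worked example (illustrative): a normalized corner box [x1, y1, x2, y2] = [0.2, 0.2, 0.6, 0.4]
# becomes width/height [0.4, 0.2] and centre [0.4, 0.3], i.e. the (cx, cy, w, h) format that
# get_target in the loss expects.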
return image, box
def rand(self, a=0, b=1):
return np.random.rand() * (b - a) + a
def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
line = annotation_line.split() # 以空格、回车等分隔字符串
# ------------------------------#
# 读取图像并转换成RGB图像
# ------------------------------#
image = Image.open(line[0]) # line[0] 是图片的地址
image = cvtColor(image) # 这里啥也没干
# ------------------------------#
# 获得图像的高宽与目标高宽
# ------------------------------#
iw, ih = image.size # 获取图像的原始尺寸
h, w = input_shape
# ------------------------------#
# 获得预测框
# ------------------------------#
box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]]) # 从python二维矩阵转到 numpy二维矩阵
if not random: # 没进入这里面
scale = min(w / iw, h / ih)
nw = int(iw * scale)
nh = int(ih * scale)
dx = (w - nw) // 2
dy = (h - nh) // 2
# ---------------------------------#
# 将图像多余的部分加上灰条
# ---------------------------------#
image = image.resize((nw, nh), Image.BICUBIC)
new_image = Image.new('RGB', (w, h), (128, 128, 128))
new_image.paste(image, (dx, dy))
image_data = np.array(new_image, np.float32)
# ---------------------------------#
# 对真实框进行调整
# ---------------------------------#
if len(box) > 0:
np.random.shuffle(box)
box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
box[:, 0:2][box[:, 0:2] < 0] = 0
box[:, 2][box[:, 2] > w] = w
box[:, 3][box[:, 3] > h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w > 1, box_h > 1)] # discard invalid box
return image_data, box
# ------------------------------------------#
# 对原始图像进行缩放并且进行长和宽的扭曲
# ------------------------------------------#
new_ar = iw / ih * self.rand(1 - jitter, 1 + jitter) / self.rand(1 - jitter, 1 + jitter) # (iw*随机) / (ih*随机)
scale = self.rand(.25, 2) # 随机一个缩放比例
if new_ar < 1: # 原图高大
nh = int(scale * h) # 新图先缩放高
nw = int(nh * new_ar)
else: # 原图宽大
nw = int(scale * w) # 新的宽从 预期宽中 乘以随机的比例
nh = int(nw / new_ar) # 新的宽、高比,也是 new_ar, 也就是也是宽大
image = image.resize((nw, nh), Image.BICUBIC)
# ------------------------------------------#
# 将图像多余的部分加上灰条
# ------------------------------------------#
dx = int(self.rand(0, w - nw)) # 在(0, w - nw)找一个点作为新图的放置点
dy = int(self.rand(0, h - nh))
new_image = Image.new('RGB', (w, h), (128, 128, 128)) # 画一个 412, 412大小的灰图
new_image.paste(image, (dx, dy)) # 在这里看看两者的区别
image = new_image
# ------------------------------------------#
# 翻转图像
# ------------------------------------------#
flip = self.rand() < .5
if flip:
image = image.transpose(Image.FLIP_LEFT_RIGHT)
image_data = np.array(image, np.uint8)
# ---------------------------------#
# 对图像进行色域变换
# 计算色域变换的参数
# ---------------------------------#
r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
# ---------------------------------#
# 将图像转到HSV上
# ---------------------------------#
hue, sat, val = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
dtype = image_data.dtype
# ---------------------------------#
# 应用变换
# ---------------------------------#
x = np.arange(0, 256, dtype=r.dtype)
lut_hue = ((x * r[0]) % 180).astype(dtype)
lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
# LUT是look-up table查找表的意思,cv2.LUT(src, lut, dst=None)的作用是对输入的src执行查找表lut转换
image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB) # image_data在这里还是unit8类型
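# Note (illustrative): OpenCV stores hue in [0, 180) for uint8 images, which is why lut_hue wraps with
# % 180 while saturation and value are clipped to [0, 255]. For example, with r[0] = 1.1 a hue of 170
# maps to (170 * 1.1) % 180 = 7, wrapping around the hue circle instead of saturating.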
# ---------------------------------#
# 对真实框进行调整
# ---------------------------------#
if len(box) > 0: # 如果有box
np.random.shuffle(box)
box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx # 所有行的第0列和2列也就是 x 坐标, 除以iw找到占原图的比例再乘以nw是新图的比例再加dx是新图中的偏移
box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
if flip:
box[:, [0, 2]] = w - box[:, [2, 0]]  # horizontal flip: x becomes w - x, and x1/x2 swap so x1 stays the smaller value
box[:, 0:2][box[:, 0:2] < 0] = 0  # clip top-left coordinates that fall outside the canvas to 0
box[:, 2][box[:, 2] > w] = w  # clip the right edge to the canvas width
box[:, 3][box[:, 3] > h] = h  # clip the bottom edge to the canvas height
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w > 1, box_h > 1)]  # boxes that collapse to less than 1 pixel after clipping are dropped, which also removes boxes pushed fully outside the image
return image_data, box # box依然是左上角和右下角的形式
# DataLoader中collate_fn使用
def yolo_dataset_collate(batch):
images = [] # 这是是一个batch大小的列表每一项是 image_data, box。需要把image放一堆box放一堆
bboxes = []
for img, box in batch:
images.append(img) # images在这里已经是0~1的float32类型了
bboxes.append(box)
images = torch.from_numpy(np.array(images)).type(torch.FloatTensor) # 转换为 batch_size, C, H, W 的数据
bboxes = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in bboxes] # 转换为一个列表每个元素是一组二维Tensor
return images, bboxes

79
utils/utils.py Normal file
View File

@ -0,0 +1,79 @@
import numpy as np
from PIL import Image
# ---------------------------------------------------------#
# 将图像转换成RGB图像防止灰度图在预测时报错。
# 代码仅仅支持RGB图像的预测所有其它类型的图像都会转化成RGB
# ---------------------------------------------------------#
def cvtColor(image):
if len(np.shape(image)) == 3 and np.shape(image)[2] == 3:
return image
else:
image = image.convert('RGB')
return image
# ---------------------------------------------------#
# 对输入图像进行resize
# ---------------------------------------------------#
def resize_image(image, size, letterbox_image):
iw, ih = image.size
w, h = size
if letterbox_image:
scale = min(w / iw, h / ih)
nw = int(iw * scale)
nh = int(ih * scale)
image = image.resize((nw, nh), Image.BICUBIC)
new_image = Image.new('RGB', size, (128, 128, 128))
new_image.paste(image, ((w - nw) // 2, (h - nh) // 2))
else:
new_image = image.resize((w, h), Image.BICUBIC) # 这里直接用了缩放,而不是加灰条的形式
return new_image
# ---------------------------------------------------#
# 获得类
# ---------------------------------------------------#
def get_classes(classes_path):
with open(classes_path, encoding='utf-8') as f:
class_names = f.readlines()
class_names = [c.strip() for c in class_names]
return class_names, len(class_names)
# ---------------------------------------------------#
# 获得先验框
# ---------------------------------------------------#
def get_anchors(anchors_path):
'''loads the anchors from a file'''
with open(anchors_path, encoding='utf-8') as f:
anchors = f.readline()
anchors = [float(x) for x in anchors.split(',')]
anchors = np.array(anchors).reshape(-1, 2)
return anchors, len(anchors)
# ---------------------------------------------------#
# 获得学习率
# ---------------------------------------------------#
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def preprocess_input(image):
image /= 255.0
return image
def show_config(**kwargs):
print('Configurations:')
print('-' * 70)
print('|%25s | %40s|' % ('keys', 'values'))
print('-' * 70)
for key, value in kwargs.items():
print('|%25s | %40s|' % (str(key), str(value)))
print('-' * 70)

232
utils/utils_bbox.py Normal file
View File

@ -0,0 +1,232 @@
import torch
import torch.nn as nn
from torchvision.ops import nms
import numpy as np
class DecodeBox():
def __init__(self, anchors, num_classes, input_shape, anchors_mask=[[6, 7, 8], [3, 4, 5], [0, 1, 2]]):
super(DecodeBox, self).__init__()
self.anchors = anchors
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.input_shape = input_shape
# -----------------------------------------------------------#
# 13x13的特征层对应的anchor是[116,90],[156,198],[373,326]
# 26x26的特征层对应的anchor是[30,61],[62,45],[59,119]
# 52x52的特征层对应的anchor是[10,13],[16,30],[33,23]
# -----------------------------------------------------------#
self.anchors_mask = anchors_mask
def decode_box(self, inputs):
outputs = []
for i, input in enumerate(inputs):
# -----------------------------------------------#
# 输入的input一共有三个他们的shape分别是
# batch_size, 255, 13, 13
# batch_size, 255, 26, 26
# batch_size, 255, 52, 52
# -----------------------------------------------#
batch_size = input.size(0)
input_height = input.size(2)
input_width = input.size(3)
# -----------------------------------------------#
# 输入为416x416时
# stride_h = stride_w = 32、16、8
# -----------------------------------------------#
stride_h = self.input_shape[0] / input_height
stride_w = self.input_shape[1] / input_width
# -------------------------------------------------#
# 此时获得的scaled_anchors大小是相对于特征层的
# -------------------------------------------------#
scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in
self.anchors[self.anchors_mask[i]]]
# -----------------------------------------------#
# 输入的input一共有三个他们的shape分别是
# batch_size, 3, 13, 13, 85
# batch_size, 3, 26, 26, 85
# batch_size, 3, 52, 52, 85
# -----------------------------------------------#
prediction = input.view(batch_size, len(self.anchors_mask[i]),
self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
# 调整为 13131325 的形状
# -----------------------------------------------#
# 先验框的中心位置的调整参数
# -----------------------------------------------#
x = torch.sigmoid(prediction[..., 0])
y = torch.sigmoid(prediction[..., 1])
# -----------------------------------------------#
# 先验框的宽高调整参数
# -----------------------------------------------#
w = prediction[..., 2]
h = prediction[..., 3]
# -----------------------------------------------#
# 获得置信度,是否有物体
# -----------------------------------------------#
conf = torch.sigmoid(prediction[..., 4])
# -----------------------------------------------#
# 种类置信度
# -----------------------------------------------#
pred_cls = torch.sigmoid(prediction[..., 5:])
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
# ----------------------------------------------------------#
# 生成网格,先验框中心,网格左上角
# batch_size,3,13,13
# ----------------------------------------------------------#
grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat(
batch_size * len(self.anchors_mask[i]), 1, 1).view(x.shape).type(FloatTensor)
grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat(
batch_size * len(self.anchors_mask[i]), 1, 1).view(y.shape).type(FloatTensor)
# ----------------------------------------------------------#
# 按照网格格式生成先验框的宽高
# batch_size,3,13,13
# ----------------------------------------------------------#
anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
# ----------------------------------------------------------#
# 利用预测结果对先验框进行调整
# 首先调整先验框的中心,从先验框中心向右下角偏移 # ?从先验框左上角向右下角偏移?
# 再调整先验框的宽高。
# ----------------------------------------------------------#
pred_boxes = FloatTensor(prediction[..., :4].shape)
pred_boxes[..., 0] = x.data + grid_x
pred_boxes[..., 1] = y.data + grid_y
pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
# ----------------------------------------------------------#
# 将输出结果归一化成小数的形式
# ----------------------------------------------------------#
_scale = torch.Tensor([input_width, input_height, input_width, input_height]).type(FloatTensor)
output = torch.cat((pred_boxes.view(batch_size, -1, 4) / _scale,
conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1)
# output的shape是 batch_size, -1, attr(25)
outputs.append(output.data)
return outputs
def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
# -----------------------------------------------------------------#
# 把y轴放前面是因为方便预测框和图像的宽高进行相乘
# -----------------------------------------------------------------#
box_yx = box_xy[..., ::-1]
box_hw = box_wh[..., ::-1]
input_shape = np.array(input_shape)
image_shape = np.array(image_shape)
if letterbox_image:
# -----------------------------------------------------------------#
# 这里求出来的offset是图像有效区域相对于图像左上角的偏移情况
# new_shape指的是宽高缩放情况
# -----------------------------------------------------------------#
new_shape = np.round(image_shape * np.min(input_shape / image_shape))
offset = (input_shape - new_shape) / 2. / input_shape
scale = input_shape / new_shape
box_yx = (box_yx - offset) * scale
box_hw *= scale
box_mins = box_yx - (box_hw / 2.)
box_maxes = box_yx + (box_hw / 2.)
boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]],
axis=-1)
boxes *= np.concatenate([image_shape, image_shape], axis=-1)
return boxes
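# Worked example (illustrative): for input_shape = (416, 416) and an original image of shape
# (h, w) = (832, 416), the letterbox scale is 0.5, new_shape = (416, 208), offset = (0, 0.25) and
# scale = (1, 2) in (y, x) order, i.e. 104 gray pixels were padded on each side horizontally; the
# code above removes that offset and rescaling before mapping boxes back to original-image pixels.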
def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5,
nms_thres=0.4):
# ----------------------------------------------------------#
# 将预测结果的格式转换成左上角右下角的格式。
# prediction [batch_size, num_anchors, 85]
# ----------------------------------------------------------#
box_corner = prediction.new(prediction.shape)
box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
prediction[:, :, :4] = box_corner[:, :, :4]
output = [None for _ in range(len(prediction))]
for i, image_pred in enumerate(prediction):
# ----------------------------------------------------------#
# 对种类预测部分取max。 # image_pred 是在prediction中以0维度迭代
# class_conf [num_anchors, 1] 种类置信度
# class_pred [num_anchors, 1] 种类 image_pred[:, 5:5 + num_classes] 是取出类别
# ----------------------------------------------------------#
class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
# ----------------------------------------------------------#
# 利用置信度进行第一轮筛选
# ----------------------------------------------------------#
conf_mask = (image_pred[:, 4] * class_conf[:, 0] >= conf_thres).squeeze()
# ----------------------------------------------------------#
# 根据置信度进行预测结果的筛选
# ----------------------------------------------------------#
image_pred = image_pred[conf_mask]
class_conf = class_conf[conf_mask]
class_pred = class_pred[conf_mask]
if not image_pred.size(0):
continue # 如果没有剩下类别,就判断下一张图片
# -------------------------------------------------------------------------#
# detections [num_anchors, 7]
# 7的内容为x1, y1, x2, y2, obj_conf, class_conf, class_pred
# -------------------------------------------------------------------------#
detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
# ------------------------------------------#
# 获得预测结果中包含的所有种类
# ------------------------------------------#
unique_labels = detections[:, -1].cpu().unique()
if prediction.is_cuda:
unique_labels = unique_labels.cuda()
detections = detections.cuda()
for c in unique_labels:
# ------------------------------------------#
# 获得某一类得分筛选后全部的预测结果
# ------------------------------------------#
detections_class = detections[detections[:, -1] == c]
# ------------------------------------------#
# 使用官方自带的非极大抑制会速度更快一些!
# ------------------------------------------#
keep = nms(
detections_class[:, :4],
detections_class[:, 4] * detections_class[:, 5],
nms_thres
)
max_detections = detections_class[keep]
# # sort by objectness confidence
# _, conf_sort_index = torch.sort(detections_class[:, 4]*detections_class[:, 5], descending=True)
# detections_class = detections_class[conf_sort_index]
# # perform non-maximum suppression
# max_detections = []
# while detections_class.size(0):
# # take the highest-confidence box of this class, then walk down the list and drop boxes whose overlap exceeds nms_thres
# max_detections.append(detections_class[0].unsqueeze(0))
# if len(detections_class) == 1:
# break
# ious = bbox_iou(max_detections[-1], detections_class[1:])
# detections_class = detections_class[1:][ious < nms_thres]
# # stack the kept boxes
# max_detections = torch.cat(max_detections).data
# Add max detections to outputs
output[i] = max_detections if output[i] is None else torch.cat((output[i], max_detections))
if output[i] is not None:
output[i] = output[i].cpu().numpy()
box_xy, box_wh = (output[i][:, 0:2] + output[i][:, 2:4]) / 2, output[i][:, 2:4] - output[i][:, 0:2]
output[i][:, :4] = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
return output
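# Layout note (a summary of the code above, not an addition to the algorithm):
# each entry of `output` is either None or a numpy array of shape [n, 7]; after
# yolo_correct_boxes the first four columns are top, left, bottom, right in
# original-image pixels, followed by obj_conf, class_conf and the class index.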

151
utils/utils_fit.py Normal file

@ -0,0 +1,151 @@
import os
import torch
from tqdm import tqdm
from utils.utils import get_lr
def fit_one_epoch(model_train, model, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step,
epoch_step_val, gen, gen_val, Epoch, cuda, fp16, scaler, save_period, save_dir, local_rank=0):
loss = 0
val_loss = 0
if local_rank == 0:
print('Start Train')
pbar = tqdm(total=epoch_step, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3)
model_train.train()  # put every module into training mode
for iteration, batch in enumerate(gen):
if iteration >= epoch_step:  # stop once epoch_step batches have been processed in this epoch
break
images, targets = batch[0], batch[1]  # the targets are already normalized
with torch.no_grad():
if cuda:
images = images.cuda(local_rank)
targets = [ann.cuda(local_rank) for ann in
targets]  # targets is a Python list of tensors; move each tensor to the GPU, the result is still a Python list
# ----------------------#
#   Zero the gradients
# ----------------------#
optimizer.zero_grad()
if not fp16:
# ----------------------#
#   Forward pass
# ----------------------#
outputs = model_train(images)
loss_value_all = 0
# ----------------------#
#   Compute the loss
# ----------------------#
for l in range(len(outputs)):  # compute the loss separately for the three output feature maps
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
# ----------------------#
#   Backward pass
# ----------------------#
loss_value.backward()
optimizer.step()
else:  # mixed-precision path, only taken when fp16 is enabled
from torch.cuda.amp import autocast
with autocast():
# ----------------------#
#   Forward pass
# ----------------------#
outputs = model_train(images)
loss_value_all = 0
# ----------------------#
#   Compute the loss
# ----------------------#
for l in range(len(outputs)):
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
# ----------------------#
#   Backward pass
# ----------------------#
scaler.scale(loss_value).backward()
scaler.step(optimizer)
scaler.update()
loss += loss_value.item()
# # for debugging only: begin
# if iteration > 2:
# break
# # for debugging only: end
if local_rank == 0:
pbar.set_postfix(**{'loss': loss / (iteration + 1),
'lr': get_lr(optimizer)})
pbar.update(1)
if local_rank == 0:
pbar.close()
print('Finish Train')
print('Start Validation')
pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3)
model_train.eval()
for iteration, batch in enumerate(gen_val):
if iteration >= epoch_step_val:
break
images, targets = batch[0], batch[1]
with torch.no_grad():
if cuda:
images = images.cuda(local_rank)
targets = [ann.cuda(local_rank) for ann in targets]
# ----------------------#
#   Zero the gradients
# ----------------------#
optimizer.zero_grad()
# ----------------------#
#   Forward pass
# ----------------------#
outputs = model_train(images)
loss_value_all = 0
# ----------------------#
#   Compute the loss
# ----------------------#
for l in range(len(outputs)):
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
val_loss += loss_value.item()
# # for debugging only: begin
# if iteration > 2:
# break
# # for debugging only: end
if local_rank == 0:
pbar.set_postfix(**{'val_loss': val_loss / (iteration + 1)})
pbar.update(1)
if local_rank == 0:
pbar.close()
print('Finish Validation')
loss_history.append_loss(epoch + 1, loss / epoch_step, val_loss / epoch_step_val)
eval_callback.on_epoch_end(epoch + 1, model_train)
print('Epoch:' + str(epoch + 1) + '/' + str(Epoch))
print('Total Loss: %.3f || Val Loss: %.3f ' % (loss / epoch_step, val_loss / epoch_step_val))
# -----------------------------------------------#
#   Save the weights
# -----------------------------------------------#
if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch:
torch.save(model.state_dict(), os.path.join(save_dir, "ep%03d-loss%.3f-val_loss%.3f.pth" % (
epoch + 1, loss / epoch_step, val_loss / epoch_step_val)))
if len(loss_history.val_loss) <= 1 or (val_loss / epoch_step_val) <= min(loss_history.val_loss):
print('Save best model to best_epoch_weights.pth')
torch.save(model.state_dict(), os.path.join(save_dir, "best_epoch_weights.pth"))
torch.save(model.state_dict(), os.path.join(save_dir, "last_epoch_weights.pth"))
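# Hedged usage sketch (the surrounding training script is not shown here, so the
# variable names below are assumptions): a typical loop would call, once per epoch,
#   fit_one_epoch(model_train, model, yolo_loss, loss_history, eval_callback,
#                 optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val,
#                 Epoch, cuda, fp16, scaler, save_period, save_dir, local_rank)
# where gen / gen_val are the train / val DataLoaders and epoch_step is roughly
# the number of training samples divided by the batch size.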

963
utils/utils_map.py Normal file

@ -0,0 +1,963 @@
import glob
import json
import math
import operator
import os
import shutil
import sys
try:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
except:
pass
import cv2
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
'''
0,0 ------> x (width)
|
| (Left,Top)
| *_________
| | |
| |
y |_________|
(height) *
(Right,Bottom)
'''
def log_average_miss_rate(precision, fp_cumsum, num_images):
"""
log-average miss rate:
Calculated by averaging miss rates at 9 evenly spaced FPPI points
between 10^-2 and 10^0, in log-space.
output:
lamr | log-average miss rate
mr | miss rate
fppi | false positives per image
references:
[1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the
State of the Art." Pattern Analysis and Machine Intelligence, IEEE
Transactions on 34.4 (2012): 743 - 761.
"""
if precision.size == 0:
lamr = 0
mr = 1
fppi = 0
return lamr, mr, fppi
fppi = fp_cumsum / float(num_images)
mr = (1 - precision)
fppi_tmp = np.insert(fppi, 0, -1.0)
mr_tmp = np.insert(mr, 0, 1.0)
ref = np.logspace(-2.0, 0.0, num=9)
for i, ref_i in enumerate(ref):
j = np.where(fppi_tmp <= ref_i)[-1][-1]
ref[i] = mr_tmp[j]
lamr = math.exp(np.mean(np.log(np.maximum(1e-10, ref))))
return lamr, mr, fppi
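# Note on the computation above: ref = np.logspace(-2, 0, 9) gives nine FPPI
# reference points from 0.01 to 1.0; for each point the miss rate at the largest
# FPPI not exceeding it is taken, and lamr is the geometric mean of those values.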
"""
throw error and exit
"""
def error(msg):
print(msg)
sys.exit(0)
"""
check if the number is a float between 0.0 and 1.0
"""
def is_float_between_0_and_1(value):
try:
val = float(value)
if val > 0.0 and val < 1.0:
return True
else:
return False
except ValueError:
return False
"""
Calculate the AP given the recall and precision array
1st) We compute a version of the measured precision/recall curve with
precision monotonically decreasing
2nd) We compute the AP as the area under this curve by numerical integration.
"""
def voc_ap(rec, prec):
"""
--- Official matlab code VOC2012---
mrec=[0 ; rec ; 1];
mpre=[0 ; prec ; 0];
for i=numel(mpre)-1:-1:1
mpre(i)=max(mpre(i),mpre(i+1));
end
i=find(mrec(2:end)~=mrec(1:end-1))+1;
ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
"""
rec.insert(0, 0.0) # insert 0.0 at begining of list
rec.append(1.0) # insert 1.0 at end of list
mrec = rec[:]
prec.insert(0, 0.0) # insert 0.0 at begining of list
prec.append(0.0) # insert 0.0 at end of list
mpre = prec[:]
"""
This part makes the precision monotonically decreasing
(goes from the end to the beginning)
matlab: for i=numel(mpre)-1:-1:1
mpre(i)=max(mpre(i),mpre(i+1));
"""
for i in range(len(mpre) - 2, -1, -1):
mpre[i] = max(mpre[i], mpre[i + 1])
"""
This part creates a list of indexes where the recall changes
matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1;
"""
i_list = []
for i in range(1, len(mrec)):
if mrec[i] != mrec[i - 1]:
i_list.append(i) # if it was matlab would be i + 1
"""
The Average Precision (AP) is the area under the curve
(numerical integration)
matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
"""
ap = 0.0
for i in i_list:
ap += ((mrec[i] - mrec[i - 1]) * mpre[i])
return ap, mrec, mpre
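# Small worked example (illustrative numbers only): for rec = [0.5, 1.0] and
# prec = [1.0, 0.5] the padded curves become mrec = [0, 0.5, 1.0, 1.0] and
# mpre = [0, 1.0, 0.5, 0]; the monotonic pass turns mpre into [1.0, 1.0, 0.5, 0],
# the recall changes at indexes 1 and 2, so
# AP = (0.5 - 0) * 1.0 + (1.0 - 0.5) * 0.5 = 0.75.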
"""
Convert the lines of a file to a list
"""
def file_lines_to_list(path):
# open txt file lines to a list
with open(path) as f:
content = f.readlines()
# remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]
return content
"""
Draws text in image
"""
def draw_text_in_image(img, text, pos, color, line_width):
font = cv2.FONT_HERSHEY_PLAIN
fontScale = 1
lineType = 1
bottomLeftCornerOfText = pos
cv2.putText(img, text,
bottomLeftCornerOfText,
font,
fontScale,
color,
lineType)
text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0]
return img, (line_width + text_width)
"""
Plot - adjust axes
"""
def adjust_axes(r, t, fig, axes):
# get text width for re-scaling
bb = t.get_window_extent(renderer=r)
text_width_inches = bb.width / fig.dpi
# get axis width in inches
current_fig_width = fig.get_figwidth()
new_fig_width = current_fig_width + text_width_inches
proportion = new_fig_width / current_fig_width
# get axis limit
x_lim = axes.get_xlim()
axes.set_xlim([x_lim[0], x_lim[1] * proportion])
"""
Draw plot using Matplotlib
"""
def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color,
true_p_bar):
# sort the dictionary by decreasing value, into a list of tuples
sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1))
# unpacking the list of tuples into two lists
sorted_keys, sorted_values = zip(*sorted_dic_by_value)
#
if true_p_bar != "":
"""
Special case to draw in:
- green -> TP: True Positives (object detected and matches ground-truth)
- red -> FP: False Positives (object detected but does not match ground-truth)
- orange -> FN: False Negatives (object not detected but present in the ground-truth)
"""
fp_sorted = []
tp_sorted = []
for key in sorted_keys:
fp_sorted.append(dictionary[key] - true_p_bar[key])
tp_sorted.append(true_p_bar[key])
plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive')
plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive',
left=fp_sorted)
# add legend
plt.legend(loc='lower right')
"""
Write number on side of bar
"""
fig = plt.gcf() # gcf - get current figure
axes = plt.gca()
r = fig.canvas.get_renderer()
for i, val in enumerate(sorted_values):
fp_val = fp_sorted[i]
tp_val = tp_sorted[i]
fp_str_val = " " + str(fp_val)
tp_str_val = fp_str_val + " " + str(tp_val)
# trick to paint multicolor with offset:
# first paint everything and then repaint the first number
t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold')
plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold')
if i == (len(sorted_values) - 1): # largest bar
adjust_axes(r, t, fig, axes)
else:
plt.barh(range(n_classes), sorted_values, color=plot_color)
"""
Write number on side of bar
"""
fig = plt.gcf() # gcf - get current figure
axes = plt.gca()
r = fig.canvas.get_renderer()
for i, val in enumerate(sorted_values):
str_val = " " + str(val) # add a space before
if val < 1.0:
str_val = " {0:.2f}".format(val)
t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold')
# re-set axes to show number inside the figure
if i == (len(sorted_values) - 1): # largest bar
adjust_axes(r, t, fig, axes)
# set window title
fig.canvas.set_window_title(window_title)
# write classes in y axis
tick_font_size = 12
plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size)
"""
Re-scale height accordingly
"""
init_height = fig.get_figheight()
# compute the matrix height in points and inches
dpi = fig.dpi
height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing)
height_in = height_pt / dpi
# compute the required figure height
top_margin = 0.15 # in percentage of the figure height
bottom_margin = 0.05 # in percentage of the figure height
figure_height = height_in / (1 - top_margin - bottom_margin)
# set new height
if figure_height > init_height:
fig.set_figheight(figure_height)
# set plot title
plt.title(plot_title, fontsize=14)
# set axis titles
# plt.xlabel('classes')
plt.xlabel(x_label, fontsize='large')
# adjust size of window
fig.tight_layout()
# save the plot
fig.savefig(output_path)
# show image
if to_show:
plt.show()
# close the plot
plt.close()
def get_map(MINOVERLAP, draw_plot, score_threhold=0.5, path='./map_out'):
GT_PATH = os.path.join(path, 'ground-truth')
DR_PATH = os.path.join(path, 'detection-results')
IMG_PATH = os.path.join(path, 'images-optional')
TEMP_FILES_PATH = os.path.join(path, '.temp_files')
RESULTS_FILES_PATH = os.path.join(path, 'results')
show_animation = True
if os.path.exists(IMG_PATH):
for dirpath, dirnames, files in os.walk(IMG_PATH):
if not files:
show_animation = False
else:
show_animation = False
if not os.path.exists(TEMP_FILES_PATH):
os.makedirs(TEMP_FILES_PATH)
if os.path.exists(RESULTS_FILES_PATH):
shutil.rmtree(RESULTS_FILES_PATH)
else:
os.makedirs(RESULTS_FILES_PATH)
if draw_plot:
try:
matplotlib.use('TkAgg')
except:
pass
os.makedirs(os.path.join(RESULTS_FILES_PATH, "AP"))
os.makedirs(os.path.join(RESULTS_FILES_PATH, "F1"))
os.makedirs(os.path.join(RESULTS_FILES_PATH, "Recall"))
os.makedirs(os.path.join(RESULTS_FILES_PATH, "Precision"))
if show_animation:
os.makedirs(os.path.join(RESULTS_FILES_PATH, "images", "detections_one_by_one"))
ground_truth_files_list = glob.glob(GT_PATH + '/*.txt')
if len(ground_truth_files_list) == 0:
error("Error: No ground-truth files found!")
ground_truth_files_list.sort()
gt_counter_per_class = {}
counter_images_per_class = {}
for txt_file in ground_truth_files_list:
file_id = txt_file.split(".txt", 1)[0]
file_id = os.path.basename(os.path.normpath(file_id))
temp_path = os.path.join(DR_PATH, (file_id + ".txt"))
if not os.path.exists(temp_path):
error_msg = "Error. File not found: {}\n".format(temp_path)
error(error_msg)
lines_list = file_lines_to_list(txt_file)
bounding_boxes = []
is_difficult = False
already_seen_classes = []
for line in lines_list:
try:
if "difficult" in line:
class_name, left, top, right, bottom, _difficult = line.split()
is_difficult = True
else:
class_name, left, top, right, bottom = line.split()
except:
if "difficult" in line:
line_split = line.split()
_difficult = line_split[-1]
bottom = line_split[-2]
right = line_split[-3]
top = line_split[-4]
left = line_split[-5]
class_name = ""
for name in line_split[:-5]:
class_name += name + " "
class_name = class_name[:-1]
is_difficult = True
else:
line_split = line.split()
bottom = line_split[-1]
right = line_split[-2]
top = line_split[-3]
left = line_split[-4]
class_name = ""
for name in line_split[:-4]:
class_name += name + " "
class_name = class_name[:-1]
bbox = left + " " + top + " " + right + " " + bottom
if is_difficult:
bounding_boxes.append({"class_name": class_name, "bbox": bbox, "used": False, "difficult": True})
is_difficult = False
else:
bounding_boxes.append({"class_name": class_name, "bbox": bbox, "used": False})
if class_name in gt_counter_per_class:
gt_counter_per_class[class_name] += 1
else:
gt_counter_per_class[class_name] = 1
if class_name not in already_seen_classes:
if class_name in counter_images_per_class:
counter_images_per_class[class_name] += 1
else:
counter_images_per_class[class_name] = 1
already_seen_classes.append(class_name)
with open(TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json", 'w') as outfile:
json.dump(bounding_boxes, outfile)
gt_classes = list(gt_counter_per_class.keys())
gt_classes = sorted(gt_classes)
n_classes = len(gt_classes)
dr_files_list = glob.glob(DR_PATH + '/*.txt')
dr_files_list.sort()
for class_index, class_name in enumerate(gt_classes):
bounding_boxes = []
for txt_file in dr_files_list:
file_id = txt_file.split(".txt", 1)[0]
file_id = os.path.basename(os.path.normpath(file_id))
temp_path = os.path.join(GT_PATH, (file_id + ".txt"))
if class_index == 0:
if not os.path.exists(temp_path):
error_msg = "Error. File not found: {}\n".format(temp_path)
error(error_msg)
lines = file_lines_to_list(txt_file)
for line in lines:
try:
tmp_class_name, confidence, left, top, right, bottom = line.split()
except:
line_split = line.split()
bottom = line_split[-1]
right = line_split[-2]
top = line_split[-3]
left = line_split[-4]
confidence = line_split[-5]
tmp_class_name = ""
for name in line_split[:-5]:
tmp_class_name += name + " "
tmp_class_name = tmp_class_name[:-1]
if tmp_class_name == class_name:
bbox = left + " " + top + " " + right + " " + bottom
bounding_boxes.append({"confidence": confidence, "file_id": file_id, "bbox": bbox})
bounding_boxes.sort(key=lambda x: float(x['confidence']), reverse=True)
with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile:
json.dump(bounding_boxes, outfile)
sum_AP = 0.0
ap_dictionary = {}
lamr_dictionary = {}
with open(RESULTS_FILES_PATH + "/results.txt", 'w') as results_file:
results_file.write("# AP and precision/recall per class\n")
count_true_positives = {}
for class_index, class_name in enumerate(gt_classes):
count_true_positives[class_name] = 0
dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json"
dr_data = json.load(open(dr_file))
nd = len(dr_data)
tp = [0] * nd
fp = [0] * nd
score = [0] * nd
score_threhold_idx = 0
for idx, detection in enumerate(dr_data):
file_id = detection["file_id"]
score[idx] = float(detection["confidence"])
if score[idx] >= score_threhold:
score_threhold_idx = idx
if show_animation:
ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*")
if len(ground_truth_img) == 0:
error("Error. Image not found with id: " + file_id)
elif len(ground_truth_img) > 1:
error("Error. Multiple image with id: " + file_id)
else:
img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0])
img_cumulative_path = RESULTS_FILES_PATH + "/images/" + ground_truth_img[0]
if os.path.isfile(img_cumulative_path):
img_cumulative = cv2.imread(img_cumulative_path)
else:
img_cumulative = img.copy()
bottom_border = 60
BLACK = [0, 0, 0]
img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK)
gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
ground_truth_data = json.load(open(gt_file))
ovmax = -1
gt_match = -1
bb = [float(x) for x in detection["bbox"].split()]
for obj in ground_truth_data:
if obj["class_name"] == class_name:
bbgt = [float(x) for x in obj["bbox"].split()]
bi = [max(bb[0], bbgt[0]), max(bb[1], bbgt[1]), min(bb[2], bbgt[2]), min(bb[3], bbgt[3])]
iw = bi[2] - bi[0] + 1
ih = bi[3] - bi[1] + 1
if iw > 0 and ih > 0:
ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0]
+ 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih
ov = iw * ih / ua
if ov > ovmax:
ovmax = ov
gt_match = obj
if show_animation:
status = "NO MATCH FOUND!"
min_overlap = MINOVERLAP
if ovmax >= min_overlap:
if "difficult" not in gt_match:
if not bool(gt_match["used"]):
tp[idx] = 1
gt_match["used"] = True
count_true_positives[class_name] += 1
with open(gt_file, 'w') as f:
f.write(json.dumps(ground_truth_data))
if show_animation:
status = "MATCH!"
else:
fp[idx] = 1
if show_animation:
status = "REPEATED MATCH!"
else:
fp[idx] = 1
if ovmax > 0:
status = "INSUFFICIENT OVERLAP"
"""
Draw image to show animation
"""
if show_animation:
height, width = img.shape[:2]
white = (255, 255, 255)
light_blue = (255, 200, 100)
green = (0, 255, 0)
light_red = (30, 30, 255)
margin = 10
# 1st line
v_pos = int(height - margin - (bottom_border / 2.0))
text = "Image: " + ground_truth_img[0] + " "
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " "
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue,
line_width)
if ovmax != -1:
color = light_red
if status == "INSUFFICIENT OVERLAP":
text = "IoU: {0:.2f}% ".format(ovmax * 100) + "< {0:.2f}% ".format(min_overlap * 100)
else:
text = "IoU: {0:.2f}% ".format(ovmax * 100) + ">= {0:.2f}% ".format(min_overlap * 100)
color = green
img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
# 2nd line
v_pos += int(bottom_border / 2.0)
rank_pos = str(idx + 1)
text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(
float(detection["confidence"]) * 100)
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
color = light_red
if status == "MATCH!":
color = green
text = "Result: " + status + " "
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
font = cv2.FONT_HERSHEY_SIMPLEX
if ovmax > 0:
bbgt = [int(round(float(x))) for x in gt_match["bbox"].split()]
cv2.rectangle(img, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), light_blue, 2)
cv2.rectangle(img_cumulative, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), light_blue, 2)
cv2.putText(img_cumulative, class_name, (bbgt[0], bbgt[1] - 5), font, 0.6, light_blue, 1,
cv2.LINE_AA)
bb = [int(i) for i in bb]
cv2.rectangle(img, (bb[0], bb[1]), (bb[2], bb[3]), color, 2)
cv2.rectangle(img_cumulative, (bb[0], bb[1]), (bb[2], bb[3]), color, 2)
cv2.putText(img_cumulative, class_name, (bb[0], bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA)
cv2.imshow("Animation", img)
cv2.waitKey(20)
output_img_path = RESULTS_FILES_PATH + "/images/detections_one_by_one/" + class_name + "_detection" + str(
idx) + ".jpg"
cv2.imwrite(output_img_path, img)
cv2.imwrite(img_cumulative_path, img_cumulative)
cumsum = 0
for idx, val in enumerate(fp):
fp[idx] += cumsum
cumsum += val
cumsum = 0
for idx, val in enumerate(tp):
tp[idx] += cumsum
cumsum += val
rec = tp[:]
for idx, val in enumerate(tp):
rec[idx] = float(tp[idx]) / np.maximum(gt_counter_per_class[class_name], 1)
prec = tp[:]
for idx, val in enumerate(tp):
prec[idx] = float(tp[idx]) / np.maximum((fp[idx] + tp[idx]), 1)
ap, mrec, mprec = voc_ap(rec[:], prec[:])
F1 = np.array(rec) * np.array(prec) * 2 / np.where((np.array(prec) + np.array(rec)) == 0, 1,
(np.array(prec) + np.array(rec)))
sum_AP += ap
text = "{0:.2f}%".format(
ap * 100) + " = " + class_name + " AP " # class_name + " AP = {0:.2f}%".format(ap*100)
if len(prec) > 0:
F1_text = "{0:.2f}".format(F1[score_threhold_idx]) + " = " + class_name + " F1 "
Recall_text = "{0:.2f}%".format(rec[score_threhold_idx] * 100) + " = " + class_name + " Recall "
Precision_text = "{0:.2f}%".format(prec[score_threhold_idx] * 100) + " = " + class_name + " Precision "
else:
F1_text = "0.00" + " = " + class_name + " F1 "
Recall_text = "0.00%" + " = " + class_name + " Recall "
Precision_text = "0.00%" + " = " + class_name + " Precision "
rounded_prec = ['%.2f' % elem for elem in prec]
rounded_rec = ['%.2f' % elem for elem in rec]
results_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n")
if len(prec) > 0:
print(text + "\t||\tscore_threhold=" + str(score_threhold) + " : " + "F1=" + "{0:.2f}".format(
F1[score_threhold_idx]) \
+ " ; Recall=" + "{0:.2f}%".format(
rec[score_threhold_idx] * 100) + " ; Precision=" + "{0:.2f}%".format(
prec[score_threhold_idx] * 100))
else:
print(text + "\t||\tscore_threhold=" + str(
score_threhold) + " : " + "F1=0.00% ; Recall=0.00% ; Precision=0.00%")
ap_dictionary[class_name] = ap
n_images = counter_images_per_class[class_name]
lamr, mr, fppi = log_average_miss_rate(np.array(rec), np.array(fp), n_images)
lamr_dictionary[class_name] = lamr
if draw_plot:
plt.plot(rec, prec, '-o')
area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]]
area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]]
plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r')
fig = plt.gcf()
fig.canvas.set_window_title('AP ' + class_name)
plt.title('class: ' + text)
plt.xlabel('Recall')
plt.ylabel('Precision')
axes = plt.gca()
axes.set_xlim([0.0, 1.0])
axes.set_ylim([0.0, 1.05])
fig.savefig(RESULTS_FILES_PATH + "/AP/" + class_name + ".png")
plt.cla()
plt.plot(score, F1, "-", color='orangered')
plt.title('class: ' + F1_text + "\nscore_threhold=" + str(score_threhold))
plt.xlabel('Score_Threhold')
plt.ylabel('F1')
axes = plt.gca()
axes.set_xlim([0.0, 1.0])
axes.set_ylim([0.0, 1.05])
fig.savefig(RESULTS_FILES_PATH + "/F1/" + class_name + ".png")
plt.cla()
plt.plot(score, rec, "-H", color='gold')
plt.title('class: ' + Recall_text + "\nscore_threhold=" + str(score_threhold))
plt.xlabel('Score_Threhold')
plt.ylabel('Recall')
axes = plt.gca()
axes.set_xlim([0.0, 1.0])
axes.set_ylim([0.0, 1.05])
fig.savefig(RESULTS_FILES_PATH + "/Recall/" + class_name + ".png")
plt.cla()
plt.plot(score, prec, "-s", color='palevioletred')
plt.title('class: ' + Precision_text + "\nscore_threhold=" + str(score_threhold))
plt.xlabel('Score_Threhold')
plt.ylabel('Precision')
axes = plt.gca()
axes.set_xlim([0.0, 1.0])
axes.set_ylim([0.0, 1.05])
fig.savefig(RESULTS_FILES_PATH + "/Precision/" + class_name + ".png")
plt.cla()
if show_animation:
cv2.destroyAllWindows()
if n_classes == 0:
print("No classes were detected. Please check the label files and make sure classes_path in get_map.py has been modified.")
return 0
results_file.write("\n# mAP of all classes\n")
mAP = sum_AP / n_classes
text = "mAP = {0:.2f}%".format(mAP * 100)
results_file.write(text + "\n")
print(text)
shutil.rmtree(TEMP_FILES_PATH)
"""
Count total of detection-results
"""
det_counter_per_class = {}
for txt_file in dr_files_list:
lines_list = file_lines_to_list(txt_file)
for line in lines_list:
class_name = line.split()[0]
if class_name in det_counter_per_class:
det_counter_per_class[class_name] += 1
else:
det_counter_per_class[class_name] = 1
dr_classes = list(det_counter_per_class.keys())
"""
Write number of ground-truth objects per class to results.txt
"""
with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file:
results_file.write("\n# Number of ground-truth objects per class\n")
for class_name in sorted(gt_counter_per_class):
results_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n")
"""
Finish counting true positives
"""
for class_name in dr_classes:
if class_name not in gt_classes:
count_true_positives[class_name] = 0
"""
Write number of detected objects per class to results.txt
"""
with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file:
results_file.write("\n# Number of detected objects per class\n")
for class_name in sorted(dr_classes):
n_det = det_counter_per_class[class_name]
text = class_name + ": " + str(n_det)
text += " (tp:" + str(count_true_positives[class_name]) + ""
text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n"
results_file.write(text)
"""
Plot the total number of occurrences of each class in the ground-truth
"""
if draw_plot:
window_title = "ground-truth-info"
plot_title = "ground-truth\n"
plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)"
x_label = "Number of objects per class"
output_path = RESULTS_FILES_PATH + "/ground-truth-info.png"
to_show = False
plot_color = 'forestgreen'
draw_plot_func(
gt_counter_per_class,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
'',
)
# """
# Plot the total number of occurences of each class in the "detection-results" folder
# """
# if draw_plot:
# window_title = "detection-results-info"
# # Plot title
# plot_title = "detection-results\n"
# plot_title += "(" + str(len(dr_files_list)) + " files and "
# count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values()))
# plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)"
# # end Plot title
# x_label = "Number of objects per class"
# output_path = RESULTS_FILES_PATH + "/detection-results-info.png"
# to_show = False
# plot_color = 'forestgreen'
# true_p_bar = count_true_positives
# draw_plot_func(
# det_counter_per_class,
# len(det_counter_per_class),
# window_title,
# plot_title,
# x_label,
# output_path,
# to_show,
# plot_color,
# true_p_bar
# )
"""
Draw log-average miss rate plot (Show lamr of all classes in decreasing order)
"""
if draw_plot:
window_title = "lamr"
plot_title = "log-average miss rate"
x_label = "log-average miss rate"
output_path = RESULTS_FILES_PATH + "/lamr.png"
to_show = False
plot_color = 'royalblue'
draw_plot_func(
lamr_dictionary,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
""
)
"""
Draw mAP plot (Show AP's of all classes in decreasing order)
"""
if draw_plot:
window_title = "mAP"
plot_title = "mAP = {0:.2f}%".format(mAP * 100)
x_label = "Average Precision"
output_path = RESULTS_FILES_PATH + "/mAP.png"
to_show = True
plot_color = 'royalblue'
draw_plot_func(
ap_dictionary,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
""
)
return mAP
def preprocess_gt(gt_path, class_names):
image_ids = os.listdir(gt_path)
results = {}
images = []
bboxes = []
for i, image_id in enumerate(image_ids):
lines_list = file_lines_to_list(os.path.join(gt_path, image_id))
boxes_per_image = []
image = {}
image_id = os.path.splitext(image_id)[0]
image['file_name'] = image_id + '.jpg'
image['width'] = 1
image['height'] = 1
# -----------------------------------------------------------------#
#   Thanks to the user 多学学英语吧 for the reminder;
#   this resolves the 'Results do not correspond to current coco set' issue
# -----------------------------------------------------------------#
image['id'] = str(image_id)
for line in lines_list:
difficult = 0
if "difficult" in line:
line_split = line.split()
left, top, right, bottom, _difficult = line_split[-5:]
class_name = ""
for name in line_split[:-5]:
class_name += name + " "
class_name = class_name[:-1]
difficult = 1
else:
line_split = line.split()
left, top, right, bottom = line_split[-4:]
class_name = ""
for name in line_split[:-4]:
class_name += name + " "
class_name = class_name[:-1]
left, top, right, bottom = float(left), float(top), float(right), float(bottom)
if class_name not in class_names:
continue
cls_id = class_names.index(class_name) + 1
bbox = [left, top, right - left, bottom - top, difficult, str(image_id), cls_id,
(right - left) * (bottom - top) - 10.0]
boxes_per_image.append(bbox)
images.append(image)
bboxes.extend(boxes_per_image)
results['images'] = images
categories = []
for i, cls in enumerate(class_names):
category = {}
category['supercategory'] = cls
category['name'] = cls
category['id'] = i + 1
categories.append(category)
results['categories'] = categories
annotations = []
for i, box in enumerate(bboxes):
annotation = {}
annotation['area'] = box[-1]
annotation['category_id'] = box[-2]
annotation['image_id'] = box[-3]
annotation['iscrowd'] = box[-4]
annotation['bbox'] = box[:4]
annotation['id'] = i
annotations.append(annotation)
results['annotations'] = annotations
return results
def preprocess_dr(dr_path, class_names):
image_ids = os.listdir(dr_path)
results = []
for image_id in image_ids:
lines_list = file_lines_to_list(os.path.join(dr_path, image_id))
image_id = os.path.splitext(image_id)[0]
for line in lines_list:
line_split = line.split()
confidence, left, top, right, bottom = line_split[-5:]
class_name = ""
for name in line_split[:-5]:
class_name += name + " "
class_name = class_name[:-1]
left, top, right, bottom = float(left), float(top), float(right), float(bottom)
result = {}
result["image_id"] = str(image_id)
if class_name not in class_names:
continue
result["category_id"] = class_names.index(class_name) + 1
result["bbox"] = [left, top, right - left, bottom - top]
result["score"] = float(confidence)
results.append(result)
return results
def get_coco_map(class_names, path):
GT_PATH = os.path.join(path, 'ground-truth')
DR_PATH = os.path.join(path, 'detection-results')
COCO_PATH = os.path.join(path, 'coco_eval')
if not os.path.exists(COCO_PATH):
os.makedirs(COCO_PATH)
GT_JSON_PATH = os.path.join(COCO_PATH, 'instances_gt.json')
DR_JSON_PATH = os.path.join(COCO_PATH, 'instances_dr.json')
with open(GT_JSON_PATH, "w") as f:
results_gt = preprocess_gt(GT_PATH, class_names)
json.dump(results_gt, f, indent=4)
with open(DR_JSON_PATH, "w") as f:
results_dr = preprocess_dr(DR_PATH, class_names)
json.dump(results_dr, f, indent=4)
if len(results_dr) == 0:
print("No objects were detected.")
return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
cocoGt = COCO(GT_JSON_PATH)
cocoDt = cocoGt.loadRes(DR_JSON_PATH)
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
return cocoEval.stats

117
utils_coco/coco_annotation.py Normal file

@ -0,0 +1,117 @@
# -------------------------------------------------------#
#   Process the COCO dataset: generate the txt files used for training from the json annotations
# -------------------------------------------------------#
import json
import os
from collections import defaultdict
# -------------------------------------------------------#
#   Paths to the COCO training and validation images
# -------------------------------------------------------#
train_datasets_path = "coco_dataset/train2017"
val_datasets_path = "coco_dataset/val2017"
# -------------------------------------------------------#
#   Paths to the COCO training and validation annotation files
# -------------------------------------------------------#
train_annotation_path = "coco_dataset/annotations/instances_train2017.json"
val_annotation_path = "coco_dataset/annotations/instances_val2017.json"
# -------------------------------------------------------#
#   Paths of the generated txt files
# -------------------------------------------------------#
train_output_path = "coco_train.txt"
val_output_path = "coco_val.txt"
if __name__ == "__main__":
name_box_id = defaultdict(list)
id_name = dict()
f = open(train_annotation_path, encoding='utf-8')
data = json.load(f)
annotations = data['annotations']
for ant in annotations:
id = ant['image_id']
name = os.path.join(train_datasets_path, '%012d.jpg' % id)
cat = ant['category_id']
if cat >= 1 and cat <= 11:
cat = cat - 1
elif cat >= 13 and cat <= 25:
cat = cat - 2
elif cat >= 27 and cat <= 28:
cat = cat - 3
elif cat >= 31 and cat <= 44:
cat = cat - 5
elif cat >= 46 and cat <= 65:
cat = cat - 6
elif cat == 67:
cat = cat - 7
elif cat == 70:
cat = cat - 9
elif cat >= 72 and cat <= 82:
cat = cat - 10
elif cat >= 84 and cat <= 90:
cat = cat - 11
name_box_id[name].append([ant['bbox'], cat])
f = open(train_output_path, 'w')
for key in name_box_id.keys():
f.write(key)
box_infos = name_box_id[key]
for info in box_infos:
x_min = int(info[0][0])
y_min = int(info[0][1])
x_max = x_min + int(info[0][2])
y_max = y_min + int(info[0][3])
box_info = " %d,%d,%d,%d,%d" % (
x_min, y_min, x_max, y_max, int(info[1]))
f.write(box_info)
f.write('\n')
f.close()
name_box_id = defaultdict(list)
id_name = dict()
f = open(val_annotation_path, encoding='utf-8')
data = json.load(f)
annotations = data['annotations']
for ant in annotations:
id = ant['image_id']
name = os.path.join(val_datasets_path, '%012d.jpg' % id)
cat = ant['category_id']
if cat >= 1 and cat <= 11:
cat = cat - 1
elif cat >= 13 and cat <= 25:
cat = cat - 2
elif cat >= 27 and cat <= 28:
cat = cat - 3
elif cat >= 31 and cat <= 44:
cat = cat - 5
elif cat >= 46 and cat <= 65:
cat = cat - 6
elif cat == 67:
cat = cat - 7
elif cat == 70:
cat = cat - 9
elif cat >= 72 and cat <= 82:
cat = cat - 10
elif cat >= 84 and cat <= 90:
cat = cat - 11
name_box_id[name].append([ant['bbox'], cat])
f = open(val_output_path, 'w')
for key in name_box_id.keys():
f.write(key)
box_infos = name_box_id[key]
for info in box_infos:
x_min = int(info[0][0])
y_min = int(info[0][1])
x_max = x_min + int(info[0][2])
y_max = y_min + int(info[0][3])
box_info = " %d,%d,%d,%d,%d" % (
x_min, y_min, x_max, y_max, int(info[1]))
f.write(box_info)
f.write('\n')
f.close()
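# Illustrative output format (made-up numbers, not real annotations): each line of
# coco_train.txt / coco_val.txt is the image path followed by space-separated boxes,
#   coco_dataset/train2017/000000000009.jpg 1,187,313,470,45 7,211,185,339,50
# where each box is x_min,y_min,x_max,y_max,class_id.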

116
utils_coco/get_map_coco.py Normal file

@ -0,0 +1,116 @@
import json
import os
import numpy as np
import torch
from PIL import Image
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from tqdm import tqdm
from utils.utils import cvtColor, preprocess_input, resize_image
from yolo import YOLO
# ---------------------------------------------------------------------------#
#   map_mode controls what this script computes when it runs
#   map_mode = 0: the whole mAP pipeline, i.e. obtain the predictions and compute the mAP.
#   map_mode = 1: only obtain the predictions.
#   map_mode = 2: only compute the mAP.
# ---------------------------------------------------------------------------#
map_mode = 0
# -------------------------------------------------------#
#   Paths to the validation annotations and images
# -------------------------------------------------------#
cocoGt_path = 'coco_dataset/annotations/instances_val2017.json'
dataset_img_path = 'coco_dataset/val2017'
# -------------------------------------------------------#
#   Output folder for the results, map_out by default
# -------------------------------------------------------#
temp_save_path = 'map_out/coco_eval'
class mAP_YOLO(YOLO):
# ---------------------------------------------------#
#   Detect objects in an image
# ---------------------------------------------------#
def detect_image(self, image_id, image, results):
# ---------------------------------------------------#
#   Compute the height and width of the input image
# ---------------------------------------------------#
image_shape = np.array(np.shape(image)[0:2])
# ---------------------------------------------------------#
#   Convert the image to RGB here to avoid errors when predicting on grayscale images.
#   The code only supports prediction on RGB images; every other type is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
#   Add gray padding bars to resize the image without distortion.
#   A plain resize can also be used for detection.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
#   Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
#   Feed the image into the network for prediction!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
#   Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
outputs = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
if outputs[0] is None:
return results
top_label = np.array(outputs[0][:, 6], dtype='int32')
top_conf = outputs[0][:, 4] * outputs[0][:, 5]
top_boxes = outputs[0][:, :4]
for i, c in enumerate(top_label):
result = {}
top, left, bottom, right = top_boxes[i]
result["image_id"] = int(image_id)
result["category_id"] = clsid2catid[c]
result["bbox"] = [float(left), float(top), float(right - left), float(bottom - top)]
result["score"] = float(top_conf[i])
results.append(result)
return results
if __name__ == "__main__":
if not os.path.exists(temp_save_path):
os.makedirs(temp_save_path)
cocoGt = COCO(cocoGt_path)
ids = list(cocoGt.imgToAnns.keys())
clsid2catid = cocoGt.getCatIds()
if map_mode == 0 or map_mode == 1:
yolo = mAP_YOLO(confidence=0.001, nms_iou=0.65)
with open(os.path.join(temp_save_path, 'eval_results.json'), "w") as f:
results = []
for image_id in tqdm(ids):
image_path = os.path.join(dataset_img_path, cocoGt.loadImgs(image_id)[0]['file_name'])
image = Image.open(image_path)
results = yolo.detect_image(image_id, image, results)
json.dump(results, f)
if map_mode == 0 or map_mode == 2:
cocoDt = cocoGt.loadRes(os.path.join(temp_save_path, 'eval_results.json'))
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
print("Get map done.")

158
voc_annotation.py Normal file

@ -0,0 +1,158 @@
import os
import random
import xml.etree.ElementTree as ET
import numpy as np
from utils.utils import get_classes
# --------------------------------------------------------------------------------------------------------------------------------#
#   annotation_mode specifies what this script computes when it runs
#   annotation_mode = 0: the whole annotation pipeline, i.e. the txt files in VOCdevkit/VOC2007/ImageSets plus the 2007_train.txt and 2007_val.txt used for training
#   annotation_mode = 1: only the txt files in VOCdevkit/VOC2007/ImageSets
#   annotation_mode = 2: only the 2007_train.txt and 2007_val.txt used for training
# --------------------------------------------------------------------------------------------------------------------------------#
annotation_mode = 0
# -------------------------------------------------------------------#
#   Must be modified: target information used to generate 2007_train.txt and 2007_val.txt
#   It only has to match the classes_path used for training and prediction
#   If the generated 2007_train.txt contains no target information,
#   it is because the classes were not set correctly
#   Only effective when annotation_mode is 0 or 2
# -------------------------------------------------------------------#
classes_path = 'model_data/voc_classes.txt'  # the names defined here are the object names in the xml files; their order is the one-hot order used during training.
# --------------------------------------------------------------------------------------------------------------------------------#
#   trainval_percent sets the ratio of (train + val) to test; by default (train + val) : test = 9 : 1
#   train_percent sets the ratio of train to val inside (train + val); by default train : val = 9 : 1
#   Only effective when annotation_mode is 0 or 1
# --------------------------------------------------------------------------------------------------------------------------------#
trainval_percent = 0.9
train_percent = 0.9
# -------------------------------------------------------#
#   Path to the folder containing the VOC dataset
#   Defaults to the VOC dataset in the root directory
# -------------------------------------------------------#
VOCdevkit_path = 'VOCdevkit'
VOCdevkit_sets = [('2007', 'train'), ('2007', 'val')]
classes, _ = get_classes(classes_path)
# -------------------------------------------------------#
#   Count the number of targets
# -------------------------------------------------------#
photo_nums = np.zeros(len(VOCdevkit_sets))  # number of train images and number of val images
nums = np.zeros(len(classes))  # number of boxes per class
def convert_annotation(year, image_id, list_file):
in_file = open(os.path.join(VOCdevkit_path, 'VOC%s/Annotations/%s.xml' % (year, image_id)), encoding='utf-8') # 'VOCdevkit\\VOC2007/Annotations/000001.xml'
tree = ET.parse(in_file)
root = tree.getroot()
for obj in root.iter('object'):
difficult = 0
if obj.find('difficult') != None:
difficult = obj.find('difficult').text
cls = obj.find('name').text
if cls not in classes or int(difficult) == 1:  # skip objects whose class is not in classes or that are marked difficult
continue
cls_id = classes.index(cls)  # the class id is the index of the class in the classes file
xmlbox = obj.find('bndbox')
b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)),
int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text)))
list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
# each line of list_file starts with the full image path, then, separated by spaces, the comma-separated coordinates and id of every object
nums[classes.index(cls)] = nums[classes.index(cls)] + 1  # count the number of objects per class
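# Illustrative result (made-up path and boxes): after convert_annotation runs, a line
# of 2007_train.txt looks like
#   /home/user/VOCdevkit/VOC2007/JPEGImages/000001.jpg 48,240,195,371,11 8,12,352,498,14
# i.e. the absolute image path followed by x_min,y_min,x_max,y_max,class_id for every object.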
if __name__ == "__main__":
random.seed(0)
if " " in os.path.abspath(VOCdevkit_path):
raise ValueError("The dataset folder path and the image names must not contain spaces, otherwise training will not work properly. Please fix them.")
if annotation_mode == 0 or annotation_mode == 1:
print("Generate txt in ImageSets.")
xmlfilepath = os.path.join(VOCdevkit_path, 'VOC2007/Annotations')
saveBasePath = os.path.join(VOCdevkit_path, 'VOC2007/ImageSets/Main')
temp_xml = os.listdir(xmlfilepath)
total_xml = [xml for xml in temp_xml if xml.endswith(".xml")]
num = len(total_xml)  # total number of samples in the original dataset, used to split the dataset
list = range(num)
tv = int(num * trainval_percent)  # size of the train + val set
tr = int(tv * train_percent)  # size of the train set inside train + val
trainval = random.sample(list, tv)  # sample tv indices from the whole set
train = random.sample(trainval, tr)  # sample tr indices from trainval
print("train and val size", tv)
print("train size", tr)
ftrainval = open(os.path.join(saveBasePath, 'trainval.txt'), 'w')
ftest = open(os.path.join(saveBasePath, 'test.txt'), 'w')
ftrain = open(os.path.join(saveBasePath, 'train.txt'), 'w')
fval = open(os.path.join(saveBasePath, 'val.txt'), 'w')
for i in list:
name = total_xml[i][:-4] + '\n'  # file name without the extension
if i in trainval:
ftrainval.write(name)
if i in train:
ftrain.write(name)
else:
fval.write(name)
else:
ftest.write(name)
ftrainval.close()
ftrain.close()
fval.close()
ftest.close()
print("Generate txt in ImageSets done.")
if annotation_mode == 0 or annotation_mode == 2:
print("Generate 2007_train.txt and 2007_val.txt for train.")
type_index = 0
for year, image_set in VOCdevkit_sets:
image_ids = open(os.path.join(VOCdevkit_path, 'VOC%s/ImageSets/Main/%s.txt' % (year, image_set)), # 'VOCdevkit\\VOC2007/ImageSets/Main/train.txt'
encoding='utf-8').read().strip().split()
list_file = open('%s_%s.txt' % (year, image_set), 'w', encoding='utf-8') # '2007_train.txt'
for image_id in image_ids:
list_file.write( # 'C:\\my_code\\a_python\\YOLO_all\\yolo_v3\\VOCdevkit/VOC2007/JPEGImages/000001.jpg'
'%s/VOC%s/JPEGImages/%s.jpg' % (os.path.abspath(VOCdevkit_path), year, image_id))  # the full file path is assembled here
convert_annotation(year, image_id, list_file)
list_file.write('\n')
photo_nums[type_index] = len(image_ids)  # record the number of train and val images
type_index += 1  # marks whether the train or the val set is being processed
list_file.close()
print("Generate 2007_train.txt and 2007_val.txt for train done.")
def printTable(List1, List2):
# for i in range(len(List1[0])):
for i, _ in enumerate(List1[0]):
print("|", end=' ')
for j in range(len(List1)):  # len(List1) is 2
print(List1[j][i].rjust(int(List2[j])), end=' ')
print("|", end=' ')
print()
str_nums = [str(int(x)) for x in nums]  # number of objects per class
tableData = [
classes, str_nums  # classes paired with their counts
]
colWidths = [0] * len(tableData)  # column widths; there are len(tableData) columns, here 2
len1 = 0
for i in range(len(tableData)):
for j in range(len(tableData[i])):
if len(tableData[i][j]) > colWidths[i]:
colWidths[i] = len(tableData[i][j])  # colWidths[i] holds the longest element in column i
printTable(tableData, colWidths)
if photo_nums[0] <= 500:
print("The training set has fewer than 500 images, which is a rather small amount of data. Please set a larger number of training epochs so that there are enough gradient-descent steps.")
if np.sum(nums) == 0:
print("No targets were found in the dataset. Please make sure classes_path matches your own dataset and that the label names are correct, otherwise training will have no effect!")
print("No targets were found in the dataset. Please make sure classes_path matches your own dataset and that the label names are correct, otherwise training will have no effect!")
print("No targets were found in the dataset. Please make sure classes_path matches your own dataset and that the label names are correct, otherwise training will have no effect!")
print("(Important things are worth saying three times.)")

41
webcam.py Normal file

@ -0,0 +1,41 @@
import time
import cv2
import numpy as np
from PIL import Image
from yolo import YOLO
yolo = YOLO()
capture = cv2.VideoCapture(0)
# 1 selects an external camera, 0 the built-in camera
ref, frame = capture.read()
fps = 0.0
while (True):
t1 = time.time()
# read one frame
ref, frame = capture.read()
if not ref:
break
# convert the format from BGR to RGB
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# convert to a PIL Image
frame = Image.fromarray(np.uint8(frame))
# run detection
frame = np.array(yolo.detect_image(frame))
# convert RGB back to BGR to match OpenCV's display format
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
fps = (fps + (1. / (time.time() - t1))) / 2
# print("fps= %.2f" % (fps))
frame = cv2.putText(frame, "fps= %.2f" % (fps), (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow("video", frame)
c = cv2.waitKey(1) & 0xff
# print(c)
if c == 113:
capture.release()
break
capture.release()
cv2.destroyAllWindows()

425
yolo.py Normal file

@ -0,0 +1,425 @@
import colorsys
import os
import time
import numpy as np
import torch
import torch.nn as nn
from PIL import ImageDraw, ImageFont
from nets.yolo import YoloBody
from utils.utils import (cvtColor, get_anchors, get_classes, preprocess_input,
resize_image, show_config)
from utils.utils_bbox import DecodeBox
'''
Notes you must read before training on your own dataset
'''
class YOLO(object):
_defaults = {
# --------------------------------------------------------------------------#
#   To predict with your own trained model you must modify model_path and classes_path!
#   model_path points to the weight file under the logs folder, classes_path to the txt under model_data
#
#   After training there are several weight files in the logs folder; pick one with a low validation loss.
#   A low validation loss does not guarantee a high mAP, only that the weights generalize well on the validation set.
#   If a shape mismatch occurs, also check that model_path and classes_path were set correctly during training
# --------------------------------------------------------------------------#
# "model_path": 'model_data/yolo_weights.pth',
# "classes_path": 'model_data/coco_classes.txt',
"model_path": 'logs/best_epoch_weights.pth',
"classes_path": 'model_data/cctsdb_classes.txt',
# ---------------------------------------------------------------------#
#   anchors_path is the txt file with the anchor boxes; usually left unchanged.
#   anchors_mask helps the code find the corresponding anchors; usually left unchanged.
# ---------------------------------------------------------------------#
"anchors_path": 'model_data/yolo_anchors.txt',
"anchors_mask": [[6, 7, 8], [3, 4, 5], [0, 1, 2]],
# ---------------------------------------------------------------------#
#   Input image size; must be a multiple of 32.
# ---------------------------------------------------------------------#
"input_shape": [416, 416],
# ---------------------------------------------------------------------#
#   Only prediction boxes with a score above the confidence threshold are kept
# ---------------------------------------------------------------------#
"confidence": 0.5,
# ---------------------------------------------------------------------#
#   The nms_iou threshold used for non-maximum suppression
# ---------------------------------------------------------------------#
"nms_iou": 0.3,
# ---------------------------------------------------------------------#
#   Controls whether letterbox_image is used to resize the input image without distortion
#   Repeated tests showed that turning letterbox_image off and resizing directly works better
# ---------------------------------------------------------------------#
"letterbox_image": False,
# -------------------------------#
#   Whether to use CUDA
#   Set it to False if there is no GPU
# -------------------------------#
"cuda": True
}
@classmethod
def get_defaults(cls, n):
if n in cls._defaults:
return cls._defaults[n]
else:
return "Unrecognized attribute name '" + n + "'"
# ---------------------------------------------------#
#   Initialize YOLO
# ---------------------------------------------------#
def __init__(self, **kwargs):
self.__dict__.update(self._defaults)  # update the instance attribute dict with the class _defaults
for name, value in kwargs.items():
setattr(self, name, value)
# ---------------------------------------------------#
#   Get the classes and the number of anchors
# ---------------------------------------------------#
self.class_names, self.num_classes = get_classes(self.classes_path)
self.anchors, self.num_anchors = get_anchors(self.anchors_path)
self.bbox_util = DecodeBox(self.anchors, self.num_classes, (self.input_shape[0], self.input_shape[1]),
self.anchors_mask)
# ---------------------------------------------------#
#   Set a different color for each class when drawing boxes
# ---------------------------------------------------#
hsv_tuples = [(x / self.num_classes, 1., 1.) for x in range(self.num_classes)]
self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
self.colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), self.colors))
self.generate()
show_config(**self._defaults)
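# Hedged usage note: because __init__ first copies _defaults and then applies the
# keyword arguments, any default can be overridden per instance, e.g. (values are
# only illustrative):
#   yolo = YOLO(confidence=0.3, nms_iou=0.45, cuda=False)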
# ---------------------------------------------------#
#   Build the model
# ---------------------------------------------------#
def generate(self, onnx=False):
# ---------------------------------------------------#
#   Build the yolov3 model and load its weights
# ---------------------------------------------------#
self.net = YoloBody(self.anchors_mask, self.num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.net.load_state_dict(torch.load(self.model_path, map_location=device))
self.net = self.net.eval()
print('{} model, anchors, and classes loaded.'.format(self.model_path))
# if not onnx:
# if self.cuda:
# self.net = nn.DataParallel(self.net)
# self.net = self.net.cuda()
if not onnx:
if self.cuda:
self.net = self.net.cuda()
# ---------------------------------------------------#
# 检测图片
#   Detect objects in an image
def detect_image(self, image, crop=False, count=False):
image_shape = np.array(np.shape(image)[0:2])  # np.shape(image) is h, w, c
# ---------------------------------------------------------#
#   Convert the image to RGB here to avoid errors when predicting on grayscale images.
#   The code only supports prediction on RGB images; every other type is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
#   Add gray padding bars to resize the image without distortion.
#   A plain resize can also be used for detection.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
#   Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
# after this transform image_data has shape 1, 3, 416, 416
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
#   Feed the image into the network for prediction!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
#   Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
if results[0] is None:
return image
top_label = np.array(results[0][:, 6], dtype='int32')
top_conf = results[0][:, 4] * results[0][:, 5]
top_boxes = results[0][:, :4]
# ---------------------------------------------------------#
#   Set the font and the box line thickness
# ---------------------------------------------------------#
font = ImageFont.truetype(font='model_data/simhei.ttf',
size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
thickness = int(max((image.size[0] + image.size[1]) // np.mean(self.input_shape), 1))
# ---------------------------------------------------------#
#   Count detections per class
# ---------------------------------------------------------#
if count:
print("top_label:", top_label)
classes_nums = np.zeros([self.num_classes])
for i in range(self.num_classes):
num = np.sum(top_label == i)
if num > 0:
print(self.class_names[i], " : ", num)
classes_nums[i] = num
print("classes_nums:", classes_nums)
# ---------------------------------------------------------#
#   Whether to crop the detected targets
# ---------------------------------------------------------#
if crop:
for i, c in list(enumerate(top_label)):
top, left, bottom, right = top_boxes[i]
top = max(0, np.floor(top).astype('int32'))
left = max(0, np.floor(left).astype('int32'))
bottom = min(image.size[1], np.floor(bottom).astype('int32'))
right = min(image.size[0], np.floor(right).astype('int32'))
dir_save_path = "img_crop"
if not os.path.exists(dir_save_path):
os.makedirs(dir_save_path)
crop_image = image.crop([left, top, right, bottom])
crop_image.save(os.path.join(dir_save_path, "crop_" + str(i) + ".png"), quality=95, subsampling=0)
print("save crop_" + str(i) + ".png to " + dir_save_path)
# ---------------------------------------------------------#
#   Draw the results on the image
# ---------------------------------------------------------#
for i, c in list(enumerate(top_label)):
predicted_class = self.class_names[int(c)]
box = top_boxes[i]
score = top_conf[i]
top, left, bottom, right = box
top = max(0, np.floor(top).astype('int32'))
left = max(0, np.floor(left).astype('int32'))
bottom = min(image.size[1], np.floor(bottom).astype('int32'))
right = min(image.size[0], np.floor(right).astype('int32'))
label = '{} {:.2f}'.format(predicted_class, score)
draw = ImageDraw.Draw(image)
label_size = draw.textsize(label, font)
label = label.encode('utf-8')
# print(label, top, left, bottom, right)
if top - label_size[1] >= 0:  # there is room above the box, so put the label on top
text_origin = np.array([left, top - label_size[1]])
else:  # otherwise put the label inside the box
text_origin = np.array([left, top + 1])
for i in range(thickness):  # line thickness is achieved by drawing the rectangle several times
draw.rectangle([left + i, top + i, right - i, bottom - i], outline=self.colors[c])
draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=self.colors[c])
draw.text(text_origin, str(label, 'UTF-8'), fill=(0, 0, 0), font=font)
del draw
return image
def get_FPS(self, image, test_interval):
image_shape = np.array(np.shape(image)[0:2])
# ---------------------------------------------------------#
#   Convert the image to RGB here to avoid errors when predicting on grayscale images.
#   The code only supports prediction on RGB images; every other type is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
#   Add gray padding bars to resize the image without distortion.
#   A plain resize can also be used for detection.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
#   Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
#   Feed the image into the network for prediction!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
#   Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
t1 = time.time()
for _ in range(test_interval):
with torch.no_grad():
# ---------------------------------------------------------#
#   Feed the image into the network for prediction!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
#   Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image,
conf_thres=self.confidence, nms_thres=self.nms_iou)
t2 = time.time()
tact_time = (t2 - t1) / test_interval
return tact_time
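# ---------------------------------------------------------#
# Usage sketch for get_FPS (a minimal sketch; the class/instance names
# and the image path are assumptions):
#     yolo = YOLO()
#     image = Image.open("img/street.jpg")
#     tact_time = yolo.get_FPS(image, test_interval=100)
#     print(str(tact_time) + ' seconds per image, ' + str(1 / tact_time) + ' FPS @batch_size 1')
# ---------------------------------------------------------#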
def detect_heatmap(self, image, heatmap_save_path):
import cv2
import matplotlib.pyplot as plt
def sigmoid(x):
y = 1.0 / (1.0 + np.exp(-x))
return y
# ---------------------------------------------------------#
# Convert the image to RGB here to avoid errors on grayscale inputs.
# The code only supports prediction on RGB images; any other format is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
# Add gray bars to the image so the resize does not distort it.
# A plain resize can also be used for detection instead.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
# Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
# Feed the image into the network for prediction
# ---------------------------------------------------------#
outputs = self.net(images)
plt.imshow(image, alpha=1)
plt.axis('off')
mask = np.zeros((image.size[1], image.size[0]))
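# for each output scale: reshape to (h, w, anchors, channels), apply a sigmoid to the
# objectness channel, keep the maximum over the anchors, resize to the input image size,
# and merge the scales into the mask with a per-pixel maximum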
for sub_output in outputs:
sub_output = sub_output.cpu().numpy()
b, c, h, w = np.shape(sub_output)
sub_output = np.transpose(np.reshape(sub_output, [b, 3, -1, h, w]), [0, 3, 4, 1, 2])[0]
score = np.max(sigmoid(sub_output[..., 4]), -1)
score = cv2.resize(score, (image.size[0], image.size[1]))
normed_score = (score * 255).astype('uint8')
mask = np.maximum(mask, normed_score)
plt.imshow(mask, alpha=0.5, interpolation='nearest', cmap="jet")
plt.axis('off')
plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
plt.margins(0, 0)
plt.savefig(heatmap_save_path, dpi=200, bbox_inches='tight', pad_inches=-0.1)
print("Save to the " + heatmap_save_path)
plt.show()
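# ---------------------------------------------------------#
# Usage sketch for detect_heatmap (a minimal sketch; the class/instance
# names, image path and output path are assumptions):
#     yolo = YOLO()
#     image = Image.open("img/street.jpg")
#     yolo.detect_heatmap(image, "model_data/heatmap_vision.png")
# ---------------------------------------------------------#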
def convert_to_onnx(self, simplify, model_path):
import onnx
self.generate(onnx=True)
im = torch.zeros(1, 3, *self.input_shape).to('cpu') # dummy input in BCHW order, shape (1, 3, H, W) taken from self.input_shape
input_layer_names = ["images"]
output_layer_names = ["output"]
# Export the model
print(f'Starting export with onnx {onnx.__version__}.')
torch.onnx.export(self.net,
im,
f=model_path,
verbose=False,
opset_version=12,
training=torch.onnx.TrainingMode.EVAL,
do_constant_folding=True,
input_names=input_layer_names,
output_names=output_layer_names,
dynamic_axes=None)
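# dynamic_axes=None exports a fixed input shape; a mapping such as
# {"images": {0: "batch"}} could be passed instead to allow a variable batch size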
# Checks
model_onnx = onnx.load(model_path) # load onnx model
onnx.checker.check_model(model_onnx) # check onnx model
# Simplify onnx
if simplify:
import onnxsim
print(f'Simplifying with onnx-simplifier {onnxsim.__version__}.')
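# note: the dynamic_input_shape / input_shapes keyword arguments below assume an
# older onnx-simplifier (0.3.x); newer releases drop them and accept only the model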
model_onnx, check = onnxsim.simplify(
model_onnx,
dynamic_input_shape=False,
input_shapes=None)
assert check, 'onnx-simplifier check failed'
onnx.save(model_onnx, model_path)
print('ONNX model saved as {}'.format(model_path))
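# ---------------------------------------------------------#
# Usage sketch for convert_to_onnx (a minimal sketch; the class/instance
# names and the output path are assumptions):
#     yolo = YOLO()
#     yolo.convert_to_onnx(simplify=True, model_path="model_data/models.onnx")
# ---------------------------------------------------------#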
def get_map_txt(self, image_id, image, class_names, map_out_path):
f = open(os.path.join(map_out_path, "detection-results/" + image_id + ".txt"), "w")
image_shape = np.array(np.shape(image)[0:2])
# ---------------------------------------------------------#
# Convert the image to RGB here to avoid errors on grayscale inputs.
# The code only supports prediction on RGB images; any other format is converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
# Add gray bars to the image so the resize does not distort it.
# A plain resize can also be used for detection instead.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
# Add the batch_size dimension
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
# Feed the image into the network for prediction
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
# Stack the prediction boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
if results[0] is None:
f.close() # close the results file before the early return so the handle is not leaked
return
top_label = np.array(results[0][:, 6], dtype='int32')
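# final score = objectness confidence * class probability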
top_conf = results[0][:, 4] * results[0][:, 5]
top_boxes = results[0][:, :4]
for i, c in list(enumerate(top_label)):
predicted_class = self.class_names[int(c)]
box = top_boxes[i]
score = str(top_conf[i])
top, left, bottom, right = box
if predicted_class not in class_names:
continue
f.write("%s %s %s %s %s %s\n" % (
predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)), str(int(bottom))))
f.close()
return
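# ---------------------------------------------------------#
# Usage sketch for get_map_txt (a minimal sketch of how a mAP script would
# call it; the class/instance names, image id, paths and class list are
# assumptions, and map_out/detection-results is expected to exist beforehand):
#     yolo = YOLO()
#     class_names, _ = get_classes("model_data/voc_classes.txt")
#     image = Image.open("VOCdevkit/VOC2007/JPEGImages/000001.jpg")
#     yolo.get_map_txt("000001", image, class_names, "map_out")
# ---------------------------------------------------------#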