# Main network
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import numpy as np
import pickle
import glob
from timeit import default_timer as timer
import time
import warnings
# Try a BiGRU network
from model import *
# Suppress warning messages (disabled)
# warnings.filterwarnings("ignore")
"""
出现过错误的地方反映了基础不牢
交叉熵https://blog.csdn.net/dss_dssssd/article/details/84036913
label one-hothttps://blog.csdn.net/qq_34914551/article/details/88700334
"""
LSTM_UNITS = 92
MINI_BATCH = 10
TRAIN_STEPS_PER_EPOCH = 12000
VALIDATION_STEPS_PER_EPOCH = 800
DATA_DIR = 'D:\\PyProject\\malware_traffic\\3_Packet\\'
CHECKPOINTS_DIR = './checkpoints/'
dict_5class = {0: 'Normal', 1: 'BFSSH', 2: 'Infilt', 3: 'HttpDoS', 4: 'DDoS'}
PACKET_NUM_PER_SESSION = 14
PACKET_LEN = 100
class ByteBlock(nn.Module):
    """
    Byte-level convolutional block: stacked Conv1d + Tanh (+ optional MaxPool),
    followed by global max pooling and a fully connected projection to 128 dims.
    """
    def __init__(self, in_channels, nb_filter=(64, 100), filter_length=(3, 3),
                 subsample=(2, 1), pool_length=(2, 2)):
        super(ByteBlock, self).__init__()
        layers = []
        for i in range(len(nb_filter)):
            layers.append(nn.Conv1d(in_channels, nb_filter[i], kernel_size=filter_length[i],
                                    padding=0, stride=subsample[i]))
            layers.append(nn.Tanh())
            if pool_length[i]:
                layers.append(nn.MaxPool1d(pool_length[i]))
            in_channels = nb_filter[i]

        self.block = nn.Sequential(*layers)
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(nb_filter[-1], 128)

    def forward(self, x):
        x = self.block(x)
        x = self.global_pool(x).squeeze(dim=2)
        x = torch.nn.functional.relu(self.fc(x))
        return x
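# Shape sketch for the way ByteBlock is used in MyModel below (an informal
# walk-through, not executed): the 100-byte axis is treated as the Conv1d
# channel dimension and the 256-dim one-hot axis as the sequence dimension.
#   ByteBlock(100, (128, 256), (5, 5), (1, 1), (2, 2)) on input [14, 100, 256]
#   -> Conv1d(100, 128, k=5) [14, 128, 252] -> MaxPool1d(2) [14, 128, 126]
#   -> Conv1d(128, 256, k=5) [14, 256, 122] -> MaxPool1d(2) [14, 256, 61]
#   -> AdaptiveMaxPool1d(1) + squeeze [14, 256] -> Linear(256, 128) [14, 128]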
class FlowDataset(data.Dataset):
    """
    Simple dataset wrapper (currently unused; the training script below uses
    MyDataset instead).
    """
    def __init__(self, sessions, labels, indices):
        self.sessions = sessions
        self.labels = labels
        self.indices = indices

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, index):
        idx = self.indices[index]
        session = torch.tensor(self.sessions[idx], dtype=torch.int64)
        label = torch.tensor(self.labels[idx], dtype=torch.int64)
        return session, label
def update_confusion_matrix(confusion_matrix, actual_lb, predict_lb):
    for idx, value in enumerate(actual_lb):
        p_value = predict_lb[idx]
        confusion_matrix[value, p_value] += 1
    return confusion_matrix


def truncate(f, n):
    # Truncate (not round) f to n decimal places; np.floor avoids the deprecated np.math alias.
    trunc_f = np.floor(f * 10 ** n) / 10 ** n
    return '{:.2f}'.format(trunc_f)
def binarize(x, sz=256):
    # One-hot encode byte values; the -1 padding value maps to an all-zero vector
    # (torch.nn.functional.one_hot itself rejects negative indices).
    mask = (x >= 0).unsqueeze(-1).float()
    return torch.nn.functional.one_hot(x.clamp(min=0), sz).float() * mask


class OneHotEncodingLayer(nn.Module):
    def __init__(self, sz=256):
        super(OneHotEncodingLayer, self).__init__()
        self.size = sz

    def forward(self, x):
        return binarize(x, self.size)
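# Illustrative example (not executed): OneHotEncodingLayer(4)(torch.tensor([2, -1]))
# yields [[0., 0., 1., 0.], [0., 0., 0., 0.]] -- padding positions stay all-zero.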
"""
# 可以改为 FlowDataLoader 中数据
def mini_batch_generator(dataset, batch_size):
data_loader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
while True:
for sessions, labels in data_loader:
Xbatch = torch.ones((batch_size, PACKET_NUM_PER_SESSION, PACKET_LEN), dtype=torch.int64) * -1
Ybatch = torch.ones((batch_size, 5), dtype=torch.int64) * -1
for batch_idx, (session, label) in enumerate(zip(sessions, labels)):
for i, packet in enumerate(session):
if i < PACKET_NUM_PER_SESSION:
for j, byte in enumerate(packet[:PACKET_LEN]):
Xbatch[batch_idx, i, (PACKET_LEN - 1 - j)] = byte
Ybatch[batch_idx] = torch.nn.functional.one_hot(label, num_classes=5)[0]
yield Xbatch, Ybatch
"""
class MyDataset(data.Dataset):
    """
    Converts each raw session into a fixed-size byte matrix plus a class index.
    """
    def __init__(self, sessions, labels, indices):
        self.sessions = sessions
        self.labels = labels
        self.indices = indices
        self.packet_num_per_session = PACKET_NUM_PER_SESSION
        self.packet_len = PACKET_LEN

    def __getitem__(self, index):
        idx = self.indices[index]
        X = torch.ones((self.packet_num_per_session, self.packet_len), dtype=torch.int64) * -1
        for i, packet in enumerate(self.sessions[idx]):
            if i < self.packet_num_per_session:
                for j, byte in enumerate(packet[:self.packet_len]):
                    X[i, (self.packet_len - 1 - j)] = byte
        # label = self.labels[idx].astype(np.int64)  # CrossEntropyLoss takes class indices directly, no explicit one-hot needed
        # y = torch.nn.functional.one_hot(torch.from_numpy(label), num_classes=5)[0]
        y = self.labels[idx][0].astype(np.int64)
        return X, y

    def __len__(self):
        return len(self.indices)
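# Each item is a pair (X, y): X is a [14, 100] int64 matrix whose i-th row holds
# the first 100 bytes of packet i, written in reverse order from the right-hand
# end (unfilled positions keep the -1 padding value); y is the class index 0-4.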
class MyModel(nn.Module):
    """
    Model definition: per-packet CNN encoders followed by a bidirectional GRU
    with attention over the packets of a flow.
    """
    def __init__(self, flow_len, packet_len, gru_units):
        super(MyModel, self).__init__()
        self.packet_len = packet_len
        self.flow_len = flow_len
        self.batch_size = 10
        self.gru_hidden_size = gru_units
        self.embedding = OneHotEncodingLayer(sz=256)  # one-hot byte encoding
        self.block2 = ByteBlock(self.packet_len, (128, 256), (5, 5), (1, 1), (2, 2))
        self.block3 = ByteBlock(self.packet_len, (192, 320), (7, 5), (1, 1), (2, 2))
        self.lstm_layer = nn.GRU(256, self.gru_hidden_size, dropout=0.1, bidirectional=True)
        self.dense_layer = nn.Linear(self.gru_hidden_size * 2, 5)  # output size = number of classes
        # self.output = nn.Softmax(dim=1)  # not needed: CrossEntropyLoss applies log-softmax itself

    def forward(self, x):  # x: [batch_size, flow_len, packet_len] = [10, 14, 100]
        embeddings_list = self.embedding(x)  # [10, 14, 100, 256]
        # Allocate on the same device as the input and use the actual batch size.
        encoder_list = torch.zeros((x.size(0), self.flow_len, 256), device=x.device)
        for ix, embeddings in enumerate(embeddings_list):  # [14, 100, 256]
            encoder1 = self.block2(embeddings)  # [14, 128]
            encoder2 = self.block3(embeddings)  # [14, 128]
            encoder = torch.cat([encoder1, encoder2], 1)  # [14, 256]
            encoder_list[ix] = encoder
        # GRU input shape: [seq_len, batch_size, input_size]
        encoder_list = encoder_list.permute(1, 0, 2)  # [10, 14, 256] -> [14, 10, 256]
        biLSTM, final_hidden_state = self.lstm_layer(encoder_list)  # [14, 10, 184], [2, 10, 92]
        biLSTM = biLSTM.permute(1, 0, 2)  # [10, 14, 184]
        attn_output, attention = self.attention_net(biLSTM, final_hidden_state)
        dense = self.dense_layer(attn_output)
        return dense

    # lstm_output : [batch_size, n_step, gru_hidden_size * num_directions(=2)], the F matrix
    def attention_net(self, lstm_output, final_state):
        # Concatenate the forward and backward final hidden states per batch element.
        # hidden : [batch_size, gru_hidden_size * num_directions(=2), 1]
        hidden = torch.cat([final_state[0], final_state[1]], dim=1).unsqueeze(2)
        # attn_weights : [batch_size, n_step]; dot product of each step with the final state
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)
        soft_attn_weights = torch.nn.functional.softmax(attn_weights, 1)
        # [batch_size, gru_hidden_size * 2, n_step] x [batch_size, n_step, 1]
        # = [batch_size, gru_hidden_size * 2, 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return context, soft_attn_weights  # context : [batch_size, gru_hidden_size * num_directions(=2)]
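# Attention walk-through with the shapes used here (batch 10, 14 packets,
# 92 GRU units per direction; informal, not executed):
#   lstm_output [10, 14, 184] bmm hidden [10, 184, 1] -> attn_weights [10, 14]
#   softmax over the 14 packet positions, then
#   lstm_output^T [10, 184, 14] bmm weights [10, 14, 1] -> context [10, 184]
# The context vector is a per-flow weighted sum of packet representations.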
def save_result(cf_ma, len_test, epochs=8):
    """
    Build metrics_list from the confusion matrix and append the test/training
    results to a text file.
    :param cf_ma: 5x5 confusion matrix (rows = actual class, columns = predicted class)
    :param len_test: number of evaluated samples
    :param epochs: number of training epochs (recorded in the report)
    :return:
    """
    metrics_list = []
    for i in range(5):
        if i == 0:
            metrics_list.append(
                [dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
                 str(cf_ma[i, 4]), '--', '--', '--'])
        else:
            acc = truncate((float(len_test - cf_ma[:, i].sum() - cf_ma[i, :].sum() + cf_ma[i, i] * 2) / len_test) * 100,
                           2)
            tpr = truncate((float(cf_ma[i, i]) / cf_ma[i].sum()) * 100, 2)
            fpr = truncate((float(cf_ma[0, i]) / cf_ma[0].sum()) * 100, 2)
            metrics_list.append(
                [dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
                 str(cf_ma[i, 4]), str(acc), str(tpr), str(fpr)])
    overall_acc = truncate(
        (float(cf_ma[0, 0] + cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / len_test) * 100, 2)
    overall_tpr = truncate((float(cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / cf_ma[1:].sum()) * 100, 2)
    overall_fpr = truncate((float(cf_ma[0, 1:].sum()) / cf_ma[0, :].sum()) * 100, 2)
    with open('iscx12_cnn_rnn_5class_attn.txt', 'a') as f:
        f.write("\n")
        t = time.strftime('%Y-%m-%d %X', time.localtime())
        f.write(t + "\n")
        f.write('CLASS_NUM: 5\n')
        f.write('PACKET_LEN: ' + str(PACKET_LEN) + "\n")
        f.write('PACKET_NUM_PER_SESSION: ' + str(PACKET_NUM_PER_SESSION) + "\n")
        f.write('MINI_BATCH: ' + str(MINI_BATCH) + "\n")
        f.write('TRAIN_EPOCHS: ' + str(epochs) + "\n")
        f.write('DATA_DIR: ' + DATA_DIR + "\n")
        f.write("label\tindex\t0\t1\t2\t3\t4\tACC\tTPR\tFPR\n")
        for metrics in metrics_list:
            f.write('\t'.join(metrics) + "\n")
        f.write('Overall accuracy: ' + str(overall_acc) + "\n")
        f.write('Overall TPR: ' + str(overall_tpr) + "\n")
        f.write('Overall FPR: ' + str(overall_fpr) + "\n")
        # f.write('Train time(second): ' + str(int(train_time)) + "\n")
        # f.write('Test time(second): ' + str(int(test_time)) + "\n\n")
        f.write("\n\n")
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    num = 0
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()  # clear accumulated gradients
        outputs = model(inputs)
        num += 1
        # print("{}/{} outputs & label shape: ".format(num, len(dataloader)), outputs.shape, labels.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    return running_loss / len(dataloader.dataset)
def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    cf_ma = np.zeros((5, 5), dtype=int)
    num = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            predicted_labels = torch.argmax(outputs, dim=1).cpu().numpy()
            true_labels = labels.cpu().numpy()
            cf_ma = update_confusion_matrix(cf_ma, true_labels, predicted_labels)
            num += predicted_labels.shape[0]
    print("num len: ", num)
    save_result(cf_ma, num)
    return running_loss / len(dataloader.dataset), cf_ma
def load_data():
    """
    Load the preprocessed sessions and labels from the pickled ISCX2012 files
    and build the train/test index splits.
    :return:
    """
    t1 = timer()
    sessions = []
    labels = []
    num_pkls = len(glob.glob(DATA_DIR + 'ISCX2012_labels_*.pkl'))  # count the label pickle files
    for i in range(num_pkls):
        # if i != 1:
        #     continue
        session_pkl = DATA_DIR + 'ISCX2012_pcaps_' + str(i) + '.pkl'
        session_lists = pickle.load(open(session_pkl, 'rb'))  # deserialize
        sessions.extend(session_lists.values.tolist())
        label_pkl = DATA_DIR + 'ISCX2012_labels_' + str(i) + '.pkl'
        label_lists = pickle.load(open(label_pkl, 'rb'))
        labels.extend(label_lists.values.tolist())
        print(i)
    t2 = timer()
    print("load data time: ", t2 - t1)
    labels = np.array(labels)

    normal_indices = np.where(labels == 0)[0]  # array of row indices where label == 0
    # The full set of normal flows is too large to train on comfortably, so 100000 of
    # them are sampled here. Ideally, limit the number of normal flows during
    # preprocessing instead, to save memory.
    normal_indices = np.random.choice(normal_indices, 100000, replace=False)
    attack_indices = [np.where(labels == i)[0] for i in range(1, 5)]  # list of index arrays for labels 1-4
    # np.random.choice samples with replacement by default; pass replace=False to avoid duplicates.
    test_normal_indices = np.random.choice(normal_indices, int(len(normal_indices) * 0.4), replace=False)
    test_attack_indices = np.concatenate(  # 40% of each attack class goes to the test split
        [np.random.choice(attack_indices[i], int(len(attack_indices[i]) * 0.4), replace=False) for i in range(4)])
    test_indices = np.concatenate([test_normal_indices, test_attack_indices]).astype(int)
    # train_indices = np.array(list(set(np.arange(len(labels))) - set(test_indices)))
    attack_indices = np.concatenate(attack_indices).astype(int)
    indices = np.concatenate([normal_indices, attack_indices]).astype(int)
    train_indices = np.array(list(set(indices) - set(test_indices)))
    return sessions, labels, train_indices, test_indices
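# Resulting split (descriptive note): roughly 60% of the sampled normal flows and
# 60% of each attack class end up in train_indices, with the remaining 40% of each
# class in test_indices; the split is stratified per class but the RNG is not seeded.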
# Hyperparameters
NUM_EPOCHS = 8
LEARNING_RATE = 0.001

# Create the model instance
model = MyModel(PACKET_NUM_PER_SESSION, PACKET_LEN, LSTM_UNITS)
# model = PLA_Attention_Model(100, 100, 50, 100, 14)

# Move the model to the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Data loaders for the training and validation sets
sessions, labels, train_indices, test_indices = load_data()
train_dataset = MyDataset(sessions, labels, train_indices)  # num_workers > 0 is unreliable on Windows, so the default (0) is kept
train_dataloader = data.DataLoader(train_dataset, batch_size=MINI_BATCH, shuffle=True, drop_last=True)
val_dataset = MyDataset(sessions, labels, test_indices)  # drop_last=True discards the final incomplete batch, which would otherwise raise an error
val_dataloader = data.DataLoader(val_dataset, batch_size=MINI_BATCH, shuffle=False, drop_last=True)
# Start training
start_time = timer()
for epoch in range(NUM_EPOCHS):
    # Train for one epoch
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    # Evaluate on the validation set
    val_loss, confusion_matrix = evaluate(model, val_dataloader, criterion, device)
    # Report the training and validation losses and the confusion matrix
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    print(f"Train Loss: {truncate(train_loss, 4)}")
    print(f"Validation Loss: {truncate(val_loss, 4)}")
    print("Confusion Matrix:")
    for i in range(5):
        for j in range(5):
            print(confusion_matrix[i][j], end="\t")
        print()
    print("---------------------------------")
end_time = timer()
training_time = end_time - start_time
print(f"Training Time: {truncate(training_time, 2)} seconds")
"""
Epoch 7/10
Train Loss: 0.35
Validation Loss: 0.35
Confusion Matrix:
39794 43 45 20 98
472 2272 1 0 0
3832 1 31 3 0
1166 0 0 136 0
56 0 1 0 8219
---------------------------------
Train Loss: 0.35
"""