# Graduation_Project/LYZ/cnn_gru_clutering.py
# Main network
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.nn.functional import normalize
import numpy as np
import pickle
import glob
from timeit import default_timer as timer
import time
import warnings
# Try a BiGRU network
from model import *
# Suppress "Warning" messages
# warnings.filterwarnings("ignore")
"""
Notes on past mistakes (they exposed gaps in the fundamentals):
Cross-entropy: https://blog.csdn.net/dss_dssssd/article/details/84036913
Label to one-hot: https://blog.csdn.net/qq_34914551/article/details/88700334
Experiment with an adaptive data class
"""
LSTM_UNITS = 92
MINI_BATCH = 10
TRAIN_STEPS_PER_EPOCH = 12000
VALIDATION_STEPS_PER_EPOCH = 800
DATA_DIR = 'D:\\PyProject\\malware_traffic\\3_Packet\\'
CHECKPOINTS_DIR = './checkpoints/'
dict_5class = {0: 'Normal', 1: 'BFSSH', 2: 'Infilt', 3: 'HttpDoS', 4: 'DDoS'}
class_num = 5
PACKET_NUM_PER_SESSION = 14
PACKET_LEN = 100
class ByteBlock(nn.Module):
    """
    Convolutional neural network block: stacked Conv1d + Tanh (+ MaxPool),
    followed by global max pooling and a fully connected layer.
    """
    def __init__(self, in_channels, nb_filter=(64, 100), filter_length=(3, 3),
                 subsample=(2, 1), pool_length=(2, 2)):
        super(ByteBlock, self).__init__()
        layers = []
        for i in range(len(nb_filter)):
            layers.append(nn.Conv1d(in_channels, nb_filter[i], kernel_size=filter_length[i],
                                    padding=0, stride=subsample[i]))
            layers.append(nn.Tanh())
            if pool_length[i]:
                layers.append(nn.MaxPool1d(pool_length[i]))
            in_channels = nb_filter[i]
        self.block = nn.Sequential(*layers)
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(nb_filter[-1], 128)

    def forward(self, x):
        x = self.block(x)
        x = self.global_pool(x).squeeze(dim=2)
        x = torch.nn.functional.relu(self.fc(x))
        return x
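# Illustrative only (my addition): a minimal shape check for ByteBlock under the
# parameters FEMnet uses below. The helper name is hypothetical and it is never
# called by the training pipeline.
def _byteblock_shape_demo():
    block = ByteBlock(in_channels=PACKET_LEN, nb_filter=(128, 256), filter_length=(5, 5),
                      subsample=(1, 1), pool_length=(2, 2))
    dummy = torch.randn(PACKET_NUM_PER_SESSION, PACKET_LEN, 256)  # [packets, channels, one-hot dim]
    out = block(dummy)  # global max pool + FC -> [PACKET_NUM_PER_SESSION, 128]
    return out.shape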
def update_confusion_matrix(confusion_matrix, actual_lb, predict_lb):
    for idx, value in enumerate(actual_lb):
        p_value = predict_lb[idx]
        confusion_matrix[value, p_value] += 1
    return confusion_matrix


def truncate(f, n):
    # Truncate (not round) f to n decimal places, then format with two decimals,
    # e.g. truncate(3.14159, 2) -> '3.14'
    trunc_f = math.floor(f * 10 ** n) / 10 ** n
    return '{:.2f}'.format(trunc_f)


def binarize(x, sz=256):
    return torch.nn.functional.one_hot(x, sz).float()
class OneHotEncodingLayer(nn.Module):
    def __init__(self, sz=256):
        super(OneHotEncodingLayer, self).__init__()
        self.size = sz

    def forward(self, x):
        return torch.nn.functional.one_hot(x, num_classes=self.size).float()
class MyDataset(data.Dataset):
    """
    Data conversion / encoding (the one-hot encoding itself is applied later in the model).
    """
    def __init__(self, sessions, labels, indices):
        self.sessions = sessions
        self.labels = labels
        self.indices = indices
        self.packet_num_per_session = PACKET_NUM_PER_SESSION
        self.packet_len = PACKET_LEN

    def __getitem__(self, index):
        idx = self.indices[index]
        # Pad with 0: a padding value of -1 would make torch.nn.functional.one_hot fail,
        # since class indices must be non-negative.
        X = torch.zeros((self.packet_num_per_session, self.packet_len), dtype=torch.int64)
        for i, packet in enumerate(self.sessions[idx]):
            if i < self.packet_num_per_session:  # keep only the first packet_num_per_session packets per session
                for j, byte in enumerate(packet[:self.packet_len]):  # keep only the first PACKET_LEN bytes per packet
                    X[i, (self.packet_len - 1 - j)] = byte
        # label = self.labels[idx].astype(np.int64)  # CrossEntropyLoss handles integer class labels directly
        # y = torch.nn.functional.one_hot(torch.from_numpy(label), num_classes=5)[0]
        y = self.labels[idx][0].astype(np.int64)
        return X, y

    def __len__(self):
        return len(self.indices)
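# Illustrative only (my addition): __getitem__ lays a packet's bytes out reversed and
# right-aligned in a PACKET_LEN-wide row, padding the unused prefix with 0. For example,
# a 3-byte packet [0x16, 0x03, 0x01] becomes
#   row = [0, 0, ..., 0, 0x01, 0x03, 0x16]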
class FEMnet(nn.Module):
    """
    Feature extraction module.
    """
    def __init__(self, flow_len, packet_len, gru_units):
        super(FEMnet, self).__init__()
        self.packet_len = packet_len
        self.flow_len = flow_len
        self.batch_size = 10
        self.gru_hidden_size = gru_units
        self.rep_dim = gru_units
        self.embedding = OneHotEncodingLayer(sz=256)  # one-hot encoding
        self.block2 = ByteBlock(self.packet_len, (128, 256), (5, 5), (1, 1), (2, 2))
        self.block3 = ByteBlock(self.packet_len, (192, 320), (7, 5), (1, 1), (2, 2))
        self.lstm_layer = nn.GRU(256, self.gru_hidden_size, dropout=0.1, bidirectional=True)
        self.dense_layer = nn.Linear(self.gru_hidden_size * 2, 5)  # used by forward() below
        # self.output = nn.Softmax(dim=1)  # not needed: CrossEntropyLoss applies log-softmax itself

    def forward(self, x):  # x: [batch_size, flow_len, packet_len] = [10, 14, 100]
        embeddings_list = self.embedding(x)  # [10, 14, 100, 256]
        encoder_list = torch.zeros((self.batch_size, self.flow_len, 256), device=x.device)
        for ix, embeddings in enumerate(embeddings_list):  # [14, 100, 256]
            encoder1 = self.block2(embeddings)  # [14, 128]
            encoder2 = self.block3(embeddings)  # [14, 128]
            encoder = torch.cat([encoder1, encoder2], 1)  # [14, 256]
            encoder_list[ix] = encoder
        # RNN input layout: [seq_len, batch_size, hidden_size]
        encoder_list = encoder_list.permute(1, 0, 2)  # [10, 14, 256] -> [14, 10, 256]
        biLSTM, final_hidden_state = self.lstm_layer(encoder_list)  # [14, 10, 184], [2, 10, 92]
        biLSTM = biLSTM.permute(1, 0, 2)  # [10, 14, 184]
        attn_output, attention = self.attention_net(biLSTM, final_hidden_state)
        dense = self.dense_layer(attn_output)
        return dense

    # lstm_output : [batch_size, n_step, self.gru_hidden_size * num_directions(=2)], the F matrix
    def attention_net(self, lstm_output, final_state):
        # hidden : [batch_size, self.gru_hidden_size * num_directions(=2), 1(=n_layer)]
        hidden = final_state.view(-1, self.gru_hidden_size * 2, 1)  # -1: this dimension is inferred from the others
        # attn_weights : [batch_size, n_step]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # dot-product scores; drop the third dimension
        soft_attn_weights = torch.nn.functional.softmax(attn_weights, 1)
        # [batch_size, self.gru_hidden_size * num_directions(=2), n_step] * [batch_size, n_step, 1]
        # = [batch_size, self.gru_hidden_size * num_directions(=2), 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        # context : [batch_size, self.gru_hidden_size * num_directions(=2)]
        return context, soft_attn_weights
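# Illustrative only (my addition): the attention shapes used above, traced with this
# file's hyper-parameters (batch 10, 14 packets per session, GRU hidden size 92).
# The helper name is hypothetical and is not used by the training pipeline.
def _attention_shape_demo():
    lstm_output = torch.randn(10, 14, 2 * LSTM_UNITS)  # [batch, n_step, 2*hidden]
    final_state = torch.randn(2, 10, LSTM_UNITS)       # [num_directions, batch, hidden]
    hidden = final_state.view(-1, 2 * LSTM_UNITS, 1)   # [10, 184, 1]
    attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)          # [10, 14]
    soft_attn_weights = torch.nn.functional.softmax(attn_weights, 1)  # attention over the 14 packets
    context = torch.bmm(lstm_output.transpose(1, 2),
                        soft_attn_weights.unsqueeze(2)).squeeze(2)    # [10, 184]
    return context.shape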
class MyModel(nn.Module):
    """
    Contrastive-clustering wrapper around the feature extraction module.
    """
    def __init__(self, binet, feature_dim, class_n):
        super(MyModel, self).__init__()
        self.binet = binet
        self.feature_dim = feature_dim
        self.cluster_num = class_n
        """ Contrastive clustering """
        self.instance_projector = nn.Sequential(  # instance-level head
            nn.Linear(self.binet.rep_dim, self.binet.rep_dim),
            nn.ReLU(),
            nn.Linear(self.binet.rep_dim, self.feature_dim),
        )
        self.cluster_projector = nn.Sequential(  # cluster-level head
            nn.Linear(self.binet.rep_dim, self.binet.rep_dim),
            nn.ReLU(),
            nn.Linear(self.binet.rep_dim, self.cluster_num),
            nn.Softmax(dim=1)
        )

    # Used during training
    def forward(self, x_i, x_j):  # x_i, x_j: two views of the same sample obtained by different pre-processing
        h_i = self.binet(x_i)
        h_j = self.binet(x_j)
        z_i = normalize(self.instance_projector(h_i), dim=1)
        z_j = normalize(self.instance_projector(h_j), dim=1)
        c_i = self.cluster_projector(h_i)
        c_j = self.cluster_projector(h_j)
        return z_i, z_j, c_i, c_j

    # Used during testing
    def forward_cluster(self, x):
        h = self.binet(x)
        c = self.cluster_projector(h)  # cluster assignment distribution
        c = torch.argmax(c, dim=1)  # hard cluster label for each sample
        return c
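# Illustrative only (my addition): the calling convention of the contrastive-clustering
# wrapper. x_view1 and x_view2 are hypothetical names for two augmented views of the
# same batch; nothing in this file constructs such views, and the supervised loop below
# does not use this path.
def _mymodel_usage_sketch(cc_model, x_view1, x_view2):
    z_i, z_j, c_i, c_j = cc_model(x_view1, x_view2)   # instance and cluster projections
    cluster_ids = cc_model.forward_cluster(x_view1)   # hard cluster assignments
    return z_i, z_j, c_i, c_j, cluster_ids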
def save_result(cf_ma, len_test, epochs=8):
    """
    Compute the metrics list and save the test/training results.
    :param cf_ma: confusion matrix
    :param len_test: number of test samples
    :param epochs: number of training epochs
    :return:
    """
    metrics_list = []
    for i in range(5):
        if i == 0:
            metrics_list.append(
                [dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
                 str(cf_ma[i, 4]), '--', '--', '--'])
        else:
            acc = truncate((float(len_test - cf_ma[:, i].sum() - cf_ma[i, :].sum() + cf_ma[i, i] * 2) / len_test) * 100,
                           2)
            tpr = truncate((float(cf_ma[i, i]) / cf_ma[i].sum()) * 100, 2)
            fpr = truncate((float(cf_ma[0, i]) / cf_ma[0].sum()) * 100, 2)
            metrics_list.append(
                [dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
                 str(cf_ma[i, 4]), str(acc), str(tpr), str(fpr)])
    overall_acc = truncate(
        (float(cf_ma[0, 0] + cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / len_test) * 100, 2)
    overall_tpr = truncate((float(cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / cf_ma[1:].sum()) * 100, 2)
    overall_fpr = truncate((float(cf_ma[0, 1:].sum()) / cf_ma[0, :].sum()) * 100, 2)
    # Current date, used to build the output file name
    current_date = time.strftime("%Y%m%d", time.localtime())
    file_name = f"evaluate_{current_date}.txt"
    with open(file_name, 'a') as f:
        f.write("\n")
        t = time.strftime('%Y-%m-%d %X', time.localtime())
        f.write(t + "\n")
        f.write('CLASS_NUM: 5\n')
        f.write('PACKET_LEN: ' + str(PACKET_LEN) + "\n")
        f.write('PACKET_NUM_PER_SESSION: ' + str(PACKET_NUM_PER_SESSION) + "\n")
        f.write('MINI_BATCH: ' + str(MINI_BATCH) + "\n")
        f.write('TRAIN_EPOCHS: ' + str(epochs) + "\n")
        f.write('DATA_DIR: ' + DATA_DIR + "\n")
        f.write("label\tindex\t0\t1\t2\t3\t4\tACC\tTPR\tFPR\n")
        for metrics in metrics_list:
            f.write('\t'.join(metrics) + "\n")
        f.write('Overall accuracy: ' + str(overall_acc) + "\n")
        f.write('Overall TPR: ' + str(overall_tpr) + "\n")
        f.write('Overall FPR: ' + str(overall_fpr) + "\n")
        # f.write('Train time(second): ' + str(int(train_time)) + "\n")
        # f.write('Test time(second): ' + str(int(test_time)) + "\n\n")
        f.write("\n\n")
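# Illustrative only (my addition): the per-class metrics computed in save_result, with
# class 0 (Normal) treated as the negative class:
#   ACC_i = (len_test - column_i_sum - row_i_sum + 2 * cf_ma[i, i]) / len_test
#   TPR_i = cf_ma[i, i] / row_i_sum      (recall of attack class i)
#   FPR_i = cf_ma[0, i] / row_0_sum      (share of Normal traffic mislabelled as class i)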
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    num = 0
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()  # reset gradients
        outputs = model(inputs)
        num += 1
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    return running_loss / len(dataloader.dataset)
def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    cf_ma = np.zeros((5, 5), dtype=int)
    num = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            predicted_labels = torch.argmax(outputs, dim=1).cpu().numpy()
            true_labels = labels.cpu().numpy()
            cf_ma = update_confusion_matrix(cf_ma, true_labels, predicted_labels)
            num += predicted_labels.shape[0]
    print("num len: ", num)
    save_result(cf_ma, num)
    return running_loss / len(dataloader.dataset), cf_ma
def load_data():
    """
    Load the dataset.
    :return:
    """
    t1 = timer()
    sessions = []
    labels = []
    num_pkls = len(glob.glob(DATA_DIR + 'ISCX2012_labels_*.pkl'))  # match the label files under DATA_DIR
    for i in range(num_pkls):
        # if i != 1:
        #     continue
        session_pkl = DATA_DIR + 'ISCX2012_pcaps_' + str(i) + '.pkl'
        session_lists = pickle.load(open(session_pkl, 'rb'))  # deserialize the object
        sessions.extend(session_lists.values.tolist())
        label_pkl = DATA_DIR + 'ISCX2012_labels_' + str(i) + '.pkl'
        label_lists = pickle.load(open(label_pkl, 'rb'))
        labels.extend(label_lists.values.tolist())
        print(i)
    t2 = timer()
    print("load data time: ", t2 - t1)
    labels = np.array(labels)
    normal_indices = np.where(labels == 0)[0]  # row indices of normal traffic (an array)
    # The full normal set is too large to train on; sub-sample 100000 normal flows here.
    # Ideally the normal traffic should already be down-sampled during pre-processing to save memory.
    normal_indices = np.random.choice(normal_indices, 100000, replace=False)
    attack_indices = [np.where(labels == i)[0] for i in range(1, 5)]  # row indices of labels 1-4 (a list of arrays)
    # np.random.choice samples with replacement by default; pass replace=False to avoid duplicates
    test_normal_indices = np.random.choice(normal_indices, int(len(normal_indices) * 0.4), replace=False)
    test_attack_indices = np.concatenate(
        [np.random.choice(attack_indices[i], int(len(attack_indices[i]) * 0.4), replace=False) for i in range(4)])
    test_indices = np.concatenate([test_normal_indices, test_attack_indices]).astype(int)
    # train_indices = np.array(list(set(np.arange(len(labels))) - set(test_indices)))
    attack_indices = np.concatenate(attack_indices).astype(int)
    indices = np.concatenate([normal_indices, attack_indices]).astype(int)
    train_indices = np.array(list(set(indices) - set(test_indices)))
    return sessions, labels, train_indices, test_indices
# Hyper-parameters
NUM_EPOCHS = 8
LEARNING_RATE = 0.001
# Build the model
fe_model = FEMnet(PACKET_NUM_PER_SESSION, PACKET_LEN, LSTM_UNITS)
model = MyModel(fe_model, 128, class_num)
# model = PLA_Attention_Model(100, 100, 50, 100, 14)
# Move the model to the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Data loaders for the training and validation sets
sessions, labels, train_indices, test_indices = load_data()
train_dataset = MyDataset(sessions, labels, train_indices)  # num_workers=2 is not used: Windows does not allow multi-process data loading here
train_dataloader = data.DataLoader(train_dataset, batch_size=MINI_BATCH, shuffle=True, drop_last=True)
val_dataset = MyDataset(sessions, labels, test_indices)  # drop incomplete batches, otherwise an error is raised (FEMnet assumes a fixed batch size)
val_dataloader = data.DataLoader(val_dataset, batch_size=MINI_BATCH, shuffle=False, drop_last=True)
# Start training.
# Note: MyModel.forward expects two augmented views (x_i, x_j), while this supervised loop
# feeds single batches with CrossEntropyLoss, so the FEMnet classifier (model.binet) is the
# module actually trained here.
start_time = timer()
for epoch in range(NUM_EPOCHS):
    # Train for one epoch
    train_loss = train(model.binet, train_dataloader, criterion, optimizer, device)
    # print(f"Train Loss: {truncate(train_loss, 4)}")
    # Evaluate on the validation set
    val_loss, confusion_matrix = evaluate(model.binet, val_dataloader, criterion, device)
    # Report training/validation loss and the confusion matrix
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    print(f"Train Loss: {truncate(train_loss, 4)}")
    print(f"Validation Loss: {truncate(val_loss, 4)}")
    print("Confusion Matrix:")
    for i in range(5):
        for j in range(5):
            print(confusion_matrix[i][j], end="\t")
        print()
    print("---------------------------------")
end_time = timer()
training_time = end_time - start_time
print(f"Training Time: {truncate(training_time, 2)} seconds")