# Graduation_Project/LYZ/cnn_gru_clutering.py
# Main network
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.nn.functional import normalize
import numpy as np
import pickle
import glob
from timeit import default_timer as timer
import time
import warnings
# Try a BiGRU network
from model import *
# Suppress "Warning" messages
# warnings.filterwarnings("ignore")
"""
Notes on past mistakes (they exposed gaps in the fundamentals):
Cross-entropy: https://blog.csdn.net/dss_dssssd/article/details/84036913
Label to one-hot: https://blog.csdn.net/qq_34914551/article/details/88700334
Experiment with an adaptive data class
"""
LSTM_UNITS = 92
MINI_BATCH = 10
TRAIN_STEPS_PER_EPOCH = 12000
VALIDATION_STEPS_PER_EPOCH = 800
DATA_DIR = 'D:\\PyProject\\malware_traffic\\3_Packet\\'
CHECKPOINTS_DIR = './checkpoints/'
dict_5class = {0: 'Normal', 1: 'BFSSH', 2: 'Infilt', 3: 'HttpDoS', 4: 'DDoS'}
class_num = 5
PACKET_NUM_PER_SESSION = 14
PACKET_LEN = 100
class ByteBlock(nn.Module):
    """
    Convolutional neural network block: stacked Conv1d + Tanh (+ MaxPool),
    followed by global max pooling and a fully connected layer.
    """
    def __init__(self, in_channels, nb_filter=(64, 100), filter_length=(3, 3),
                 subsample=(2, 1), pool_length=(2, 2)):
        super(ByteBlock, self).__init__()
        layers = []
        for i in range(len(nb_filter)):
            layers.append(nn.Conv1d(in_channels, nb_filter[i], kernel_size=filter_length[i],
                                    padding=0, stride=subsample[i]))
            layers.append(nn.Tanh())
            if pool_length[i]:
                layers.append(nn.MaxPool1d(pool_length[i]))
            in_channels = nb_filter[i]
        self.block = nn.Sequential(*layers)
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(nb_filter[-1], 128)

    def forward(self, x):
        x = self.block(x)
        x = self.global_pool(x).squeeze(dim=2)
        x = torch.nn.functional.relu(self.fc(x))
        return x
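# Illustrative only (my addition): a minimal shape check for ByteBlock under the
# parameters FEMnet uses below. The helper name is hypothetical and it is never
# called by the training pipeline.
def _byteblock_shape_demo():
    block = ByteBlock(in_channels=PACKET_LEN, nb_filter=(128, 256), filter_length=(5, 5),
                      subsample=(1, 1), pool_length=(2, 2))
    dummy = torch.randn(PACKET_NUM_PER_SESSION, PACKET_LEN, 256)  # [packets, channels, one-hot dim]
    out = block(dummy)  # global max pool + FC -> [PACKET_NUM_PER_SESSION, 128]
    return out.shape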
def update_confusion_matrix(confusion_matrix, actual_lb, predict_lb):
    for idx, value in enumerate(actual_lb):
        p_value = predict_lb[idx]
        confusion_matrix[value, p_value] += 1
    return confusion_matrix


def truncate(f, n):
    # Truncate (not round) f to n decimal places, then format with two decimals,
    # e.g. truncate(3.14159, 2) -> '3.14'
    trunc_f = math.floor(f * 10 ** n) / 10 ** n
    return '{:.2f}'.format(trunc_f)


def binarize(x, sz=256):
    return torch.nn.functional.one_hot(x, sz).float()
class OneHotEncodingLayer(nn.Module):
    def __init__(self, sz=256):
        super(OneHotEncodingLayer, self).__init__()
        self.size = sz

    def forward(self, x):
        return torch.nn.functional.one_hot(x, num_classes=self.size).float()
class MyDataset(data.Dataset):
    """
    Data conversion / encoding (the one-hot encoding itself is applied later in the model).
    """
    def __init__(self, sessions, labels, indices):
        self.sessions = sessions
        self.labels = labels
        self.indices = indices
        self.packet_num_per_session = PACKET_NUM_PER_SESSION
        self.packet_len = PACKET_LEN

    def __getitem__(self, index):
        idx = self.indices[index]
        # Pad with 0: a padding value of -1 would make torch.nn.functional.one_hot fail,
        # since class indices must be non-negative.
        X = torch.zeros((self.packet_num_per_session, self.packet_len), dtype=torch.int64)
        for i, packet in enumerate(self.sessions[idx]):
            if i < self.packet_num_per_session:  # keep only the first packet_num_per_session packets per session
                for j, byte in enumerate(packet[:self.packet_len]):  # keep only the first PACKET_LEN bytes per packet
                    X[i, (self.packet_len - 1 - j)] = byte
        # label = self.labels[idx].astype(np.int64)  # CrossEntropyLoss handles integer class labels directly
        # y = torch.nn.functional.one_hot(torch.from_numpy(label), num_classes=5)[0]
        y = self.labels[idx][0].astype(np.int64)
        return X, y

    def __len__(self):
        return len(self.indices)
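# Illustrative only (my addition): __getitem__ lays a packet's bytes out reversed and
# right-aligned in a PACKET_LEN-wide row, padding the unused prefix with 0. For example,
# a 3-byte packet [0x16, 0x03, 0x01] becomes
#   row = [0, 0, ..., 0, 0x01, 0x03, 0x16]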
class FEMnet(nn.Module):
    """
    Feature extraction module.
    """
    def __init__(self, flow_len, packet_len, gru_units):
        super(FEMnet, self).__init__()
        self.packet_len = packet_len
        self.flow_len = flow_len
        self.batch_size = 10
        self.gru_hidden_size = gru_units
        self.rep_dim = gru_units
        self.embedding = OneHotEncodingLayer(sz=256)  # one-hot encoding
        self.block2 = ByteBlock(self.packet_len, (128, 256), (5, 5), (1, 1), (2, 2))
        self.block3 = ByteBlock(self.packet_len, (192, 320), (7, 5), (1, 1), (2, 2))
        self.lstm_layer = nn.GRU(256, self.gru_hidden_size, dropout=0.1, bidirectional=True)
        self.dense_layer = nn.Linear(self.gru_hidden_size * 2, 5)  # used by forward() below
        # self.output = nn.Softmax(dim=1)  # not needed: CrossEntropyLoss applies log-softmax itself

    def forward(self, x):  # x: [batch_size, flow_len, packet_len] = [10, 14, 100]
        embeddings_list = self.embedding(x)  # [10, 14, 100, 256]
        encoder_list = torch.zeros((self.batch_size, self.flow_len, 256), device=x.device)
        for ix, embeddings in enumerate(embeddings_list):  # [14, 100, 256]
            encoder1 = self.block2(embeddings)  # [14, 128]
            encoder2 = self.block3(embeddings)  # [14, 128]
            encoder = torch.cat([encoder1, encoder2], 1)  # [14, 256]
            encoder_list[ix] = encoder
        # RNN input layout: [seq_len, batch_size, hidden_size]
        encoder_list = encoder_list.permute(1, 0, 2)  # [10, 14, 256] -> [14, 10, 256]
        biLSTM, final_hidden_state = self.lstm_layer(encoder_list)  # [14, 10, 184], [2, 10, 92]
        biLSTM = biLSTM.permute(1, 0, 2)  # [10, 14, 184]
        attn_output, attention = self.attention_net(biLSTM, final_hidden_state)
        dense = self.dense_layer(attn_output)
        return dense

    # lstm_output : [batch_size, n_step, self.gru_hidden_size * num_directions(=2)], the F matrix
    def attention_net(self, lstm_output, final_state):
        # hidden : [batch_size, self.gru_hidden_size * num_directions(=2), 1(=n_layer)]
        hidden = final_state.view(-1, self.gru_hidden_size * 2, 1)  # -1: this dimension is inferred from the others
        # attn_weights : [batch_size, n_step]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # dot-product scores; drop the third dimension
        soft_attn_weights = torch.nn.functional.softmax(attn_weights, 1)
        # [batch_size, self.gru_hidden_size * num_directions(=2), n_step] * [batch_size, n_step, 1]
        # = [batch_size, self.gru_hidden_size * num_directions(=2), 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        # context : [batch_size, self.gru_hidden_size * num_directions(=2)]
        return context, soft_attn_weights
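# Illustrative only (my addition): the attention shapes used above, traced with this
# file's hyper-parameters (batch 10, 14 packets per session, GRU hidden size 92).
# The helper name is hypothetical and is not used by the training pipeline.
def _attention_shape_demo():
    lstm_output = torch.randn(10, 14, 2 * LSTM_UNITS)  # [batch, n_step, 2*hidden]
    final_state = torch.randn(2, 10, LSTM_UNITS)       # [num_directions, batch, hidden]
    hidden = final_state.view(-1, 2 * LSTM_UNITS, 1)   # [10, 184, 1]
    attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)          # [10, 14]
    soft_attn_weights = torch.nn.functional.softmax(attn_weights, 1)  # attention over the 14 packets
    context = torch.bmm(lstm_output.transpose(1, 2),
                        soft_attn_weights.unsqueeze(2)).squeeze(2)    # [10, 184]
    return context.shape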
class MyModel(nn.Module):
    """
    Contrastive-clustering wrapper around the feature extraction module.
    """
    def __init__(self, binet, feature_dim, class_n):
        super(MyModel, self).__init__()
        self.binet = binet
        self.feature_dim = feature_dim
        self.cluster_num = class_n
        """ Contrastive clustering """
        self.instance_projector = nn.Sequential(  # instance-level head
            nn.Linear(self.binet.rep_dim, self.binet.rep_dim),
            nn.ReLU(),
            nn.Linear(self.binet.rep_dim, self.feature_dim),
        )
        self.cluster_projector = nn.Sequential(  # cluster-level head
            nn.Linear(self.binet.rep_dim, self.binet.rep_dim),
            nn.ReLU(),
            nn.Linear(self.binet.rep_dim, self.cluster_num),
            nn.Softmax(dim=1)
        )

    # Used during training
    def forward(self, x_i, x_j):  # x_i, x_j: two views of the same sample obtained by different pre-processing
        h_i = self.binet(x_i)
        h_j = self.binet(x_j)
        z_i = normalize(self.instance_projector(h_i), dim=1)
        z_j = normalize(self.instance_projector(h_j), dim=1)
        c_i = self.cluster_projector(h_i)
        c_j = self.cluster_projector(h_j)
        return z_i, z_j, c_i, c_j

    # Used during testing
    def forward_cluster(self, x):
        h = self.binet(x)
        c = self.cluster_projector(h)  # cluster assignment distribution
        c = torch.argmax(c, dim=1)  # hard cluster label for each sample
        return c
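# Illustrative only (my addition): the calling convention of the contrastive-clustering
# wrapper. x_view1 and x_view2 are hypothetical names for two augmented views of the
# same batch; nothing in this file constructs such views, and the supervised loop below
# does not use this path.
def _mymodel_usage_sketch(cc_model, x_view1, x_view2):
    z_i, z_j, c_i, c_j = cc_model(x_view1, x_view2)   # instance and cluster projections
    cluster_ids = cc_model.forward_cluster(x_view1)   # hard cluster assignments
    return z_i, z_j, c_i, c_j, cluster_ids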
def save_result(cf_ma, len_test, epochs=8):
    """
    Compute the metrics list and save the test/training results.
    :param cf_ma: confusion matrix
    :param len_test: number of test samples
    :param epochs: number of training epochs
    :return:
    """
    metrics_list = []
    for i in range(5):
        if i == 0:
            metrics_list.append(
                [dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
                 str(cf_ma[i, 4]), '--', '--', '--'])
        else:
            acc = truncate((float(len_test - cf_ma[:, i].sum() - cf_ma[i, :].sum() + cf_ma[i, i] * 2) / len_test) * 100,
                           2)
            tpr = truncate((float(cf_ma[i, i]) / cf_ma[i].sum()) * 100, 2)
            fpr = truncate((float(cf_ma[0, i]) / cf_ma[0].sum()) * 100, 2)
            metrics_list.append(
                [dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
                 str(cf_ma[i, 4]), str(acc), str(tpr), str(fpr)])
    overall_acc = truncate(
        (float(cf_ma[0, 0] + cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / len_test) * 100, 2)
    overall_tpr = truncate((float(cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / cf_ma[1:].sum()) * 100, 2)
    overall_fpr = truncate((float(cf_ma[0, 1:].sum()) / cf_ma[0, :].sum()) * 100, 2)
    # Current date, used to build the output file name
    current_date = time.strftime("%Y%m%d", time.localtime())
    file_name = f"evaluate_{current_date}.txt"
    with open(file_name, 'a') as f:
        f.write("\n")
        t = time.strftime('%Y-%m-%d %X', time.localtime())
        f.write(t + "\n")
        f.write('CLASS_NUM: 5\n')
        f.write('PACKET_LEN: ' + str(PACKET_LEN) + "\n")
        f.write('PACKET_NUM_PER_SESSION: ' + str(PACKET_NUM_PER_SESSION) + "\n")
        f.write('MINI_BATCH: ' + str(MINI_BATCH) + "\n")
        f.write('TRAIN_EPOCHS: ' + str(epochs) + "\n")
        f.write('DATA_DIR: ' + DATA_DIR + "\n")
        f.write("label\tindex\t0\t1\t2\t3\t4\tACC\tTPR\tFPR\n")
        for metrics in metrics_list:
            f.write('\t'.join(metrics) + "\n")
        f.write('Overall accuracy: ' + str(overall_acc) + "\n")
        f.write('Overall TPR: ' + str(overall_tpr) + "\n")
        f.write('Overall FPR: ' + str(overall_fpr) + "\n")
        # f.write('Train time(second): ' + str(int(train_time)) + "\n")
        # f.write('Test time(second): ' + str(int(test_time)) + "\n\n")
        f.write("\n\n")
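# Illustrative only (my addition): the per-class metrics computed in save_result, with
# class 0 (Normal) treated as the negative class:
#   ACC_i = (len_test - column_i_sum - row_i_sum + 2 * cf_ma[i, i]) / len_test
#   TPR_i = cf_ma[i, i] / row_i_sum      (recall of attack class i)
#   FPR_i = cf_ma[0, i] / row_0_sum      (share of Normal traffic mislabelled as class i)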
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    num = 0
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()  # reset gradients
        outputs = model(inputs)
        num += 1
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    return running_loss / len(dataloader.dataset)
def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    cf_ma = np.zeros((5, 5), dtype=int)
    num = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            predicted_labels = torch.argmax(outputs, dim=1).cpu().numpy()
            true_labels = labels.cpu().numpy()
            cf_ma = update_confusion_matrix(cf_ma, true_labels, predicted_labels)
            num += predicted_labels.shape[0]
    print("num len: ", num)
    save_result(cf_ma, num)
    return running_loss / len(dataloader.dataset), cf_ma
def load_data():
    """
    Load the dataset.
    :return:
    """
    t1 = timer()
    sessions = []
    labels = []
    num_pkls = len(glob.glob(DATA_DIR + 'ISCX2012_labels_*.pkl'))  # match the label files under DATA_DIR
    for i in range(num_pkls):
        # if i != 1:
        #     continue
        session_pkl = DATA_DIR + 'ISCX2012_pcaps_' + str(i) + '.pkl'
        session_lists = pickle.load(open(session_pkl, 'rb'))  # deserialize the object
        sessions.extend(session_lists.values.tolist())
        label_pkl = DATA_DIR + 'ISCX2012_labels_' + str(i) + '.pkl'
        label_lists = pickle.load(open(label_pkl, 'rb'))
        labels.extend(label_lists.values.tolist())
        print(i)
    t2 = timer()
    print("load data time: ", t2 - t1)
    labels = np.array(labels)
    normal_indices = np.where(labels == 0)[0]  # row indices of normal traffic (an array)
    # The full normal set is too large to train on; sub-sample 100000 normal flows here.
    # Ideally the normal traffic should already be down-sampled during pre-processing to save memory.
    normal_indices = np.random.choice(normal_indices, 100000, replace=False)
    attack_indices = [np.where(labels == i)[0] for i in range(1, 5)]  # row indices of labels 1-4 (a list of arrays)
    # np.random.choice samples with replacement by default; pass replace=False to avoid duplicates
    test_normal_indices = np.random.choice(normal_indices, int(len(normal_indices) * 0.4), replace=False)
    test_attack_indices = np.concatenate(
        [np.random.choice(attack_indices[i], int(len(attack_indices[i]) * 0.4), replace=False) for i in range(4)])
    test_indices = np.concatenate([test_normal_indices, test_attack_indices]).astype(int)
    # train_indices = np.array(list(set(np.arange(len(labels))) - set(test_indices)))
    attack_indices = np.concatenate(attack_indices).astype(int)
    indices = np.concatenate([normal_indices, attack_indices]).astype(int)
    train_indices = np.array(list(set(indices) - set(test_indices)))
    return sessions, labels, train_indices, test_indices
# Hyper-parameters
NUM_EPOCHS = 8
LEARNING_RATE = 0.001
# Build the model
fe_model = FEMnet(PACKET_NUM_PER_SESSION, PACKET_LEN, LSTM_UNITS)
model = MyModel(fe_model, 128, class_num)
# model = PLA_Attention_Model(100, 100, 50, 100, 14)
# Move the model to the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Data loaders for the training and validation sets
sessions, labels, train_indices, test_indices = load_data()
train_dataset = MyDataset(sessions, labels, train_indices)  # num_workers=2 is not used: Windows does not allow multi-process data loading here
train_dataloader = data.DataLoader(train_dataset, batch_size=MINI_BATCH, shuffle=True, drop_last=True)
val_dataset = MyDataset(sessions, labels, test_indices)  # drop incomplete batches, otherwise an error is raised (FEMnet assumes a fixed batch size)
val_dataloader = data.DataLoader(val_dataset, batch_size=MINI_BATCH, shuffle=False, drop_last=True)
# Start training.
# Note: MyModel.forward expects two augmented views (x_i, x_j), while this supervised loop
# feeds single batches with CrossEntropyLoss, so the FEMnet classifier (model.binet) is the
# module actually trained here.
start_time = timer()
for epoch in range(NUM_EPOCHS):
    # Train for one epoch
    train_loss = train(model.binet, train_dataloader, criterion, optimizer, device)
    # print(f"Train Loss: {truncate(train_loss, 4)}")
    # Evaluate on the validation set
    val_loss, confusion_matrix = evaluate(model.binet, val_dataloader, criterion, device)
    # Report training/validation loss and the confusion matrix
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    print(f"Train Loss: {truncate(train_loss, 4)}")
    print(f"Validation Loss: {truncate(val_loss, 4)}")
    print("Confusion Matrix:")
    for i in range(5):
        for j in range(5):
            print(confusion_matrix[i][j], end="\t")
        print()
    print("---------------------------------")
end_time = timer()
training_time = end_time - start_time
print(f"Training Time: {truncate(training_time, 2)} seconds")