# Graduation_Project/LYZ/cnn_gru.py
# Main network
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import numpy as np
import pickle
import glob
from timeit import default_timer as timer
import time
import math
import warnings
# Try a BiGRU network
from model import *
# Suppress "Warning" messages
# warnings.filterwarnings("ignore")
"""
出现过错误的地方,反映了基础不牢
交叉熵https://blog.csdn.net/dss_dssssd/article/details/84036913
label 转 one-hothttps://blog.csdn.net/qq_34914551/article/details/88700334
"""
LSTM_UNITS = 92
MINI_BATCH = 10
TRAIN_STEPS_PER_EPOCH = 12000
VALIDATION_STEPS_PER_EPOCH = 800
DATA_DIR = 'D:\\PyProject\\malware_traffic\\3_Packet\\'
CHECKPOINTS_DIR = './checkpoints/'
dict_5class = {0: 'Normal', 1: 'BFSSH', 2: 'Infilt', 3: 'HttpDoS', 4: 'DDoS'}
PACKET_NUM_PER_SESSION = 14
PACKET_LEN = 100
class ByteBlock(nn.Module):
"""
卷积神经网络
"""
def __init__(self, in_channels, nb_filter=(64, 100), filter_length=(3, 3),
subsample=(2, 1), pool_length=(2, 2)):
super(ByteBlock, self).__init__()
layers = []
for i in range(len(nb_filter)):
layers.append(nn.Conv1d(in_channels, nb_filter[i], kernel_size=filter_length[i],
padding=0, stride=subsample[i]))
layers.append(nn.Tanh())
if pool_length[i]:
layers.append(nn.MaxPool1d(pool_length[i]))
in_channels = nb_filter[i]
self.block = nn.Sequential(*layers)
self.global_pool = nn.AdaptiveMaxPool1d(1)
self.fc = nn.Linear(nb_filter[-1], 128)
def forward(self, x):
x = self.block(x)
x = self.global_pool(x).squeeze(dim=2)
x = torch.nn.functional.relu(self.fc(x))
return x
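# A minimal illustrative sketch (hypothetical helper, defined but never called): ByteBlock
# treats the packet's byte positions as Conv1d channels, so a float input of shape
# [num_packets, in_channels, seq_len] is reduced to a 128-dimensional vector per packet.
def _demo_byteblock_shapes():
    block = ByteBlock(in_channels=100, nb_filter=(128, 256), filter_length=(5, 5),
                      subsample=(1, 1), pool_length=(2, 2))
    packets = torch.randn(14, 100, 256)  # [packets, channels, length]
    features = block(packets)            # [packets, 128]
    assert features.shape == (14, 128)
    return features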
class FlowDataset(data.Dataset):
    """Dataset over pre-selected indices (currently unused; MyDataset below is used instead)."""
    def __init__(self, sessions, labels, indices):
        # keep the full arrays and look items up through self.indices to avoid double indexing
        self.sessions = sessions
        self.labels = labels
        self.indices = indices
    def __len__(self):
        return len(self.indices)
    def __getitem__(self, index):
        idx = self.indices[index]
        session = torch.tensor(self.sessions[idx], dtype=torch.int64)
        label = torch.tensor(self.labels[idx], dtype=torch.int64)
        return session, label
def update_confusion_matrix(confusion_matrix, actual_lb, predict_lb):
for idx, value in enumerate(actual_lb):
p_value = predict_lb[idx]
confusion_matrix[value, p_value] += 1
return confusion_matrix
def truncate(f, n):
    """Truncate a float to n decimal places and return it formatted as a string."""
    trunc_f = math.floor(f * 10 ** n) / 10 ** n
    return '{:.{}f}'.format(trunc_f, n)
def binarize(x, sz=256):
return torch.nn.functional.one_hot(x, sz).float()
class OneHotEncodingLayer(nn.Module):
def __init__(self, sz=256):
super(OneHotEncodingLayer, self).__init__()
self.size = sz
def forward(self, x):
return torch.nn.functional.one_hot(x, num_classes=self.size).float()
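# A minimal illustrative sketch (hypothetical helper, defined but never called): the one-hot
# layer turns an integer byte tensor of shape [..., packet_len] into [..., packet_len, 256],
# assuming every byte value lies in [0, 255].
def _demo_one_hot_shapes():
    layer = OneHotEncodingLayer(sz=256)
    bytes_batch = torch.randint(0, 256, (2, 14, 100))  # [batch, packets, bytes]
    encoded = layer(bytes_batch)                       # [batch, packets, bytes, 256]
    assert encoded.shape == (2, 14, 100, 256)
    return encoded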
"""
# Could be replaced by data served through a FlowDataset / DataLoader instead
def mini_batch_generator(dataset, batch_size):
data_loader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
while True:
for sessions, labels in data_loader:
Xbatch = torch.ones((batch_size, PACKET_NUM_PER_SESSION, PACKET_LEN), dtype=torch.int64) * -1
Ybatch = torch.ones((batch_size, 5), dtype=torch.int64) * -1
for batch_idx, (session, label) in enumerate(zip(sessions, labels)):
for i, packet in enumerate(session):
if i < PACKET_NUM_PER_SESSION:
for j, byte in enumerate(packet[:PACKET_LEN]):
Xbatch[batch_idx, i, (PACKET_LEN - 1 - j)] = byte
Ybatch[batch_idx] = torch.nn.functional.one_hot(label, num_classes=5)[0]
yield Xbatch, Ybatch
"""
class MyDataset(data.Dataset):
"""
数据转换编码(独热编码)
"""
def __init__(self, sessions, labels, indices):
self.sessions = sessions
self.labels = labels
self.indices = indices
self.packet_num_per_session = PACKET_NUM_PER_SESSION
self.packet_len = PACKET_LEN
def __getitem__(self, index):
idx = self.indices[index]
        X = torch.zeros((self.packet_num_per_session, self.packet_len), dtype=torch.int64)  # pad with 0: F.one_hot cannot encode negative values
for i, packet in enumerate(self.sessions[idx]):
if i < self.packet_num_per_session:
for j, byte in enumerate(packet[:self.packet_len]):
X[i, (self.packet_len - 1 - j)] = byte
        # label = self.labels[idx].astype(np.int64)  # CrossEntropyLoss takes integer class indices, so no manual one-hot conversion is needed
        # y = torch.nn.functional.one_hot(torch.from_numpy(label), num_classes=5)[0]
        y = self.labels[idx][0].astype(np.int64)
return X, y
def __len__(self):
return len(self.indices)
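# A minimal illustrative sketch with hypothetical toy data (defined but never called):
# MyDataset writes each packet's bytes into its row in reverse order, right-aligned, and pads
# the remaining positions, so short packets still produce fixed-size [14, 100] tensors.
def _demo_mydataset_layout():
    toy_sessions = [[[1, 2, 3]]]      # one session containing a single 3-byte packet
    toy_labels = np.array([[0]])      # label 0 ('Normal')
    ds = MyDataset(toy_sessions, toy_labels, [0])
    X, y = ds[0]
    # byte j of a packet lands at column PACKET_LEN - 1 - j
    assert X[0, PACKET_LEN - 1].item() == 1 and X[0, PACKET_LEN - 3].item() == 3
    return X, y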
class MyModel(nn.Module):
"""
模型定义
"""
def __init__(self, flow_len, packet_len, gru_units):
super(MyModel, self).__init__()
self.packet_len = packet_len
self.flow_len = flow_len
self.batch_size = 10
self.gru_hidden_size = gru_units
        self.embedding = OneHotEncodingLayer(sz=256)  # one-hot byte encoding
        self.block2 = ByteBlock(self.packet_len, (128, 256), (5, 5), (1, 1), (2, 2))
        self.block3 = ByteBlock(self.packet_len, (192, 320), (7, 5), (1, 1), (2, 2))
        self.lstm_layer = nn.GRU(256, self.gru_hidden_size, dropout=0.1, bidirectional=True)  # note: dropout has no effect with a single GRU layer
        self.dense_layer = nn.Linear(self.gru_hidden_size * 2, 5)  # output size equals the number of classes
# self.output = nn.Softmax(dim=1)
def forward(self, x): # x: [batch_size, flow_len, packet_len] = [10, 14, 100]
embeddings_list = self.embedding(x) # [10, 14, 100, 256]
        encoder_list = torch.zeros((x.size(0), self.flow_len, 256), device=x.device)  # use the actual batch size and keep the buffer on the input's device
for ix, embeddings in enumerate(embeddings_list): # [14, 100, 256]
# print("embeddings 1: ", embeddings.shape)
# embeddings = embeddings.permute(0, 2, 1)
# print("embeddings 2: ", embeddings.shape)
encoder1 = self.block2(embeddings) # [14, 128]
# print("encoder 1: ", encoder1.shape)
encoder2 = self.block3(embeddings) # [14, 128]
# print("encoder 2: ", encoder2.shape)
encoder = torch.cat([encoder1, encoder2], 1) # [14, 256]
# print("encoder : ", encoder.shape)
encoder_list[ix] = encoder
        # GRU input shape: [seq_len, batch_size, input_size]
        encoder_list = encoder_list.permute(1, 0, 2)  # [10, 14, 256] -> [14, 10, 256]
biLSTM, final_hidden_state = self.lstm_layer(encoder_list) # [14, 10, 184], [2, 10, 92]
biLSTM = biLSTM.permute(1, 0, 2) # [10, 14, 184]
# print("biLSTM: ", biLSTM.shape)
attn_output, attention = self.attention_net(biLSTM, final_hidden_state)
dense = self.dense_layer(attn_output)
# out = self.output(dense)
# out = dense[:, -1, :]
return dense
    # lstm_output : [batch_size, n_step, self.gru_hidden_size * num_directions(=2)], F matrix
def attention_net(self, lstm_output, final_state):
# print("lstm_output: ", lstm_output.shape)
# print("final_state: ", final_state.shape)
        # hidden : [batch_size, self.gru_hidden_size * num_directions(=2), 1]
        # final_state is [num_directions(=2), batch_size, hidden]; concatenate both directions per sample
        hidden = torch.cat([final_state[0], final_state[1]], dim=1).unsqueeze(2)
        # attn_weights : [batch_size, n_step]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # batched dot product; drop the last dimension
soft_attn_weights = torch.nn.functional.softmax(attn_weights, 1)
        # [batch_size, self.gru_hidden_size * num_directions(=2), n_step] * [batch_size, n_step, 1]
        # = [batch_size, self.gru_hidden_size * num_directions(=2), 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return context, soft_attn_weights  # context : [batch_size, self.gru_hidden_size * num_directions(=2)]
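# A minimal illustrative sketch (hypothetical helper, defined but never called): a dummy
# forward pass with random bytes standing in for real packets, confirming that the model maps
# [batch, 14, 100] byte indices to [batch, 5] class logits.
def _demo_model_forward():
    demo_model = MyModel(PACKET_NUM_PER_SESSION, PACKET_LEN, LSTM_UNITS)
    dummy = torch.randint(0, 256, (4, PACKET_NUM_PER_SESSION, PACKET_LEN))
    with torch.no_grad():
        logits = demo_model(dummy)
    assert logits.shape == (4, 5)
    return logits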
def save_result(cf_ma, len_test, epochs=8):
"""
计算metrics_list保存测试/训练结果
:param cf_ma:
:param len_test:
:param epochs:
:return:
"""
metrics_list = []
for i in range(5):
if i == 0:
metrics_list.append(
[dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
str(cf_ma[i, 4]), '--', '--', '--'])
else:
acc = truncate((float(len_test - cf_ma[:, i].sum() - cf_ma[i, :].sum() + cf_ma[i, i] * 2) / len_test) * 100,
2)
tpr = truncate((float(cf_ma[i, i]) / cf_ma[i].sum()) * 100, 2)
fpr = truncate((float(cf_ma[0, i]) / cf_ma[0].sum()) * 100, 2)
metrics_list.append(
[dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
str(cf_ma[i, 4]), str(acc), str(tpr), str(fpr)])
overall_acc = truncate(
(float(cf_ma[0, 0] + cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / len_test) * 100, 2)
overall_tpr = truncate((float(cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / cf_ma[1:].sum()) * 100, 2)
overall_fpr = truncate((float(cf_ma[0, 1:].sum()) / cf_ma[0, :].sum()) * 100, 2)
with open('iscx12_cnn_rnn_5class_attn.txt', 'a') as f:
f.write("\n")
t = time.strftime('%Y-%m-%d %X', time.localtime())
f.write(t + "\n")
f.write('CLASS_NUM: 5\n')
f.write('PACKET_LEN: ' + str(PACKET_LEN) + "\n")
f.write('PACKET_NUM_PER_SESSION: ' + str(PACKET_NUM_PER_SESSION) + "\n")
f.write('MINI_BATCH: ' + str(MINI_BATCH) + "\n")
f.write('TRAIN_EPOCHS: ' + str(epochs) + "\n")
f.write('DATA_DIR: ' + DATA_DIR + "\n")
f.write("label\tindex\t0\t1\t2\t3\t4\tACC\tTPR\tFPR\n")
for metrics in metrics_list:
f.write('\t'.join(metrics) + "\n")
f.write('Overall accuracy: ' + str(overall_acc) + "\n")
f.write('Overall TPR: ' + str(overall_tpr) + "\n")
f.write('Overall FPR: ' + str(overall_fpr) + "\n")
# f.write('Train time(second): ' + str(int(train_time)) + "\n")
# f.write('Test time(second): ' + str(int(test_time)) + "\n\n")
f.write("\n\n")
def train(model, dataloader, criterion, optimizer, device):
model.train()
running_loss = 0.0
num = 0
for inputs, labels in dataloader:
inputs = inputs.to(device)
labels = labels.to(device)
        optimizer.zero_grad()  # zero the gradients
        outputs = model(inputs)
        num += 1
        # print("{}/{} outputs & label shape: ".format(num, len(dataloader)), outputs.shape, labels.shape)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item() * inputs.size(0)
return running_loss / len(dataloader.dataset)
def evaluate(model, dataloader, criterion, device):
model.eval()
running_loss = 0.0
cf_ma = np.zeros((5, 5), dtype=int)
num = 0
with torch.no_grad():
for inputs, labels in dataloader:
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(inputs)
loss = criterion(outputs, labels)
running_loss += loss.item() * inputs.size(0)
predicted_labels = torch.argmax(outputs, dim=1).cpu().numpy()
true_labels = labels.cpu().numpy()
cf_ma = update_confusion_matrix(cf_ma, true_labels, predicted_labels)
num += predicted_labels.shape[0]
print("num len: ", num)
save_result(cf_ma, num)
return running_loss / len(dataloader.dataset), cf_ma
def load_data():
"""
加载数据
:return:
"""
t1 = timer()
sessions = []
labels = []
    num_pkls = len(glob.glob(DATA_DIR + 'ISCX2012_labels_*.pkl'))  # number of label pickle files matching the pattern
for i in range(num_pkls):
# if i != 1:
# continue
session_pkl = DATA_DIR + 'ISCX2012_pcaps_' + str(i) + '.pkl'
        session_lists = pickle.load(open(session_pkl, 'rb'))  # deserialize the pickled session lists
        sessions.extend(session_lists.values.tolist())  # append each session's packet byte lists
label_pkl = DATA_DIR + 'ISCX2012_labels_' + str(i) + '.pkl'
label_lists = pickle.load(open(label_pkl, 'rb'))
labels.extend(label_lists.values.tolist())
print(i)
t2 = timer()
print("load data tims: ", t2 - t1)
labels = np.array(labels)
    normal_indices = np.where(labels == 0)[0]  # row indices of normal samples (an array)
    # The full dataset is too large to train on comfortably, so sample 100000 normal flows here.
    # Ideally, subsample the normal traffic during preprocessing instead, to save memory.
    normal_indices = np.random.choice(normal_indices, 100000, replace=False)
    attack_indices = [np.where(labels == i)[0] for i in range(1, 5)]  # row indices for labels 1-4 (a list of arrays)
    # np.random.choice samples with replacement by default; pass replace=False to avoid duplicates
    test_normal_indices = np.random.choice(normal_indices, int(len(normal_indices) * 0.4), replace=False)
    test_attack_indices = np.concatenate(  # combine the per-class attack test indices
        [np.random.choice(attack_indices[i], int(len(attack_indices[i]) * 0.4), replace=False) for i in range(4)])
    test_indices = np.concatenate([test_normal_indices, test_attack_indices]).astype(int)
    # train_indices = np.array(list(set(np.arange(len(labels))) - set(test_indices)))
    attack_indices = np.concatenate(attack_indices).astype(int)
    indices = np.concatenate([normal_indices, attack_indices]).astype(int)
    train_indices = np.array(list(set(indices) - set(test_indices)))
return sessions, labels, train_indices, test_indices
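# A minimal illustrative sketch with synthetic labels (hypothetical helper, defined but never
# called): the same split idea as load_data - sample 40% of each class for testing and keep the
# remaining indices for training, with no overlap between the two sets.
def _demo_split_logic():
    toy_labels = np.random.randint(0, 5, size=1000)
    class_indices = [np.where(toy_labels == c)[0] for c in range(5)]
    test_idx = np.concatenate(
        [np.random.choice(idx, int(len(idx) * 0.4), replace=False) for idx in class_indices])
    all_idx = np.concatenate(class_indices)
    train_idx = np.array(list(set(all_idx) - set(test_idx)))
    assert len(set(train_idx) & set(test_idx)) == 0
    return train_idx, test_idx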
# Hyperparameters
NUM_EPOCHS = 8
LEARNING_RATE = 0.001
# Instantiate the model
model = MyModel(PACKET_NUM_PER_SESSION, PACKET_LEN, LSTM_UNITS)
# model = PLA_Attention_Model(100, 100, 50, 100, 14)
# Move the model to the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Data loaders for the training and validation sets
sessions, labels, train_indices, test_indices = load_data()
train_dataset = MyDataset(sessions, labels, train_indices)  # note: num_workers > 0 can cause multiprocessing issues on Windows
train_dataloader = data.DataLoader(train_dataset, batch_size=MINI_BATCH, shuffle=True, drop_last=True)
val_dataset = MyDataset(sessions, labels, test_indices)  # drop_last discards incomplete batches, which would otherwise raise errors
val_dataloader = data.DataLoader(val_dataset, batch_size=MINI_BATCH, shuffle=False, drop_last=True)
# Start training
start_time = timer()
for epoch in range(NUM_EPOCHS):
    # Train the model
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    # print(f"Train Loss: {truncate(train_loss, 4)}")
    # Evaluate the model on the validation set
    val_loss, confusion_matrix = evaluate(model, val_dataloader, criterion, device)
    # Print the training/validation losses and the confusion matrix
print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
print(f"Train Loss: {truncate(train_loss, 4)}")
print(f"Validation Loss: {truncate(val_loss, 4)}")
print("Confusion Matrix:")
for i in range(5):
for j in range(5):
print(confusion_matrix[i][j], end="\t")
print()
print("---------------------------------")
end_time = timer()
training_time = end_time - start_time
print(f"Training Time: {truncate(training_time, 2)} seconds")
"""
Epoch 7/10
Train Loss: 0.35
Validation Loss: 0.35
Confusion Matrix:
39794 43 45 20 98
472 2272 1 0 0
3832 1 31 3 0
1166 0 0 136 0
56 0 1 0 8219
---------------------------------
Train Loss: 0.35
"""