# Main network
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.nn.functional import normalize
import numpy as np
import pickle
import glob
import math  # used by truncate()
from timeit import default_timer as timer
import time
import warnings

# Try a BiGRU network
from model import *

# Suppress warning output
# warnings.filterwarnings("ignore")
"""
|
||
出现过错误的地方,反映了基础不牢
|
||
交叉熵:https://blog.csdn.net/dss_dssssd/article/details/84036913
|
||
label 转 one-hot:https://blog.csdn.net/qq_34914551/article/details/88700334
|
||
|
||
实验自适应据类
|
||
"""
|
||
|
||
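
# The notes above concern nn.CrossEntropyLoss and one-hot labels. A minimal sketch
# (hypothetical helper, illustration only, never called by this script):
# CrossEntropyLoss takes raw logits and integer class indices directly, so labels
# never need to be one-hot encoded by hand.
def _cross_entropy_label_demo():
    logits = torch.randn(4, 5)                    # 4 samples, 5 classes (raw logits)
    labels = torch.tensor([0, 3, 1, 4])           # integer class indices, not one-hot
    loss = nn.CrossEntropyLoss()(logits, labels)  # softmax + NLL are applied internally
    return loss
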
LSTM_UNITS = 92
MINI_BATCH = 10
TRAIN_STEPS_PER_EPOCH = 12000
VALIDATION_STEPS_PER_EPOCH = 800
DATA_DIR = 'D:\\PyProject\\malware_traffic\\3_Packet\\'
CHECKPOINTS_DIR = './checkpoints/'

dict_5class = {0: 'Normal', 1: 'BFSSH', 2: 'Infilt', 3: 'HttpDoS', 4: 'DDoS'}
class_num = 5

PACKET_NUM_PER_SESSION = 14
PACKET_LEN = 100


class ByteBlock(nn.Module):
    """
    Convolutional block: stacked Conv1d + Tanh (+ optional MaxPool1d) layers,
    followed by global max pooling and a fully connected projection to 128 dims.
    """
    def __init__(self, in_channels, nb_filter=(64, 100), filter_length=(3, 3),
                 subsample=(2, 1), pool_length=(2, 2)):
        super(ByteBlock, self).__init__()

        layers = []
        for i in range(len(nb_filter)):
            layers.append(nn.Conv1d(in_channels, nb_filter[i], kernel_size=filter_length[i],
                                    padding=0, stride=subsample[i]))
            layers.append(nn.Tanh())
            if pool_length[i]:
                layers.append(nn.MaxPool1d(pool_length[i]))
            in_channels = nb_filter[i]

        self.block = nn.Sequential(*layers)
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(nb_filter[-1], 128)

    def forward(self, x):
        x = self.block(x)
        x = self.global_pool(x).squeeze(dim=2)
        x = torch.nn.functional.relu(self.fc(x))
        return x

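
# A minimal shape-check sketch for ByteBlock (hypothetical helper, illustration only,
# never called by this script). It mirrors how FEMnet uses the block: one session of
# 14 packets, each packet a row of 100 byte values that has been one-hot encoded, so
# Conv1d sees the packet position as channels (C=100) and the 256 one-hot dimension
# as the sequence length.
def _byteblock_shape_check():
    block = ByteBlock(in_channels=PACKET_LEN, nb_filter=(128, 256), filter_length=(5, 5),
                      subsample=(1, 1), pool_length=(2, 2))
    bytes_x = torch.randint(0, 256, (PACKET_NUM_PER_SESSION, PACKET_LEN))  # [14, 100]
    one_hot = torch.nn.functional.one_hot(bytes_x, 256).float()            # [14, 100, 256]
    out = block(one_hot)                                                   # [14, 128]
    return out.shape
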

def update_confusion_matrix(confusion_matrix, actual_lb, predict_lb):
    for idx, value in enumerate(actual_lb):
        p_value = predict_lb[idx]
        confusion_matrix[value, p_value] += 1
    return confusion_matrix


def truncate(f, n):
    """Truncate (not round) f to n decimal places and return it as a string."""
    trunc_f = math.floor(f * 10 ** n) / 10 ** n
    return '{:.{}f}'.format(trunc_f, n)


def binarize(x, sz=256):
    return torch.nn.functional.one_hot(x, sz).float()


class OneHotEncodingLayer(nn.Module):
    def __init__(self, sz=256):
        super(OneHotEncodingLayer, self).__init__()
        self.size = sz

    def forward(self, x):
        return torch.nn.functional.one_hot(x, num_classes=self.size).float()


class MyDataset(data.Dataset):
    """
    Wraps the raw sessions/labels and encodes each session as a fixed-size byte matrix.
    """
    def __init__(self, sessions, labels, indices):
        self.sessions = sessions
        self.labels = labels
        self.indices = indices
        self.packet_num_per_session = PACKET_NUM_PER_SESSION
        self.packet_len = PACKET_LEN

    def __getitem__(self, index):
        idx = self.indices[index]
        # Pad with 0 rather than -1: the model one-hot encodes these values, and
        # torch.nn.functional.one_hot requires non-negative class indices.
        X = torch.zeros((self.packet_num_per_session, self.packet_len), dtype=torch.int64)

        for i, packet in enumerate(self.sessions[idx]):
            if i < self.packet_num_per_session:  # keep only the first packet_num_per_session packets of each session
                for j, byte in enumerate(packet[:self.packet_len]):  # keep only the first PACKET_LEN bytes of each packet
                    X[i, (self.packet_len - 1 - j)] = byte

        # label = self.labels[idx].astype(np.int64)  # CrossEntropyLoss takes integer labels; no manual one-hot needed
        # y = torch.nn.functional.one_hot(torch.from_numpy(label), num_classes=5)[0]
        y = self.labels[idx][0].astype(np.int64)
        return X, y

    def __len__(self):
        return len(self.indices)

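
# A minimal sketch of the layout produced by MyDataset.__getitem__ (hypothetical helper,
# illustration only, never called by this script): each packet's bytes are written
# right-to-left into a row of X, so a short packet ends up right-aligned with zero
# padding on the left, and sessions with fewer than PACKET_NUM_PER_SESSION packets keep
# all-zero rows at the bottom.
def _dataset_layout_demo():
    toy_sessions = [[[0x45, 0x00, 0x28], [0x45, 0x10]]]  # 1 session, 2 packets (hypothetical bytes)
    toy_labels = np.array([[0]])                         # label 0 = Normal
    ds = MyDataset(toy_sessions, toy_labels, indices=[0])
    X, y = ds[0]                                         # X: [14, 100], y: 0
    # X[0, 97:100] == tensor([0x28, 0x00, 0x45]); X[1, 98:100] == tensor([0x10, 0x45])
    return X, y
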

class FEMnet(nn.Module):
    """
    Feature extraction module: per-packet CNN encoders followed by a bidirectional GRU
    with attention over the packet sequence.
    """
    def __init__(self, flow_len, packet_len, gru_units):
        super(FEMnet, self).__init__()
        self.packet_len = packet_len
        self.flow_len = flow_len
        self.batch_size = 10
        self.gru_hidden_size = gru_units
        # The attention context concatenates both GRU directions, so the session
        # representation has dimension 2 * gru_units.
        self.rep_dim = gru_units * 2

        self.embedding = OneHotEncodingLayer(sz=256)  # one-hot byte encoding
        self.block2 = ByteBlock(self.packet_len, (128, 256), (5, 5), (1, 1), (2, 2))
        self.block3 = ByteBlock(self.packet_len, (192, 320), (7, 5), (1, 1), (2, 2))

        self.lstm_layer = nn.GRU(256, self.gru_hidden_size, dropout=0.1, bidirectional=True)
        # self.dense_layer = nn.Linear(self.gru_hidden_size * 2, 5)
        # self.output = nn.Softmax(dim=1)

    def forward(self, x):  # x: [batch_size, flow_len, packet_len] = [10, 14, 100]
        embeddings_list = self.embedding(x)  # [10, 14, 100, 256]
        # Per-session CNN encodings, allocated on the same device as the input.
        encoder_list = torch.zeros((x.size(0), self.flow_len, 256), device=x.device)
        for ix, embeddings in enumerate(embeddings_list):  # embeddings: [14, 100, 256]
            encoder1 = self.block2(embeddings)  # [14, 128]
            encoder2 = self.block3(embeddings)  # [14, 128]
            encoder = torch.cat([encoder1, encoder2], 1)  # [14, 256]
            encoder_list[ix] = encoder

        # RNN input layout: [seq_len, batch_size, input_size]
        encoder_list = encoder_list.permute(1, 0, 2)  # [10, 14, 256] -> [14, 10, 256]
        biLSTM, final_hidden_state = self.lstm_layer(encoder_list)  # [14, 10, 184], [2, 10, 92]
        biLSTM = biLSTM.permute(1, 0, 2)  # [10, 14, 184]
        attn_output, attention = self.attention_net(biLSTM, final_hidden_state)
        # Return the attention context as the session representation (rep_dim = 184);
        # the classification/clustering heads live in MyModel.
        return attn_output

    # lstm_output : [batch_size, n_step, gru_hidden_size * num_directions(=2)]
    def attention_net(self, lstm_output, final_state):
        # final_state : [num_directions(=2), batch_size, gru_hidden_size]
        # Concatenate the final hidden states of both directions per sample:
        # hidden : [batch_size, gru_hidden_size * 2, 1]
        hidden = final_state.permute(1, 0, 2).reshape(-1, self.gru_hidden_size * 2, 1)
        # attn_weights : [batch_size, n_step] -- dot product between each timestep and the final state
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)
        soft_attn_weights = torch.nn.functional.softmax(attn_weights, 1)
        # [batch_size, gru_hidden_size * 2, n_step] x [batch_size, n_step, 1]
        # = [batch_size, gru_hidden_size * 2, 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return context, soft_attn_weights  # context : [batch_size, gru_hidden_size * 2]

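
# A minimal standalone sketch of the attention step above (hypothetical helper,
# illustration only, never called by this script): scores are the dot products between
# each timestep's bi-GRU output and the concatenated final hidden states, and the
# context is the softmax-weighted sum over timesteps.
def _attention_shape_demo(batch=10, n_step=14, hidden=92):
    lstm_output = torch.randn(batch, n_step, hidden * 2)         # [10, 14, 184]
    final_state = torch.randn(2, batch, hidden)                  # [2, 10, 92]
    query = final_state.permute(1, 0, 2).reshape(batch, -1, 1)   # [10, 184, 1]
    scores = torch.bmm(lstm_output, query).squeeze(2)            # [10, 14]
    weights = torch.softmax(scores, dim=1)                       # attention weight per timestep
    context = torch.bmm(lstm_output.transpose(1, 2), weights.unsqueeze(2)).squeeze(2)  # [10, 184]
    return context.shape, weights.shape
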

class MyModel(nn.Module):
    """
    Contrastive clustering model: wraps the feature extraction module and adds an
    instance projection head and a cluster projection head.
    """
    def __init__(self, binet, feature_dim, class_n):
        super(MyModel, self).__init__()
        self.binet = binet
        self.feature_dim = feature_dim
        self.cluster_num = class_n
        # Contrastive clustering heads
        self.instance_projector = nn.Sequential(  # instance-level head
            nn.Linear(self.binet.rep_dim, self.binet.rep_dim),
            nn.ReLU(),
            nn.Linear(self.binet.rep_dim, self.feature_dim),
        )
        self.cluster_projector = nn.Sequential(  # cluster-level head
            nn.Linear(self.binet.rep_dim, self.binet.rep_dim),
            nn.ReLU(),
            nn.Linear(self.binet.rep_dim, self.cluster_num),
            nn.Softmax(dim=1)
        )

    # Executed during (contrastive) training: x_i and x_j are two differently
    # pre-processed views of the same data. This script's supervised loop only passes a
    # single batch, so when x_j is omitted the cluster/class probabilities are returned
    # (an assumption about the intended use, since CrossEntropyLoss is applied to them).
    def forward(self, x_i, x_j=None):
        if x_j is None:
            # Note: cluster_projector ends in Softmax while CrossEntropyLoss expects raw
            # logits; kept as-is to preserve the contrastive-clustering head.
            return self.cluster_projector(self.binet(x_i))

        h_i = self.binet(x_i)
        h_j = self.binet(x_j)

        z_i = normalize(self.instance_projector(h_i), dim=1)
        z_j = normalize(self.instance_projector(h_j), dim=1)

        c_i = self.cluster_projector(h_i)
        c_j = self.cluster_projector(h_j)

        return z_i, z_j, c_i, c_j

    # Executed at test time
    def forward_cluster(self, x):
        h = self.binet(x)
        c = self.cluster_projector(h)  # cluster assignment distribution
        c = torch.argmax(c, dim=1)  # cluster label for each sample
        return c

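
# A minimal usage sketch of the contrastive path (hypothetical helper, illustration only,
# never called by this script, which does not construct augmented views): given two views
# of the same batch, forward() returns normalized instance embeddings and per-view cluster
# probabilities, which a contrastive-clustering loss would then compare.
def _contrastive_forward_demo(model, x):
    x_i, x_j = x, x  # stand-in "views"; real augmentations are assumed to be produced elsewhere
    z_i, z_j, c_i, c_j = model(x_i, x_j)
    # z_i, z_j: [batch, feature_dim], unit-normalized; c_i, c_j: [batch, cluster_num] probabilities
    return z_i.shape, c_i.shape
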

def save_result(cf_ma, len_test, epochs=8):
    """
    Compute per-class and overall metrics from the confusion matrix and append the
    test/training results to a dated text file.
    :param cf_ma: 5x5 confusion matrix (rows: actual class, columns: predicted class)
    :param len_test: number of evaluated samples
    :param epochs: number of training epochs (recorded in the report)
    :return:
    """
    metrics_list = []
    for i in range(5):
        if i == 0:
            metrics_list.append(
                [dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
                 str(cf_ma[i, 4]), '--', '--', '--'])
        else:
            acc = truncate((float(len_test - cf_ma[:, i].sum() - cf_ma[i, :].sum() + cf_ma[i, i] * 2) / len_test) * 100,
                           2)
            tpr = truncate((float(cf_ma[i, i]) / cf_ma[i].sum()) * 100, 2)
            fpr = truncate((float(cf_ma[0, i]) / cf_ma[0].sum()) * 100, 2)
            metrics_list.append(
                [dict_5class[i], str(i), str(cf_ma[i, 0]), str(cf_ma[i, 1]), str(cf_ma[i, 2]), str(cf_ma[i, 3]),
                 str(cf_ma[i, 4]), str(acc), str(tpr), str(fpr)])
    overall_acc = truncate(
        (float(cf_ma[0, 0] + cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / len_test) * 100, 2)
    overall_tpr = truncate((float(cf_ma[1, 1] + cf_ma[2, 2] + cf_ma[3, 3] + cf_ma[4, 4]) / cf_ma[1:].sum()) * 100, 2)
    overall_fpr = truncate((float(cf_ma[0, 1:].sum()) / cf_ma[0, :].sum()) * 100, 2)

    # Build the report file name from the current date
    current_date = time.strftime("%Y%m%d", time.localtime())
    file_name = f"evaluate_{current_date}.txt"

    with open(file_name, 'a') as f:
        f.write("\n")
        t = time.strftime('%Y-%m-%d %X', time.localtime())
        f.write(t + "\n")
        f.write('CLASS_NUM: 5\n')
        f.write('PACKET_LEN: ' + str(PACKET_LEN) + "\n")
        f.write('PACKET_NUM_PER_SESSION: ' + str(PACKET_NUM_PER_SESSION) + "\n")
        f.write('MINI_BATCH: ' + str(MINI_BATCH) + "\n")
        f.write('TRAIN_EPOCHS: ' + str(epochs) + "\n")
        f.write('DATA_DIR: ' + DATA_DIR + "\n")
        f.write("label\tindex\t0\t1\t2\t3\t4\tACC\tTPR\tFPR\n")
        for metrics in metrics_list:
            f.write('\t'.join(metrics) + "\n")
        f.write('Overall accuracy: ' + str(overall_acc) + "\n")
        f.write('Overall TPR: ' + str(overall_tpr) + "\n")
        f.write('Overall FPR: ' + str(overall_fpr) + "\n")
        # f.write('Train time(second): ' + str(int(train_time)) + "\n")
        # f.write('Test time(second): ' + str(int(test_time)) + "\n\n")
        f.write("\n\n")

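
# A minimal worked example of the per-class metrics above (hypothetical helper,
# illustration only, never called by this script). For class i: TP = cf[i, i];
# ACC counts everything outside row i and column i (true negatives) plus TP;
# TPR = TP / (row i sum); FPR is measured against the Normal row, i.e. the fraction
# of Normal traffic predicted as class i.
def _metrics_demo():
    cf = np.array([[90, 2, 3, 2, 3],
                   [1, 8, 1, 0, 0],
                   [0, 1, 9, 0, 0],
                   [0, 0, 0, 10, 0],
                   [1, 0, 0, 0, 9]])
    total = cf.sum()                                                # 140 samples
    i = 1                                                           # class 1 = BFSSH
    acc = (total - cf[:, i].sum() - cf[i, :].sum() + 2 * cf[i, i]) / total  # 135 / 140
    tpr = cf[i, i] / cf[i].sum()                                    # 8 / 10
    fpr = cf[0, i] / cf[0].sum()                                    # 2 / 100
    return acc, tpr, fpr
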

def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    num = 0
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()  # clear gradients

        outputs = model(inputs)
        num += 1
        # print("{}/{} outputs & label shape: ".format(num, len(dataloader)),
        #       outputs.shape, labels.shape)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

    return running_loss / len(dataloader.dataset)


def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    cf_ma = np.zeros((5, 5), dtype=int)
    num = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * inputs.size(0)
            predicted_labels = torch.argmax(outputs, dim=1).cpu().numpy()
            true_labels = labels.cpu().numpy()

            cf_ma = update_confusion_matrix(cf_ma, true_labels, predicted_labels)
            num += predicted_labels.shape[0]

    print("number of evaluated samples: ", num)
    save_result(cf_ma, num)
    return running_loss / len(dataloader.dataset), cf_ma


def load_data():
    """
    Load the pickled ISCX2012 sessions and labels and build the train/test index splits.
    :return: sessions, labels, train_indices, test_indices
    """
    t1 = timer()
    sessions = []
    labels = []
    num_pkls = len(glob.glob(DATA_DIR + 'ISCX2012_labels_*.pkl'))  # count the label pickle files
    for i in range(num_pkls):
        # if i != 1:
        #     continue
        session_pkl = DATA_DIR + 'ISCX2012_pcaps_' + str(i) + '.pkl'
        session_lists = pickle.load(open(session_pkl, 'rb'))  # deserialize
        sessions.extend(session_lists.values.tolist())

        label_pkl = DATA_DIR + 'ISCX2012_labels_' + str(i) + '.pkl'
        label_lists = pickle.load(open(label_pkl, 'rb'))
        labels.extend(label_lists.values.tolist())
        print(i)
    t2 = timer()
    print("load data time: ", t2 - t1)

    labels = np.array(labels)
    normal_indices = np.where(labels == 0)[0]  # row indices of normal samples (an array)
    # The full normal set is too large to train on comfortably; sample 100000 normal flows here
    # (better done during preprocessing to save memory).
    normal_indices = np.random.choice(normal_indices, 100000, replace=False)
    attack_indices = [np.where(labels == i)[0] for i in range(1, 5)]  # rows of labels 1~4 (a list of arrays)
    # np.random.choice samples with replacement by default; pass replace=False to avoid duplicates.
    test_normal_indices = np.random.choice(normal_indices, int(len(normal_indices) * 0.4), replace=False)
    test_attack_indices = np.concatenate(
        [np.random.choice(attack_indices[i], int(len(attack_indices[i]) * 0.4), replace=False) for i in range(4)])
    test_indices = np.concatenate([test_normal_indices, test_attack_indices]).astype(int)
    # train_indices = np.array(list(set(np.arange(len(labels))) - set(test_indices)))
    attack_indices = np.concatenate(attack_indices).astype(int)
    indices = np.concatenate([normal_indices, attack_indices]).astype(int)
    train_indices = np.array(list(set(indices) - set(test_indices)))

    return sessions, labels, train_indices, test_indices


# Hyperparameters
NUM_EPOCHS = 8
LEARNING_RATE = 0.001

# Create the model
fe_model = FEMnet(PACKET_NUM_PER_SESSION, PACKET_LEN, LSTM_UNITS)
model = MyModel(fe_model, 128, class_num)
# model = PLA_Attention_Model(100, 100, 50, 100, 14)

# Move the model to the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Build the training and validation data loaders
sessions, labels, train_indices, test_indices = load_data()

train_dataset = MyDataset(sessions, labels, train_indices)  # num_workers=2 not used: Windows does not allow multi-process loading here
train_dataloader = data.DataLoader(train_dataset, batch_size=MINI_BATCH, shuffle=True, drop_last=True)

val_dataset = MyDataset(sessions, labels, test_indices)  # drop incomplete batches, otherwise the fixed batch size breaks
val_dataloader = data.DataLoader(val_dataset, batch_size=MINI_BATCH, shuffle=False, drop_last=True)

# Start training
start_time = timer()

for epoch in range(NUM_EPOCHS):
    # Train the model
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    # print(f"Train Loss: {truncate(train_loss, 4)}")
    # Evaluate on the validation set
    val_loss, confusion_matrix = evaluate(model, val_dataloader, criterion, device)

    # Report training/validation loss and the confusion matrix
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    print(f"Train Loss: {truncate(train_loss, 4)}")
    print(f"Validation Loss: {truncate(val_loss, 4)}")
    print("Confusion Matrix:")
    for i in range(5):
        for j in range(5):
            print(confusion_matrix[i][j], end="\t")
        print()
    print("---------------------------------")

end_time = timer()
training_time = end_time - start_time

print(f"Training Time: {truncate(training_time, 2)} seconds")