# -*- coding: utf-8 -*-

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os, glob, time
import pickle
from timeit import default_timer as timer

# silence noisy third-party library warnings
import warnings
warnings.filterwarnings('ignore')


# classes index
traffic_dict = {
    0: 'Normal',
    1: 'BFSSH',
    2: 'Infilt',
    3: 'HttpDoS',
    4: 'DDoS'
}

# network traffic attributes
traffic_attributes = {
    '01': 'normal',   # normal traffic
    '02': 'anomaly'   # abnormal traffic
}

"""## Load Data
"""

# path to data for glob
DATA_DIR = 'D:\\PyProject\\malware_traffic\\3_Packet\\'


def load_data():
    """
    Load the pickled ISCX2012 sessions and labels, then build the train/test index splits.
    :return: sessions, labels, train_indices, test_indices
    """
    t1 = timer()
    sessions = []
    labels = []
    num_pkls = len(glob.glob(DATA_DIR + 'ISCX2012_labels_*.pkl'))  # number of label pickles matching the pattern
    for i in range(num_pkls):
        # if i != 1:
        #     continue
        session_pkl = DATA_DIR + 'ISCX2012_pcaps_' + str(i) + '.pkl'
        session_lists = pickle.load(open(session_pkl, 'rb'))  # deserialize the pickled sessions
        sessions.extend(session_lists.values.tolist())  # append the session rows

        label_pkl = DATA_DIR + 'ISCX2012_labels_' + str(i) + '.pkl'
        label_lists = pickle.load(open(label_pkl, 'rb'))
        labels.extend(label_lists.values.tolist())
        print(i)
    t2 = timer()
    print("load data time: ", t2 - t1)

    labels = np.array(labels)
    normal_indices = np.where(labels == 0)[0]  # row indices of normal traffic (an array)
    # The full dataset is too large to train on comfortably: keep only 100000 normal flows here
    # (better still, sample a fixed amount of normal data during preprocessing to save memory).
    normal_indices = np.random.choice(normal_indices, 100000, replace=False)
    attack_indices = [np.where(labels == i)[0] for i in range(1, 5)]  # row indices for labels 1~4 (a list of arrays)
    # np.random.choice samples with replacement by default; pass replace=False to sample without replacement
    test_normal_indices = np.random.choice(normal_indices, int(len(normal_indices) * 0.4), replace=False)
    test_attack_indices = np.concatenate(  # merge the per-class attack test indices
        [np.random.choice(attack_indices[i], int(len(attack_indices[i]) * 0.4), replace=False) for i in range(4)])
    test_indices = np.concatenate([test_normal_indices, test_attack_indices]).astype(int)
    # train_indices = np.array(list(set(np.arange(len(labels))) - set(test_indices)))
    attack_indices = np.concatenate(attack_indices).astype(int)
    indices = np.concatenate([normal_indices, attack_indices]).astype(int)
    train_indices = np.array(list(set(indices) - set(test_indices)))

    return sessions, labels, train_indices, test_indices
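

# A minimal sketch (not part of the original pipeline) of how the arrays returned by load_data()
# could be wrapped into a DataLoader, assuming each session is a fixed-size (flow_len, packet_len)
# byte matrix like the smoke test at the bottom of this file uses. The helper name and batch size
# are illustrative assumptions.
from torch.utils.data import DataLoader, TensorDataset

def make_loader(sessions, labels, indices, batch_size=128, shuffle=True):
    """Select the given rows of sessions/labels and wrap them into a DataLoader."""
    x = torch.tensor(np.asarray(sessions)[indices], dtype=torch.float32)  # (N, flow_len, packet_len)
    y = torch.tensor(np.asarray(labels)[indices], dtype=torch.long)
    return DataLoader(TensorDataset(x, y), batch_size=batch_size, shuffle=shuffle)

# usage sketch:
# sessions, labels, train_indices, test_indices = load_data()
# train_loader = make_loader(sessions, labels, train_indices)
# test_loader = make_loader(sessions, labels, test_indices, shuffle=False)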


"""# Architecture Overview

# CNN Motivation
** Two parallel convolutional neural networks (CNNs) build a spatial feature representation of the traffic data.

# Transformer-Encoder Motivation
** A Transformer-Encoder block is applied on top of the byte sequences.
** The input is max-pooled before the Transformer to greatly reduce the number of parameters the network has to learn.
"""


class ByteBlock(nn.Module):
    """
    1D FCN: one-dimensional fully convolutional network.

    in_channels: number of input channels. For 1D convolution over an embedded sequence there is no
        separate channel axis, so in_channels equals the embedding dimension; if each token is
        represented by a 128-dim vector, then in_channels = 128.

    out_channels: number of output channels, i.e. the embedding dimension of each token after the
        convolution. With out_channels = 64, every token is embedded into 64 dimensions.

    kernel_size: size of the convolution kernel, usually an int. kernel_size=3 means each convolution
        step covers 3 tokens, so the kernel dimensions are (in_channels, kernel_size).
        - In PyTorch, each token of a sequence is represented as a column vector.

    stride: step size along the convolution direction. With stride=2, if the current window covers
        tokens 1-2-3, the next window covers tokens 3-4-5.

    padding: how to handle the tail of the feature matrix when it is too short for a full window.
        - str --> padding="valid": no padding, the remainder is dropped; padding="same": pad on the
          right so that the output length equals the input length.
        - int --> padding=k: pad k columns on the right.
    """
    def __init__(self, in_channels, nb_filter=(64, 100), filter_length=(3, 3),
                 subsample=(2, 1), pool_length=(2, 2)):
        super(ByteBlock, self).__init__()

        layers = []
        for i in range(len(nb_filter)):
            layers.append(nn.Conv1d(in_channels, nb_filter[i], kernel_size=filter_length[i],
                                    padding=0, stride=subsample[i]))
            layers.append(nn.Tanh())
            if pool_length[i]:
                layers.append(nn.MaxPool1d(pool_length[i]))
            in_channels = nb_filter[i]

        self.block = nn.Sequential(*layers)
        self.global_pool = nn.AdaptiveMaxPool1d(1)

    def forward(self, x):
        x = self.block(x)
        x = self.global_pool(x).squeeze(dim=2)
        x = torch.nn.functional.leaky_relu(x)
        return x
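

# A small shape check (not in the original script) illustrating the Conv1d parameters documented
# above: a batch of 4 flows, 128 input channels (flow_len), 100 positions (packet_len), run through
# the same configuration used by conv1Dblock1 below.
_demo_block = ByteBlock(128, (128, 256), (5, 5), (1, 1), (2, 2))
_demo_out = _demo_block(torch.randn(4, 128, 100))
print("ByteBlock demo output shape:", _demo_out.shape)  # (4, 256): AdaptiveMaxPool1d(1) collapses the length axis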


class FCN_Transformer(nn.Module):
    # Define all layers present in the network
    def __init__(self, num_emotions):
        super().__init__()

        ################ TRANSFORMER BLOCK #############################
        # maxpool the input feature map/tensor fed to the transformer, halving its length
        self.transformer_maxpool = nn.MaxPool1d(2)

        # define a single transformer encoder layer:
        # self-attention + feedforward network from the "Attention Is All You Need" paper
        # expected input: sequence length * batch size * feature size, with feature size = 128
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=128,          # input embedding dimension fed to the encoder
            nhead=4,              # number of heads in each multi-head self-attention layer
            dim_feedforward=512,  # hidden dim of each encoder block's 2-layer feedforward network: 128 --> 512 --> 128
            dropout=0.4,
            activation='relu'     # ReLU: avoids saturation, tames gradients, cheap to compute
        )

        # Use 4 instead of the 6 identical stacked encoder layers of the original paper.
        # The complete transformer block contains 4 encoder layers (each with multi-head self-attention + feedforward).
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)

        ############### 1ST PARALLEL 1D CONVOLUTION BLOCK ############
        # 2 sequential Conv1d layers: (batch, 128, packet_len) --> (batch, 256)
        self.conv1Dblock1 = ByteBlock(128, (128, 256), (5, 5), (1, 1), (2, 2))
        ############### 2ND PARALLEL 1D CONVOLUTION BLOCK ############
        # 2 sequential Conv1d layers: (batch, 128, packet_len) --> (batch, 320)
        self.conv1Dblock2 = ByteBlock(128, (192, 320), (7, 5), (1, 1), (2, 2))

        ################# FINAL LINEAR BLOCK ####################
        # Linear layer taking the concatenated embedding from the parallel 1D convolutional
        # and transformer blocks and producing the class logits.
        # Concatenated size: 256 (conv block 1) + 320 (conv block 2) + 128 (transformer d_model) = 704.
        self.fc1_linear = nn.Linear(256 + 320 + 128, num_emotions)

        ### softmax layer over the class logits from the final FC linear layer
        self.softmax_out = nn.Softmax(dim=1)  # dim 1 is the class dimension

    # one complete parallel forward pass of the input tensor through the 2 conv blocks and the transformer block
    def forward(self, x):

        ############ 1st parallel Conv1D block: 2 convolutional layers ############################
        # create the first feature embedding by passing the input through the first 1D convolutional block
        print("x: ", x.type())  # debug
        conv1d_embedding1 = self.conv1Dblock1(x)  # x == batch * flow_len (channels) * packet_len

        # keep the batch dimension and flatten the rest; ByteBlock already returns a (batch, features) tensor
        conv1d_embedding1 = torch.flatten(conv1d_embedding1, start_dim=1)

        ############ 2nd parallel Conv1D block: 2 convolutional layers #############################
        # create the second feature embedding from the second convolutional block
        conv1d_embedding2 = self.conv1Dblock2(x)

        conv1d_embedding2 = torch.flatten(conv1d_embedding2, start_dim=1)

        ########## 4-encoder-layer Transformer block w/ 128-->512-->128 feedforward network ##############
        # maxpool the input feature map: (batch, 128, packet_len) --> (batch, 128, packet_len/2)
        x_maxpool = self.transformer_maxpool(x)

        # drop a singleton channel dim if one is present; for (batch, flow_len, packet_len) input this is a no-op
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)

        # convert the maxpooled feature map from batch * feature * time to time * batch * feature,
        # because the transformer encoder layer expects tensors shaped (time, batch, embedding)
        x = x_maxpool_reduced.permute(2, 0, 1)
        print("x_maxpool_reduced: ", x_maxpool_reduced.shape)  # debug
        # finally, pass the reduced input feature map into the transformer encoder layers
        transformer_output = self.transformer_encoder(x)

        # create the final transformer embedding by taking the mean over the time dimension (now dim 0):
        # the encoder outputs (time, batch, 128), so the time average gives a (batch, 128) embedding
        transformer_embedding = torch.mean(transformer_output, dim=0)

        ############# concatenate the embeddings from the convolutional and transformer blocks ######
        complete_embedding = torch.cat([conv1d_embedding1, conv1d_embedding2, transformer_embedding], dim=1)

        ######### final FC linear layer, need logits for the loss #########################
        output_logits = self.fc1_linear(complete_embedding)

        ######### final softmax layer: turn the logits from the FC linear layer into class probabilities ######
        output_softmax = self.softmax_out(output_logits)

        # logits are needed for the cross-entropy loss, softmax probabilities for class prediction
        return output_logits, output_softmax
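

# A minimal, hedged training-step sketch (not part of the original script): it uses CrossEntropyLoss
# on the raw logits, as the comment in forward() suggests; the helper name, optimizer, and learning
# rate in the usage lines are illustrative assumptions.
def train_step(model, batch_x, batch_y, optimizer):
    """Run one optimization step on a batch of flows and return the loss value."""
    criterion = nn.CrossEntropyLoss()
    model.train()
    optimizer.zero_grad()
    logits, _ = model(batch_x)        # use the logits, not the softmax output, for the loss
    loss = criterion(logits, batch_y)
    loss.backward()
    optimizer.step()
    return loss.item()

# usage sketch:
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# loss_value = train_step(model, batch_x.float().to(device), batch_y.to(device), optimizer)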


"""# Inspect the model structure
"""

from torchsummary import summary

# need device to instantiate model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# instantiate the model for the 5 traffic classes and move it to the selected device
model = FCN_Transformer(len(traffic_dict)).to(device)

# print("\nmodel: \n", model,"\n")
data = torch.randint(255, size=(128, 128, 100)).float().to(device)  # batch_size, flow_len, packet_len; Conv1d expects float input
print("data: ", data.type())
model(data)
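
# A hedged usage sketch (not in the original script): map the softmax output of the smoke-test call
# above back to class names via traffic_dict.
model.eval()
with torch.no_grad():
    _, probs = model(data)                 # (batch, 5) class probabilities
    pred_idx = torch.argmax(probs, dim=1)  # predicted class index per flow
    print("first 5 predictions:", [traffic_dict[int(i)] for i in pred_idx[:5]])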

# include input feature map dims in call to summary()
# summary(model, input_size=(128,282))