parent 88306d79fa
commit 4a9a683850
@ -0,0 +1,185 @@
"""
Task: Text sentiment classification based on Bi-LSTM and an attention mechanism
Author: ChengJunkai @github.com/Cheng0829
Email: chengjunkai829@gmail.com
Date: 2022/09/14
Reference: Tae Hwan Jung(Jeff Jung) @graykode
"""
import numpy as np
import torch, time, os, sys
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt

'''1. Data preprocessing'''
def pre_process(sentences):
    word_sequence = " ".join(sentences).split()
    word_list = []
    '''
    Deduplicating with list(set(word_sequence)) would give a randomly ordered list
    (set is unordered), so the word dictionary would differ between runs, and a model
    saved by a previous run would very likely be unusable in the next one
    (e.g. the old model saw i:0, love:1 and output you:2, but this time "you" sits at
    position 3 of the dictionary, so the correct result can no longer be produced).
    '''
    for word in word_sequence:
        if word not in word_list:
            word_list.append(word)
    word_dict = {w: i for i, w in enumerate(word_list)}
    word_dict["''"] = len(word_dict)
    word_list.append("''")  # append() mutates in place and returns None, so don't reassign it
    vocab_size = len(word_dict)  # vocabulary size: 16
    max_size = 0
    for sen in sentences:
        if len(sen.split()) > max_size:
            max_size = len(sen.split())  # maximum sentence length: 3
    for i in range(len(sentences)):
        if len(sentences[i].split()) < max_size:
            sentences[i] = sentences[i] + " ''" * (max_size - len(sentences[i].split()))

    return sentences, word_list, word_dict, vocab_size, max_size
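
# A deterministic alternative to the dedup loop above (a minimal sketch): dict keys
# preserve insertion order in Python 3.7+, so dict.fromkeys gives a stable vocabulary
# where list(set(...)) would not.
def build_vocab(sentences):
    words = " ".join(sentences).split()
    word_list = list(dict.fromkeys(words))  # first-seen order, unlike set()
    return {w: i for i, w in enumerate(word_list)}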
def make_batch(sentences):
    # For each sentence, return the list of indices of the words it contains
    inputs = [np.array([word_dict[n] for n in sen.split()]) for sen in sentences]  # [6,3]
    targets = [out for out in labels]
    inputs = torch.LongTensor(np.array(inputs)).to(device)
    targets = torch.LongTensor(np.array(targets)).to(device)
    '''Sentiment classification builds an embedding matrix; there is no eye() (one-hot) here'''
    return inputs, targets
class BiLSTM_Attention(nn.Module):
    def __init__(self):
        super(BiLSTM_Attention, self).__init__()
        '''Sentiment classification builds an embedding matrix; there is no eye() (one-hot) here'''
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)
        self.out = nn.Linear(2 * n_hidden, num_classes)

    def forward(self, X):  # X: [6, 3]
        # input : [batch_size, n_step, embedding_dim] [6,3,3]
        input = self.embedding(X)
        # input : [n_step, batch_size, embedding_dim] [3,6,3]
        # input : [sequence length (time steps), batch size, embedding dimension]
        input = input.permute(1, 0, 2)
        # hidden_state : [num_layers(=1)*num_directions(=2), batch_size, n_hidden]
        # hidden_state : [layers*directions, batch size, hidden size (number of hidden units)]
        hidden_state = torch.zeros(1 * 2, len(X), n_hidden).to(device)
        # cell_state : [num_layers*num_directions, batch_size, hidden_size]
        cell_state = torch.zeros(1 * 2, len(X), n_hidden).to(device)
        # final_hidden_state, final_cell_state : [num_layers(=1)*num_directions(=2), batch_size, n_hidden]
        lstm_output, (final_hidden_state, final_cell_state) = self.lstm(input, (hidden_state, cell_state))
        # lstm_output : [batch_size, n_step, n_hidden*num_directions(=2)]
        lstm_output = lstm_output.permute(1, 0, 2)
        attn_output, attention = self.attention_net(lstm_output, final_hidden_state)
        # model : [batch_size, num_classes], attention : [batch_size, n_step]
        return self.out(attn_output), attention
    '''Two bmm weighted sums, equivalent to two nested for-loops'''
    # lstm_output : [batch_size, n_step, n_hidden*num_directions(=2)] [6,3,16]
    # final_hidden_state : [num_layers(=1)*num_directions(=2), batch_size, n_hidden] [2,6,8]
    def attention_net(self, lstm_output, final_hidden_state):
        # final_hidden_state : [batch_size, n_hidden*num_directions(=2), 1(=n_layer)] [6,16,1]
        final_hidden_state = final_hidden_state.view(-1, 2 * n_hidden, 1)

        '''First bmm weighted sum: lstm_output and final_hidden_state give the attention weights attn_weights'''
        # [6,3,16]*[6,16,1] -> [6,3,1] -> attn_weights : [batch_size, n_step] [6,3]
        attn_weights = torch.bmm(lstm_output, final_hidden_state).squeeze(2)  # squeeze out the third dimension
        softmax_attn_weights = F.softmax(attn_weights, 1)  # softmax over the step dimension [6,3]

        '''Second bmm weighted sum: lstm_output and the attention weights give the context vector, i.e. the attention-fused model output'''
        # [batch_size, n_hidden*num_directions, n_step] * [batch_size, n_step, 1] \
        # = [batch_size, n_hidden*num_directions, 1] : [6,16,3] * [6,3,1] -> [6,16,1] -> [6,16]
        context = torch.bmm(lstm_output.transpose(1, 2), softmax_attn_weights.unsqueeze(2)).squeeze(2)
        softmax_attn_weights = softmax_attn_weights.to('cpu')  # numpy arrays must live on the CPU

        '''What follows the context differs by task; unlike Seq2Seq, this context need not be concatenated with a decoder output'''
        return context, softmax_attn_weights.data.numpy()
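
# For intuition: the two bmm calls in attention_net are equivalent to two explicit
# nested loops (a minimal standalone sketch with toy shapes; not part of the original model).
def _attention_with_loops(lstm_output, final_hidden_state):
    """lstm_output: [batch, steps, dim]; final_hidden_state already reshaped to [batch, dim, 1]."""
    batch, steps, dim = lstm_output.shape
    # Loop 1: dot product of each step's output with the final hidden state -> raw weights
    weights = torch.empty(batch, steps)
    for b in range(batch):
        for t in range(steps):
            weights[b, t] = lstm_output[b, t] @ final_hidden_state[b, :, 0]
    weights = F.softmax(weights, dim=1)
    # Loop 2: weighted sum of the step outputs -> context vector
    context = torch.zeros(batch, dim)
    for b in range(batch):
        for t in range(steps):
            context[b] += weights[b, t] * lstm_output[b, t]
    return context, weights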
if __name__ == '__main__':
    chars = 30 * '*'
    embedding_dim = 3  # embedding size
    n_hidden = 8  # number of hidden units in one cell
    num_classes = 2  # 0 or 1
    '''Why the GPU can be slower than the CPU here:
    data transfer carries a large overhead, and the GPU handles it more slowly than the CPU,
    while the GPU's advantage in matrix computation cannot show on such a small network.
    '''
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    # 3-word sentences (sequence_length is 3)
    sentences = ["i love you", "he loves me", "don't leave",
                 "i hate you", "sorry for that", "this is awful"]
    labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.

    '''1. Data preprocessing'''
    sentences, word_list, word_dict, vocab_size, max_size = pre_process(sentences)
    inputs, targets = make_batch(sentences)
    '''2. Build the model'''
    model = BiLSTM_Attention()
    print(model)
    model.to(device)
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if os.path.exists('model_param.pt'):
        # Load the saved parameters into the model
        model.load_state_dict(torch.load('model_param.pt', map_location=device))

    '''3. Training'''
    print('{}\nTrain\n{}'.format('*' * 30, '*' * 30))
    loss_record = []
    for epoch in range(10000):
        optimizer.zero_grad()
        output, attention = model(inputs)
        output = output.to(device)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        if loss >= 0.001:  # early stop once loss stays below 0.001 for 30 consecutive epochs
            loss_record = []
        else:
            loss_record.append(loss.item())
            if len(loss_record) == 30:
                torch.save(model.state_dict(), 'model_param.pt')
                break

        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'Loss = {:.6f}'.format(loss))
            torch.save(model.state_dict(), 'model_param.pt')
    '''4. Testing'''
    print('{}\nTest\n{}'.format('*' * 30, '*' * 30))
    test_text = 'sorry i hate you'
    # Build a matrix of word indices (the extra pair of brackets adds the batch dimension)
    tests = [np.array([word_dict[n] for n in test_text.split()])]
    test_batch = torch.LongTensor(np.array(tests)).to(device)
    predict, attn_test = model(test_batch)
    predict = predict.data.max(1, keepdim=True)[1]
    print('The emotion of "%s" is ' % test_text, end='')
    if predict[0][0] == 0:
        print('bad!')
    else:
        print('good!')
    '''5. Visualize the attention weight matrix'''
    fig = plt.figure(figsize=(0.5 * len(sentences), 0.5 * max_size))  # [batch_size, n_step]
    ax = fig.add_subplot(1, 1, 1)
    # attention : (6, 3)
    ax.matshow(attention, cmap='viridis')
    word_show = ['word'] * max_size
    word_show = [word_show[i] + str(i + 1) for i in range(max_size)]  # ['word1', 'word2', 'word3']
    ax.set_xticklabels([''] + word_show, fontdict={'fontsize': 14})
    sentence_show = ['sentence'] * len(sentences)
    sentence_show = [sentence_show[i] + str(i + 1) for i in range(
        len(sentence_show))]  # ['sentence1', 'sentence2', ..., 'sentence6']
    ax.set_yticklabels([''] + sentence_show, fontdict={'fontsize': 14})
    plt.show()
@ -0,0 +1,530 @@
# -*- coding: utf-8 -*-
"""
A Transformer encoder network combined with two parallel convolutional neural networks
(CNNs) to classify traffic data: the CNNs provide the spatial feature representation,
the Transformer the temporal feature representation.

Because of the sequential nature of the data, the Transformer is also used to model,
as accurately as possible, the temporal relations between tonal transitions in emotion.

The stacked CNN networks are combined with the multi-head self-attention layers of the
Transformer encoder.

** Exploit the CNN's strengths in spatial feature representation and the Transformer's
strengths in sequence encoding.

Possible improvements: parallelize the CNNs further, or borrow BERT's bidirectional structure.

#### Setup
"""
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os, glob, time
import pickle
from timeit import default_timer as timer

# matplotlib complains about librosa
import warnings
warnings.filterwarnings('ignore')
# class index
traffic_dict = {
    0: 'Normal',
    1: 'BFSSH',
    2: 'Infilt',
    3: 'HttpDoS',
    4: 'DDoS'
}

# network traffic attributes
traffic_attributes = {
    '01': 'normal',   # normal traffic
    '02': 'anomaly'   # abnormal traffic
}

"""## Load Data
"""
# path to data for glob
DATA_DIR = 'D:\\PyProject\\malware_traffic\\3_Packet\\'

def load_data():
    """
    Load the pickled sessions and labels.
    :return: sessions, labels, train_indices, test_indices
    """
    t1 = timer()
    sessions = []
    labels = []
    num_pkls = len(glob.glob(DATA_DIR + 'ISCX2012_labels_*.pkl'))  # count the matching pickle files
    for i in range(num_pkls):
        session_pkl = DATA_DIR + 'ISCX2012_pcaps_' + str(i) + '.pkl'
        session_lists = pickle.load(open(session_pkl, 'rb'))  # deserialize
        sessions.extend(session_lists.values.tolist())

        label_pkl = DATA_DIR + 'ISCX2012_labels_' + str(i) + '.pkl'
        label_lists = pickle.load(open(label_pkl, 'rb'))
        labels.extend(label_lists.values.tolist())
        print(i)
    t2 = timer()
    print("load data time: ", t2 - t1)

    labels = np.array(labels)
    normal_indices = np.where(labels == 0)[0]  # rows where the label is 0 (an array)
    # The full dataset is too large to train on comfortably; sample 100000 normal flows here
    # (better: select a fixed amount of normal data during preprocessing to save memory)
    normal_indices = np.random.choice(normal_indices, 100000, replace=False)
    attack_indices = [np.where(labels == i)[0] for i in range(1, 5)]  # rows for labels 1-4 (a list of arrays)
    # np.random.choice samples with replacement by default; pass replace=False to avoid duplicates
    test_normal_indices = np.random.choice(normal_indices, int(len(normal_indices) * 0.4), replace=False)
    test_attack_indices = np.concatenate(
        [np.random.choice(attack_indices[i], int(len(attack_indices[i]) * 0.4), replace=False) for i in range(4)])
    test_indices = np.concatenate([test_normal_indices, test_attack_indices]).astype(int)
    # train_indices = np.array(list(set(np.arange(len(labels))) - set(test_indices)))
    attack_indices = np.concatenate(attack_indices).astype(int)
    indices = np.concatenate([normal_indices, attack_indices]).astype(int)
    train_indices = np.array(list(set(indices) - set(test_indices)))

    return sessions, labels, train_indices, test_indices
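
# --- Assumed glue code (not in the original file) ---
# The cells below use X_train/y_train, X_valid/y_valid and X_test/y_test, which are never
# constructed here. A minimal sketch of one way to build them from load_data()'s outputs,
# assuming preprocessing has already padded/encoded each session into a fixed-size
# [128, T] float array (the shapes are hypothetical):
sessions, labels, train_indices, test_indices = load_data()
X_all = np.asarray(sessions, dtype=np.float32)  # hypothetical shape: [N, 128, T]

X_train, y_train = X_all[train_indices], labels[train_indices]
X_test, y_test = X_all[test_indices], labels[test_indices]

# carve a validation split out of the training data
n_valid = int(0.1 * len(train_indices))
X_valid, y_valid = X_train[:n_valid], y_train[:n_valid]
X_train, y_train = X_train[n_valid:], y_train[n_valid:]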
"""# Architecture Overview

# CNN Motivation
** Build two parallel convolutional neural networks (CNNs) for the spatial feature
representation of the traffic data.

# Transformer-Encoder Motivation
** Use Transformer-Encoder layers.
** Max-pool the input before the Transformer to greatly reduce the number of parameters
the network needs to learn.
"""
class ByteBlock(nn.Module):
    """
    1D FCN: a one-dimensional fully convolutional network.

    in_channels: number of input channels. A 1D convolution over text has no channel
    axis as such, so in_channels is the word-vector dimension: if each word is a
    128-dim vector, then in_channels = 128.

    out_channels: number of output channels, i.e. the embedding dimension of each
    position after the convolution. With out_channels = 64, each word's embedding
    after this convolution is 64-dimensional.

    kernel_size: kernel width, usually an int. kernel_size=3 means each convolution
    step covers 3 words, i.e. the kernel has shape (in_channels, kernel_size).
    - In PyTorch, each word of a sequence is represented as a column vector.

    stride: step size along the convolution axis. With stride=2, if the current window
    covers positions 1-2-3, the next covers 3-4-5.

    padding: what to do when the window runs past the feature matrix.
    - str --> padding="valid": no padding, the leftover part is dropped.
              padding="same": pad on the right so input and output lengths match.
    - int --> padding=k: pad k columns on the right.
    """
    def __init__(self, in_channels, nb_filter=(64, 100), filter_length=(3, 3),
                 subsample=(2, 1), pool_length=(2, 2)):
        super(ByteBlock, self).__init__()

        layers = []
        for i in range(len(nb_filter)):
            layers.append(nn.Conv1d(in_channels, nb_filter[i], kernel_size=filter_length[i],
                                    padding=0, stride=subsample[i]))
            layers.append(nn.Tanh())
            if pool_length[i]:
                layers.append(nn.MaxPool1d(pool_length[i]))
            in_channels = nb_filter[i]

        self.block = nn.Sequential(*layers)
        self.global_pool = nn.AdaptiveMaxPool1d(1)

    def forward(self, x):
        x = self.block(x)
        x = self.global_pool(x).squeeze(dim=2)
        x = torch.nn.functional.leaky_relu(x)
        return x
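
# Quick shape check of ByteBlock (illustrative only; the sizes are made up):
# a batch of 4 sequences, 128-dim embeddings, length 50.
_demo_block = ByteBlock(128, nb_filter=(64, 100), filter_length=(3, 3),
                        subsample=(2, 1), pool_length=(2, 2))
_demo_x = torch.randn(4, 128, 50)   # [batch, in_channels, seq_len]
print(_demo_block(_demo_x).shape)   # torch.Size([4, 100]) after global max pooling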
class FCN_Transformer(nn.Module):
    # Define all layers present in the network
    def __init__(self, num_emotions):
        super().__init__()

        ################ TRANSFORMER BLOCK #############################
        self.transformer_maxpool = nn.MaxPool1d(2)

        # define a single transformer encoder layer
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=128,          # input feature dimension
            nhead=4,              # number of attention heads
            dim_feedforward=512,  # hidden dimension of the feed-forward network
            dropout=0.2,
            activation='relu'
        )

        # Using 4 identical stacked encoder layers
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)

        # 2 parallel sequential conv1D blocks
        self.conv1Dblock1 = ByteBlock(128, (128, 256), (5, 5), (1, 1), (2, 2))
        self.conv1Dblock2 = ByteBlock(128, (192, 320), (7, 5), (1, 1), (2, 2))

        # input dim must equal the concatenated embedding size:
        # 256 (block1) + 320 (block2) + 128 (transformer) = 704
        self.fc1_linear = nn.Linear(256 + 320 + 128, num_emotions)

        ### Softmax layer for the n output logits from final FC linear layer
        self.softmax_out = nn.Softmax(dim=1)

    # define one complete parallel fwd pass of input feature tensor thru 2*conv+1*transformer blocks
    def forward(self, x):

        conv1d_embedding1 = self.conv1Dblock1(x)  # x == N/batch * channel * time

        conv1d_embedding1 = torch.flatten(conv1d_embedding1, start_dim=1)

        conv1d_embedding2 = self.conv1Dblock2(x)

        conv1d_embedding2 = torch.flatten(conv1d_embedding2, start_dim=1)

        ########## 4-encoder-layer Transformer block ##############
        x_maxpool = self.transformer_maxpool(x)

        # drop dim 1 if it is a singleton (a no-op for [batch, 128, time] input)
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)

        # convert maxpooled feature map format: batch * freq * time ---> time * batch * freq
        # because the transformer encoder layer requires tensors in format: time * batch * embedding (freq)
        x = x_maxpool_reduced.permute(2, 0, 1)
        # print("x_maxpool_reduced: ", x_maxpool_reduced.shape)

        transformer_output = self.transformer_encoder(x)

        # mean over the time dimension -> one 128-dim embedding per sample
        transformer_embedding = torch.mean(transformer_output, dim=0)

        complete_embedding = torch.cat([conv1d_embedding1, conv1d_embedding2, transformer_embedding], dim=1)

        output_logits = self.fc1_linear(complete_embedding)

        output_softmax = self.softmax_out(output_logits)

        return output_logits, output_softmax
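
# Sanity-check the parallel forward pass with a dummy batch (illustrative sizes; this
# assumes the 704-dim fc1_linear input computed above): 2 samples, 128 channels, 200 steps.
_demo_model = FCN_Transformer(num_emotions=len(traffic_dict))
_demo_in = torch.randn(2, 128, 200)
_logits, _probs = _demo_model(_demo_in)
print(_logits.shape, _probs.shape)  # torch.Size([2, 5]) torch.Size([2, 5])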
"""# Inspect the model structure
"""
from torchsummary import summary

# need device to instantiate model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# instantiate model for the 5 traffic classes and move it to the device for the summary
model = FCN_Transformer(len(traffic_dict)).to(device)
print("\nmodel: \n", model)
# include input feature map dims (channels, length) in the call to summary()
summary(model, input_size=(128, 200))
"""## Define Loss/Criterion
"""
# define loss function; CrossEntropyLoss() is fairly standard for multiclass problems
def criterion(predictions, targets):
    return nn.CrossEntropyLoss()(input=predictions, target=targets)
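
# Why logits? nn.CrossEntropyLoss applies log-softmax internally, so it expects raw
# logits, not probabilities. A minimal check (illustrative): CE(logits) == NLL(log_softmax).
_logits = torch.randn(4, len(traffic_dict))  # [batch, classes]
_targets = torch.tensor([0, 2, 1, 4])
_ce = nn.CrossEntropyLoss()(_logits, _targets)
_nll = nn.NLLLoss()(torch.log_softmax(_logits, dim=1), _targets)
assert torch.allclose(_ce, _nll)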
"""## Choose Optimizer
https://github.com/IliaZenkov/transformer-cnn-emotion-recognition/blob/main/Parallel_is_All_You_Want.py
explains why SGD was chosen there:
# optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-3, momentum=0.8)
"""
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
"""## Define Training Step
Define a function that returns a single training step, one iteration of the model:
1. Forward pass to get the output logits and softmax probabilities.
2. Record the softmax probabilities to track accuracy.
3. Pass the output logits to the loss function to compute the loss.
4. Call the backward pass on the loss (backpropagate the error).
5. Tell the optimizer to apply one update step to the network parameters.
6. Zero the accumulated gradients in the optimizer for the next iteration.
"""
# define function to create a single step of the training phase
def make_train_step(model, criterion, optimizer):

    # define the training step of the training phase
    def train_step(X, Y):

        # forward pass
        output_logits, output_softmax = model(X)
        predictions = torch.argmax(output_softmax, dim=1)
        accuracy = torch.sum(Y == predictions) / float(len(Y))

        # compute loss on logits because nn.CrossEntropyLoss implements log softmax
        loss = criterion(output_logits, Y)

        # compute gradients for the optimizer to use
        loss.backward()

        # update network parameters based on the gradients stored (by calling loss.backward())
        optimizer.step()

        # zero out gradients for next pass
        # pytorch accumulates gradients from backwards passes
        optimizer.zero_grad()

        return loss.item(), accuracy * 100
    return train_step
"""## Define Validation Step
Define a function that returns a validation step over held-out X, y tensor pairs (10% of
the data), to see how well the model generalizes while training, so we can decide if and
when to stop it and tune hyperparameters.
Set the model to eval mode so that network parameters are not updated during validation,
and don't waste resources computing gradients in the validation phase: wrap it in torch.no_grad().
"""
def make_validate_fnc(model, criterion):
    def validate(X, Y):

        # don't want to update any network parameters on validation passes: don't need gradient
        # wrap in torch.no_grad to save memory and compute in validation phase:
        with torch.no_grad():

            # set model to validation phase i.e. turn off dropout and batchnorm layers
            model.eval()

            # get the model's predictions on the validation set
            output_logits, output_softmax = model(X)
            predictions = torch.argmax(output_softmax, dim=1)

            # calculate the mean accuracy over the entire validation set
            accuracy = torch.sum(Y == predictions) / float(len(Y))

            # compute error from logits (nn.CrossEntropyLoss implements softmax)
            loss = criterion(output_logits, Y)

        return loss.item(), accuracy * 100, predictions
    return validate
"""# Make Checkpoint Functions
Save a checkpoint of the model state after each epoch. When we are happy with the model's
performance, we can interrupt training and load the appropriate model binary. This gives us:

- recovery of training from hardware/software failures
- saved compute: after tuning, training can resume from a checkpoint instead of restarting
- easy early stopping, by keeping a snapshot of the best-performing version of the model
"""
def make_save_checkpoint():
    def save_checkpoint(optimizer, model, epoch, filename):
        checkpoint_dict = {
            'optimizer': optimizer.state_dict(),
            'model': model.state_dict(),
            'epoch': epoch
        }
        torch.save(checkpoint_dict, filename)
    return save_checkpoint

def load_checkpoint(optimizer, model, filename):
    checkpoint_dict = torch.load(filename)
    epoch = checkpoint_dict['epoch']
    model.load_state_dict(checkpoint_dict['model'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    return epoch
"""# Build Training Loop
Build the complete training loop from the training and validation step functions.

<br>
Training loop logic:

--Setup--
Instantiate the model.
Instantiate the model's training and validation steps, the loss function and the optimizer.
Move model to GPU.

--Epoch--
After each epoch's validation phase completes, set the model back to training mode.
Shuffle the training set each epoch, and reset the epoch loss and accuracy.

--Iteration--
For each iteration, build minibatch X_train, y_train tensors and move them to the GPU.
Take 1 train step with the X_train, y_train minibatch tensors.
Aggregate accuracy and loss over the iterations, but record them only once per epoch.

--Epoch--
Compute and record the validation accuracy for the whole epoch to track learning progress.
Print the training metrics after each epoch.
"""
# get training set size to calculate # iterations and minibatch indices
train_size = X_train.shape[0]

# pick minibatch size (of 32... always)
minibatch = 32

# set device to GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'{device} selected')

# instantiate model and move to GPU for training
model = FCN_Transformer(num_emotions=len(traffic_dict)).to(device)
print('Number of trainable params: ', sum(p.numel() for p in model.parameters()))

# instantiate the checkpoint save function
save_checkpoint = make_save_checkpoint()

# instantiate the training step function
train_step = make_train_step(model, criterion, optimizer=optimizer)

# instantiate the validation loop function
validate = make_validate_fnc(model, criterion)

# instantiate lists to hold scalar performance metrics to plot later
train_losses = []
valid_losses = []
# create training loop for one complete epoch (entire training set)
def train(optimizer, model, num_epochs, X_train, Y_train, X_valid, Y_valid):

    for epoch in range(num_epochs):

        # set model to train phase
        model.train()

        # shuffle entire training set in each epoch to randomize minibatch order
        ind = np.random.permutation(train_size)

        # shuffle the training set for each epoch:
        X_train = X_train[ind]
        Y_train = Y_train[ind]

        # instantiate scalar values to keep track of progress after each epoch so we can stop training when appropriate
        epoch_acc = 0
        epoch_loss = 0
        num_iterations = int(train_size / minibatch)

        # create a loop for each minibatch of 32 samples:
        for i in range(num_iterations):

            # we have to track and update minibatch position for the current minibatch
            # if we take a random batch position from a set, we almost certainly will skip some of the data in that set
            # track minibatch position based on iteration number:
            batch_start = i * minibatch
            # ensure we don't go out of the bounds of our training set:
            batch_end = min(batch_start + minibatch, train_size)
            # ensure we don't have an index error
            actual_batch_size = batch_end - batch_start

            # get training minibatch with all channels and feature dims
            X = X_train[batch_start:batch_end]
            # get training minibatch labels
            Y = Y_train[batch_start:batch_end]

            # instantiate training tensors
            X_tensor = torch.tensor(X, device=device).float()
            Y_tensor = torch.tensor(Y, dtype=torch.long, device=device)

            # Pass input tensors thru 1 training step (fwd+backwards pass)
            loss, acc = train_step(X_tensor, Y_tensor)

            # aggregate batch accuracy to measure progress of entire epoch
            epoch_acc += acc * actual_batch_size / train_size
            epoch_loss += loss * actual_batch_size / train_size

            # keep track of the iteration to see if the model's too slow
            print('\r' + f'Epoch {epoch}: iteration {i}/{num_iterations}', end='')

        # create tensors from validation set
        X_valid_tensor = torch.tensor(X_valid, device=device).float()
        Y_valid_tensor = torch.tensor(Y_valid, dtype=torch.long, device=device)

        # calculate validation metrics to keep track of progress; don't need predictions now
        valid_loss, valid_acc, _ = validate(X_valid_tensor, Y_valid_tensor)

        # accumulate scalar performance metrics at each epoch to track and plot later
        train_losses.append(epoch_loss)
        valid_losses.append(valid_loss)

        # Save checkpoint of the model
        checkpoint_filename = './checkpoints/FCN_TransformerFINAL-{:03d}.pkl'.format(epoch)
        save_checkpoint(optimizer, model, epoch, checkpoint_filename)

        # keep track of each epoch's progress
        print(f'\nEpoch {epoch} --- loss:{epoch_loss:.3f}, Epoch accuracy:{epoch_acc:.2f}%, Validation loss:{valid_loss:.3f}, Validation accuracy:{valid_acc:.2f}%')
# choose number of epochs higher than reasonable so we can manually stop training
num_epochs = 100

# train it!
train(optimizer, model, num_epochs, X_train, y_train, X_valid, y_valid)
"""# Check the Loss Curve's Behaviour
Let's see if we missed something egregious during training.
"""
plt.title('Loss Curve for Model')
plt.ylabel('Loss', fontsize=16)
plt.xlabel('Epoch', fontsize=16)
plt.plot(train_losses[:], 'b')
plt.plot(valid_losses[:], 'r')
plt.legend(['Training loss', 'Validation loss'])
plt.show()
"""
# Load a trained model for validation
"""
# pick load folder
load_folder = './checkpoints'

# pick the epoch to load; zero-padded to match the '{:03d}' checkpoint filename format
epoch = '060'
model_name = f'FCN_TransformerFINAL-{epoch}.pkl'

# make full load path
load_path = os.path.join(load_folder, model_name)

## instantiate empty model, move it to the device, and populate it with params from the binary
model = FCN_Transformer(len(traffic_dict)).to(device)
load_checkpoint(optimizer, model, load_path)

print(f'Loaded model from {load_path}')
"""# Evaluate the Model on Hold-Out Test Set
"""

# reinitialize validation function with model from chosen checkpoint
validate = make_validate_fnc(model, criterion)

# Convert test feature set array to tensor and move to GPU
X_test_tensor = torch.tensor(X_test, device=device).float()
# Convert test label array to tensor and move to GPU
y_test_tensor = torch.tensor(y_test, dtype=torch.long, device=device)

# Get the model's performance metrics using the validation function we defined
test_loss, test_acc, predicted_emotions = validate(X_test_tensor, y_test_tensor)

print(f'Test accuracy is {test_acc:.2f}%')
"""
# Analyze Performance on Test Set
"""

from sklearn.metrics import confusion_matrix
import seaborn as sn

# because the model was tested on the GPU, move the prediction tensor to the CPU then convert to an array
predicted_emotions = predicted_emotions.cpu().numpy()
# use labels from test set
emotions_groundtruth = y_test

# build confusion matrix and normalized confusion matrix
conf_matrix = confusion_matrix(emotions_groundtruth, predicted_emotions)
conf_matrix_norm = confusion_matrix(emotions_groundtruth, predicted_emotions, normalize='true')

# set labels for matrix axes from the traffic classes
emotion_names = [emotion for emotion in traffic_dict.values()]

# make a confusion matrix with labels using a DataFrame
confmatrix_df = pd.DataFrame(conf_matrix, index=emotion_names, columns=emotion_names)
confmatrix_df_norm = pd.DataFrame(conf_matrix_norm, index=emotion_names, columns=emotion_names)

# plot confusion matrices
plt.figure(figsize=(16, 6))
sn.set(font_scale=1.8)  # class label and title size
plt.subplot(1, 2, 1)
plt.title('Confusion Matrix')
sn.heatmap(confmatrix_df, annot=True, annot_kws={"size": 18})  # annot_kws sets the value font
plt.subplot(1, 2, 2)
plt.title('Normalized Confusion Matrix')
sn.heatmap(confmatrix_df_norm, annot=True, annot_kws={"size": 13})  # annot_kws sets the value font

plt.show()
@ -0,0 +1,14 @@
2013-11-09 SplitCap 2.1

 * Support for reading PCAP data from stdin with "-r -", for example in order to run:
   tcpdump -i eth0 -w - | mono SplitCap.exe -r -

2013-06-25 SplitCap 2.0

 * Changed from "\\" to System.IO.Path.DirectorySeparatorChar in order to run better
   on Linux and other non-Windows platforms.
   Hint: use the Mono framework to run SplitCap on Linux.
   Installation in Ubuntu with: apt-get install libmono2.0-cil

 * Added "-s seconds <s>" and "-s packets <c>" to split pcap files based on
   time or packet count (much like editcap).
@ -0,0 +1,19 @@
Copyright 2008-2011, Erik Hjelmvik <erik.hjelmvik[at]gmail.com>
SplitCap is available from http://www.netresec.com/?page=SplitCap

SOFTWARE LICENSE

SplitCap is licensed under the GNU General Public License Version 3.
http://www.gnu.org/licenses/gpl.html

SplitCap uses the assemblies PacketParser and PcapFileHandler, which both stem
from the NetworkMiner open source network forensics application available at:
http://networkminer.sourceforge.net/

The SplitCap open source project space is available on SourceForge:
http://sourceforge.net/projects/splitcap/

SplitCap was initially created as part of the Statistical Protocol IDentification
research project carried out by Erik Hjelmvik with funding from .SE.
More info on .SE is available at: http://www.iis.se/en/
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,55 @@
# Author: @vinesmsuic
#
#

import os
import shutil
from tqdm import tqdm
import random
import argparse

def parser():
    parser = argparse.ArgumentParser(description="Copying files")
    parser.add_argument("--limit", type=int, required=False, default=-1, help="only copy a number of files from each folder")
    return parser.parse_args()

def main():
    args = parser()

    src_dir = os.path.join('2_Flow', 'AllLayers')
    dst_dir = os.path.join('2_Flow_Processed', 'AllLayers')

    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    folders = os.listdir(src_dir)
    for folder in folders:

        if not os.path.exists(os.path.join(dst_dir, folder)):
            os.makedirs(os.path.join(dst_dir, folder))

        if os.path.isdir(os.path.join(dst_dir, folder)):

            print("Now Processing Folder: ", folder)

            copying_folders = os.listdir(os.path.join(src_dir, folder))

            # fixed seed so the sampled subset is reproducible
            random.seed(72)
            random.shuffle(copying_folders)

            if args.limit != -1:
                if len(copying_folders) > args.limit:
                    copying_folders = copying_folders[:args.limit]
                elif len(copying_folders) < args.limit:
                    print("Folder " + str(folder), "does not have the required " + str(args.limit) + " files. Folder only has " + str(len(copying_folders)) + " files.")

            for f in tqdm(copying_folders):
                full_file_name = os.path.join(src_dir, folder, f)
                if os.path.isfile(full_file_name):
                    shutil.copy(full_file_name, os.path.join(dst_dir, folder))

if __name__ == '__main__':
    main()
Binary file not shown.
@ -0,0 +1,17 @@
foreach($f in gci 1_Pcap *.pcap)
{
    echo "Now processing file : $f"
    0_Tool\SplitCap_2-1\SplitCap -p 100000 -b 100000 -r $f.FullName -s flow -o 2_Flow\AllLayers\$($f.BaseName)-ALL
    #0_Tool\SplitCap_2-1\SplitCap -p 100000 -b 100000 -r $f.FullName -s flow -o 2_Flow\L7\$($f.BaseName)-L7 -y L7

    echo "Done Splitting! Now Clearing 0KB size files..."
    # Delete pcap files with length equal to 0
    gci 2_Flow\AllLayers\$($f.BaseName)-ALL | ?{$_.Length -eq 0} | del
    echo "-------------------------------------------------"
}

echo "Now Eliminating duplicate flows..."
# Eliminate duplicate flows
0_Tool\finddupe -del 2_Flow\AllLayers
echo "-------------------------------------------------"
echo "Finished"
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff.