main
Huey 2024-06-29 14:23:18 +08:00
parent 88306d79fa
commit 4a9a683850
143 changed files with 4207 additions and 0 deletions

185
LYZ/BiLSTM.py Normal file
View File

@ -0,0 +1,185 @@
"""
Task: Text sentiment classification with a Bi-LSTM and an attention mechanism
Author: ChengJunkai @github.com/Cheng0829
Email: chengjunkai829@gmail.com
Date: 2022/09/14
Reference: Tae Hwan Jung(Jeff Jung) @graykode
"""
import numpy as np
import torch, time, os, sys
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
'''1. Data preprocessing'''
def pre_process(sentences):
word_sequence = " ".join(sentences).split()
word_list = []
'''
If list(set(word_sequence)) were used for deduplication, the result would be a list in arbitrary
order (sets are unordered), so the word-to-index dictionary would differ between runs and a model
saved from a previous run would very likely be unusable this time
(e.g. last time the model learned i:0, love:1 -> you:2, but this time "you" sits at index 3,
so the saved weights can no longer produce the correct output).
'''
for word in word_sequence:
if word not in word_list:
word_list.append(word)
word_dict = {w: i for i, w in enumerate(word_list)}
word_dict["''"] = len(word_dict)
word_list.append("''") # list.append mutates in place and returns None, so do not reassign it
vocab_size = len(word_dict) # vocabulary size (16 for the sample sentences)
max_size = 0
for sen in sentences:
if len(sen.split()) > max_size:
max_size = len(sen.split()) # maximum sentence length in words (3 here)
for i in range(len(sentences)):
if len(sentences[i].split()) < max_size:
sentences[i] = sentences[i] + " ''" * (max_size - len(sentences[i].split()))
return sentences, word_list, word_dict, vocab_size, max_size
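# Hedged illustration (not part of the original script) of what pre_process returns for the six sample
# sentences defined in __main__: max_size is 3, so "don't leave" is padded to "don't leave ''";
# word_dict maps the 15 distinct words plus "''" to indices 0..15 (vocab_size == 16), and building it in
# insertion order keeps those indices stable across runs, unlike list(set(...)).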
def make_batch(sentences):
# For each sentence, build a list of the indices of its words
inputs = [np.array([word_dict[n] for n in sen.split()]) for sen in sentences] # [6,3]
targets = [out for out in labels]
inputs = torch.LongTensor(np.array(inputs)).to(device)
targets = torch.LongTensor(np.array(targets)).to(device)
'''For sentiment classification the embedding matrix is learned; there is no eye() one-hot matrix'''
return inputs, targets
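# Hedged illustration (not part of the original script): with the sample data, "i love you" becomes the
# index row [0, 1, 2], so inputs has shape [6, 3]; targets is the LongTensor [1, 1, 1, 0, 0, 0].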
class BiLSTM_Attention(nn.Module):
def __init__(self):
super(BiLSTM_Attention, self).__init__()
'''For sentiment classification the embedding matrix is learned; there is no eye() one-hot matrix'''
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)
self.out = nn.Linear(2 * n_hidden, num_classes)
def forward(self, X): # X: [6, 3]
# input : [batch_size, n_step, embedding_dim] = [6, 3, 3]
input = self.embedding(X)
# input : [n_step, batch_size, embedding_dim] = [3, 6, 3]
# i.e. [sequence length (time steps), batch size, embedding dimension]
input = input.permute(1, 0, 2)
# hidden_state : [num_layers(=1)*num_directions(=2), batch_size, n_hidden]
# hidden_state : [num_layers * num_directions, batch_size, hidden size (number of hidden units)]
hidden_state = torch.zeros(1 * 2, len(X), n_hidden).to(device)
# cell_state : [num_layers*num_directions, batch_size, hidden_size]
# cell_state : [num_layers * num_directions, batch_size, hidden size (number of hidden units)]
cell_state = torch.zeros(1 * 2, len(X), n_hidden).to(device)
# final_hidden_state, final_cell_state : [num_layers(=1)*num_directions(=2), batch_size, n_hidden]
lstm_output, (final_hidden_state, final_cell_state) = self.lstm(input, (hidden_state, cell_state))
# lstm_output (after permute) : [batch_size, n_step, n_hidden*num_directions(=2)]
lstm_output = lstm_output.permute(1, 0, 2)
attn_output, attention = self.attention_net(lstm_output, final_hidden_state)
# model : [batch_size, num_classes], attention : [batch_size, n_step]
return self.out(attn_output), attention
'''Two bmm weighted sums, equivalent to two explicit for-loops'''
# lstm_output : [batch_size, n_step, n_hidden*num_directions(=2)] [6,3,16]
# final_hidden_state : [num_layers(=1)*num_directions(=2), batch_size, n_hidden] [2,6,8]
def attention_net(self, lstm_output, final_hidden_state):
# final_hidden_state : [batch_size, n_hidden*num_directions(=2), 1(=n_layer)] [6,16,1]
final_hidden_state = final_hidden_state.view(-1, 2 * n_hidden, 1)
'''First bmm weighted sum: lstm_output and final_hidden_state produce the attention weights attn_weights'''
# [6,3,16]*[6,16,1] -> [6,3,1] -> attn_weights : [batch_size, n_step] [6,3]
attn_weights = torch.bmm(lstm_output, final_hidden_state).squeeze(2) # drop the 3rd dimension
softmax_attn_weights = F.softmax(attn_weights, 1) # softmax over the time steps [6,3]
'''Second bmm weighted sum: lstm_output and the attention weights produce the context vector, i.e. the attention-weighted model output'''
# [batch_size, n_hidden*num_directions, n_step] * [batch_size,n_step,1] \
# = [batch_size, n_hidden*num_directions, 1] : [6,16,3] * [6,3,1] -> [6,16,1] -> [6,16]
context = torch.bmm(lstm_output.transpose(1, 2), softmax_attn_weights.unsqueeze(2)).squeeze(2)
softmax_attn_weights = softmax_attn_weights.to('cpu') # numpy arrays can only live on the CPU
'''Downstream use of context differs per task; unlike Seq2Seq, this classifier does not concatenate it with a decoder output'''
return context, softmax_attn_weights.data.numpy()
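# --- Hedged sketch (not in the original script): a stand-alone shape check of the two-bmm attention
# above, using dummy tensors with the [6, 3, 16] / [2, 6, 8] shapes assumed in the comments.
_lstm_out = torch.randn(6, 3, 16) # [batch_size, n_step, 2*n_hidden]
_final_h = torch.randn(2, 6, 8).view(-1, 16, 1) # -> [batch_size, 2*n_hidden, 1]
_w = F.softmax(torch.bmm(_lstm_out, _final_h).squeeze(2), 1) # attention weights [6, 3]
_ctx = torch.bmm(_lstm_out.transpose(1, 2), _w.unsqueeze(2)).squeeze(2) # context vector [6, 16]
assert _w.shape == (6, 3) and _ctx.shape == (6, 16)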
if __name__ == '__main__':
chars = 30 * '*'
embedding_dim = 3 # embedding size
n_hidden = 8 # number of hidden units in one cell
num_classes = 2 # 0 or 1
'''Why the GPU can be slower than the CPU here:
data transfer carries a large overhead, and for a tiny network like this one
the GPU's advantage in matrix computation never becomes noticeable.
'''
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# 3 words sentences (=sequence_length is 3)
sentences = ["i love you", "he loves me", "don't leave",
"i hate you", "sorry for that", "this is awful"]
labels = [1, 1, 1, 0, 0, 0] # 1 is good, 0 is not good.
'''1. Data preprocessing'''
sentences, word_list, word_dict, vocab_size, max_size = pre_process(sentences)
inputs, targets = make_batch(sentences)
'''2. Build the model'''
model = BiLSTM_Attention()
print(model)
model.to(device)
criterion = nn.CrossEntropyLoss() # cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
if os.path.exists('model_param.pt'):
# load the saved parameters into the model
model.load_state_dict(torch.load('model_param.pt', map_location=device))
'''3. Training'''
print('{}\nTrain\n{}'.format('*' * 30, '*' * 30))
loss_record = []
for epoch in range(10000):
optimizer.zero_grad()
output, attention = model(inputs)
output = output.to(device)
loss = criterion(output, targets)
loss.backward()
optimizer.step()
# print(loss)
if loss >= 0.001: # early stopping: stop once the loss has stayed below 0.001 for 30 consecutive epochs
loss_record = []
else:
loss_record.append(loss.item())
if len(loss_record) == 30:
torch.save(model.state_dict(), 'model_param.pt')
break
if (epoch + 1) % 1000 == 0:
print('Epoch:', '%04d' % (epoch + 1), 'Loss = {:.6f}'.format(loss))
torch.save(model.state_dict(), 'model_param.pt')
'''4. Testing'''
print('{}\nTest\n{}'.format('*' * 30, '*' * 30))
test_text = 'sorry i hate you'
# Build a matrix of word indices (the extra brackets add a batch dimension)
tests = [np.array([word_dict[n] for n in test_text.split()])]
test_batch = torch.LongTensor(np.array(tests)).to(device)
predict, attn_test = model(test_batch)
predict = predict.data.max(1, keepdim=True)[1]
print('The emotion of "%s" is ' % test_text, end='')
if predict[0][0] == 0:
print('bad!')
else:
print('good!')
'''5. Visualize the attention weight matrix'''
fig = plt.figure(figsize=(0.5 * len(sentences), 0.5 * max_size)) # [batch_size, n_step]
ax = fig.add_subplot(1, 1, 1)
# attention : (6, 3)
ax.matshow(attention, cmap='viridis')
word_show = ['word'] * max_size
word_show = [word_show[i] + '_' + str(i + 1) for i in range(max_size)] # ['word_1', 'word_2', 'word_3']
ax.set_xticklabels([''] + word_show, fontdict={'fontsize': 14})
sentence_show = ['sentence'] * len(sentences)
sentence_show = [sentence_show[i] + '_' + str(i + 1) for i in range(
len(sentence_show))] # ['sentence_1', 'sentence_2', ..., 'sentence_6']
ax.set_yticklabels([''] + sentence_show, fontdict={'fontsize': 14})
plt.show()

530
LYZ/FCN_Transformer.py Normal file
View File

@ -0,0 +1,530 @@
# -*- coding: utf-8 -*-
"""
A Transformer encoder network is combined with two parallel convolutional neural networks (CNNs) to classify traffic data: the CNNs provide the spatial feature representation and the Transformer the temporal one.
Because of the sequential nature of the data, the Transformer is also used to model, as accurately as possible, the temporal relations between tone transitions in the signal.
The stacked CNN network is combined with the multi-head self-attention layers of the Transformer encoder,
** exploiting the CNN's strength in spatial feature representation and the Transformer's strength in sequence encoding.
Possible follow-up: further parallelize the CNNs or borrow BERT's bidirectional structure.
#### Setup
"""
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os, glob, time
import pickle
from timeit import default_timer as timer
# matplot lib complains about librosa
import warnings
warnings.filterwarnings('ignore')
# classes index
traffic_dict ={
0: 'Normal',
1: 'BFSSH',
2: 'Infilt',
3: 'HttpDoS',
4: 'DDoS'
}
# network traffic attributes
traffic_attributes = {
'01': 'normal', # normal traffic
'02': 'anomaly' # abnormal
}
"""## Load Data
"""
# path to data for glob
DATA_DIR = 'D:\\PyProject\\malware_traffic\\3_Packet\\'
def load_data():
"""
Load the pickled sessions and labels and build the train/test index sets.
:return: sessions, labels, train_indices, test_indices
"""
t1 = timer()
sessions = []
labels = []
num_pkls = len(glob.glob(DATA_DIR + 'ISCX2012_labels_*.pkl')) # count the matching label pickle files
for i in range(num_pkls):
# if i != 1:
# continue
session_pkl = DATA_DIR + 'ISCX2012_pcaps_' + str(i) + '.pkl'
session_lists = pickle.load(open(session_pkl, 'rb')) # deserialize the pickled sessions
sessions.extend(session_lists.values.tolist()) # append the session records
label_pkl = DATA_DIR + 'ISCX2012_labels_' + str(i) + '.pkl'
label_lists = pickle.load(open(label_pkl, 'rb'))
labels.extend(label_lists.values.tolist())
print(i)
t2 = timer()
print("load data tims: ", t2 - t1)
labels = np.array(labels)
normal_indices = np.where(labels == 0)[0] # row indices of the normal samples (an array)
# The full dataset is too large to train on comfortably, so 100000 normal flows are kept below; ideally a
# fixed number of normal samples would already be selected during preprocessing to save memory.
normal_indices = np.random.choice(normal_indices, 100000, replace=False) # subsample the normal traffic
attack_indices = [np.where(labels == i)[0] for i in range(1, 5)] # rows of labels 1~4 (a list of arrays)
# np.random.choice samples with replacement by default; pass replace=False to avoid duplicates
test_normal_indices = np.random.choice(normal_indices, int(len(normal_indices) * 0.4), replace=False)
test_attack_indices = np.concatenate( # 40% of each attack class goes to the test set
[np.random.choice(attack_indices[i], int(len(attack_indices[i]) * 0.4), replace=False) for i in range(4)])
test_indices = np.concatenate([test_normal_indices, test_attack_indices]).astype(int)
# train_indices = np.array(list(set(np.arange(len(labels))) - set(test_indices)))
attack_indices = np.concatenate(attack_indices).astype(int)
indices = np.concatenate([normal_indices, attack_indices]).astype(int) # all selected samples
train_indices = np.array(list(set(indices) - set(test_indices))) # everything selected that is not in the test set
return sessions, labels, train_indices, test_indices
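# --- Hedged sketch (not in the original file): the same index arithmetic as load_data(), run on a toy
# label vector so the 60/40 train/test split above is easier to follow.
_toy_labels = np.array([0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4])
_toy_classes = [np.where(_toy_labels == c)[0] for c in range(5)] # indices per class
_toy_test = np.concatenate([np.random.choice(idx, max(1, int(len(idx) * 0.4)), replace=False)
                            for idx in _toy_classes]).astype(int) # ~40% of each class for testing
_toy_train = np.array(sorted(set(range(len(_toy_labels))) - set(_toy_test.tolist())))
print('toy split -> train:', _toy_train, 'test:', sorted(_toy_test.tolist()))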
"""# Architecture Overview
# CNN Motivation
** Two parallel convolutional neural networks (CNNs) build the spatial feature representation of the traffic data.
# Transformer-Encoder Motivation
** A stack of Transformer-Encoder layers is used.
** The input is max-pooled before entering the Transformer, greatly reducing the number of parameters the network has to learn.
"""
class ByteBlock(nn.Module):
"""
1D FCN: a one-dimensional fully convolutional network.
in_channels: number of input channels. For 1-D convolution over token embeddings this is the embedding
dimension, e.g. in_channels = 128 if every token is represented by a 128-dimensional vector.
out_channels: number of output channels, i.e. the embedding dimension after the convolution; with
out_channels = 64, every position is represented by a 64-dimensional vector afterwards.
kernel_size: size of the convolution kernel, usually an int. kernel_size = 3 means each convolution step
covers 3 tokens, so the effective kernel shape is (in_channels, kernel_size).
- In PyTorch, each token of a sequence is represented by a column vector.
stride: step of the sliding window along the sequence. With stride = 2, if the current window covers
positions 1-2-3, the next one covers positions 3-4-5.
padding: how the tail of the sequence is handled when it is too short for a full kernel.
- str --> padding='valid': no padding, the remainder is dropped; padding='same': pad on the right so the output length matches the input length.
- int --> padding=k: pad k columns on the right.
"""
def __init__(self, in_channels, nb_filter=(64, 100), filter_length=(3, 3),
subsample=(2, 1), pool_length=(2, 2)):
super(ByteBlock, self).__init__()
layers = []
for i in range(len(nb_filter)):
layers.append(nn.Conv1d(in_channels, nb_filter[i], kernel_size=filter_length[i],
padding=0, stride=subsample[i]))
layers.append(nn.Tanh())
if pool_length[i]:
layers.append(nn.MaxPool1d(pool_length[i]))
in_channels = nb_filter[i]
self.block = nn.Sequential(*layers)
self.global_pool = nn.AdaptiveMaxPool1d(1)
def forward(self, x):
x = self.block(x)
x = self.global_pool(x).squeeze(dim=2)
x = torch.nn.functional.leaky_relu(x)
return x
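# --- Hedged sketch (not in the original file): how a ByteBlock transforms shapes, matching the Conv1d
# parameter description above. A dummy [batch=4, in_channels=128, seq_len=512] tensor goes in and one
# pooled [batch, nb_filter[-1]] embedding per sample comes out.
_demo_block = ByteBlock(128, nb_filter=(128, 256), filter_length=(5, 5), subsample=(1, 1), pool_length=(2, 2))
_demo_out = _demo_block(torch.randn(4, 128, 512))
print('ByteBlock demo output shape:', _demo_out.shape) # expected: torch.Size([4, 256])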
class FCN_Transformer(nn.Module):
# Define all layers present in the network
def __init__(self,num_emotions):
super().__init__()
################ TRANSFORMER BLOCK #############################
self.transformer_maxpool = nn.MaxPool1d(2)
# define single transformer encoder layer
transformer_layer = nn.TransformerEncoderLayer(
d_model=128, # input feature dimension
nhead=4, # number of attention heads
dim_feedforward=512, # hidden dimension of the feed-forward sub-layer
dropout=0.2,
activation='relu'
)
# Using 4 identical stacked encoder layers
self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)
# 1 sequential conv1D layers
self.conv1Dblock1 = ByteBlock(128, (128, 256), (5, 5), (1, 1), (2, 2))
self.conv1Dblock2 = ByteBlock(128, (192, 320), (7, 5), (1, 1), (2, 2))
self.fc1_linear = nn.Linear(256 + 320 + 128, num_emotions) # conv block outputs (256 + 320) + transformer embedding (128) = 704
### Softmax layer for the n output logits from final FC linear layer
self.softmax_out = nn.Softmax(dim=1)
# define one complete parallel fwd pass of input feature tensor thru 2*conv+1*transformer blocks
def forward(self,x):
conv1d_embedding1 = self.conv1Dblock1(x) # x : [batch, in_channels(=128), seq_len]
conv1d_embedding1 = torch.flatten(conv1d_embedding1, start_dim=1)
conv1d_embedding2 = self.conv1Dblock2(x)
conv1d_embedding2 = torch.flatten(conv1d_embedding2, start_dim=1)
########## 4-encoder-layer Transformer block ##############
x_maxpool = self.transformer_maxpool(x)
x_maxpool_reduced = torch.squeeze(x_maxpool, 1) # a no-op for 3D input, kept from the 2D original
# convert the maxpooled feature map format: batch * channels * time ---> time * batch * channels
# because the transformer encoder layer expects tensors in the format: time * batch * embedding (d_model)
x = x_maxpool_reduced.permute(2,0,1)
# print("x_maxpool_reduced: ",x_maxpool_reduced.shape)
transformer_output = self.transformer_encoder(x)
transformer_embedding = torch.mean(transformer_output, dim=0) # average over the time dimension -> [batch, d_model]
complete_embedding = torch.cat([conv1d_embedding1, conv1d_embedding2,transformer_embedding], dim=1)
output_logits = self.fc1_linear(complete_embedding)
output_softmax = self.softmax_out(output_logits)
return output_logits, output_softmax
"""# 查看模型结构
"""
from torchsummary import summary
# need device to instantiate model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# instantiate the model for the 5 traffic classes and move it to the chosen device for the summary
model = FCN_Transformer(len(traffic_dict)).to(device)
print("\nmodel: \n", model)
# include the per-sample input dims in the call to summary(); Conv1d expects [in_channels, seq_len]
# (seq_len=512 here is only illustrative)
summary(model, input_size=(128, 512))
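# --- Hedged sanity check (not in the original file): push one dummy batch through the network to confirm
# that the concatenated embedding (256 + 320 + 128 = 704) matches the final linear layer.
_demo_logits, _demo_softmax = model(torch.randn(2, 128, 512).to(device))
print('demo logits shape:', _demo_logits.shape, '| softmax row sums:', _demo_softmax.sum(dim=1))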
"""## Define Loss/Criterion
"""
# define loss function; CrossEntropyLoss() fairly standard for multiclass problems
def criterion(predictions, targets):
return nn.CrossEntropyLoss()(input=predictions, target=targets)
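# --- Hedged sketch (not in the original file): criterion() takes raw logits rather than softmax outputs,
# because nn.CrossEntropyLoss applies log-softmax internally.
_toy_logits = torch.randn(3, len(traffic_dict)) # 3 samples, 5 classes
_toy_targets = torch.tensor([0, 2, 4]) # integer class labels
print('demo cross-entropy loss:', criterion(_toy_logits, _toy_targets).item())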
"""## Choose Optimizer
https://github.com/IliaZenkov/transformer-cnn-emotion-recognition/blob/main/Parallel_is_All_You_Want.py
which discusses why SGD was chosen there (the SGD line below is kept for reference; Adam is used here)
# optimizer = torch.optim.SGD(model.parameters(),lr=0.01, weight_decay=1e-3, momentum=0.8)
"""
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
"""## Define Training Step
Define a function that returns a single training step, i.e. one iteration of the model:
1. forward pass producing logits and softmax probabilities
2. record the softmax probabilities to track accuracy
3. pass the logits to the loss function to compute the loss
4. call backward on the loss (backpropagate the error)
5. tell the optimizer to apply one update step to the network parameters
6. zero the accumulated gradients in the optimizer for the next iteration
"""
# define function to create a single step of the training phase
def make_train_step(model, criterion, optimizer):
# define the training step of the training phase
def train_step(X,Y):
# forward pass
output_logits, output_softmax = model(X)
predictions = torch.argmax(output_softmax,dim=1)
accuracy = torch.sum(Y==predictions)/float(len(Y))
# compute loss on logits because nn.CrossEntropyLoss implements log softmax
loss = criterion(output_logits, Y)
# compute gradients for the optimizer to use
loss.backward()
# update network parameters based on gradient stored (by calling loss.backward())
optimizer.step()
# zero out gradients for next pass
# pytorch accumulates gradients from backwards passes
optimizer.zero_grad()
return loss.item(), accuracy*100
return train_step
"""## Define Validation Step
Define a function that returns a validation step on a held-out 10% X, y tensor pair, to gauge how well the model generalizes during training and decide if and when to stop it and tune hyperparameters.
Make sure network parameters are not updated during validation by switching the model to eval mode, and do not waste resources computing gradients in the validation phase: wrap the pass in torch.no_grad().
"""
def make_validate_fnc(model,criterion):
def validate(X,Y):
# don't want to update any network parameters on validation passes: don't need gradient
# wrap in torch.no_grad to save memory and compute in validation phase:
with torch.no_grad():
# set model to validation phase i.e. turn off dropout and batchnorm layers
model.eval()
# get the model's predictions on the validation set
output_logits, output_softmax = model(X)
predictions = torch.argmax(output_softmax,dim=1)
# calculate the mean accuracy over the entire validation set
accuracy = torch.sum(Y==predictions)/float(len(Y))
# compute error from logits (nn.crossentropy implements softmax)
loss = criterion(output_logits,Y)
return loss.item(), accuracy*100, predictions
return validate
"""# Make Checkpoint Functions
Save a checkpoint of the model state after every epoch; once the model's performance is satisfactory, training can be interrupted and the appropriate model binary loaded. This gives us:
- recovery of training after hardware/software failures
- cheaper re-training: resume from a checkpoint after adjustments instead of retraining from scratch
- easy early stopping by keeping a snapshot of the highest-performing version of the model
"""
def make_save_checkpoint():
def save_checkpoint(optimizer, model, epoch, filename):
checkpoint_dict = {
'optimizer': optimizer.state_dict(),
'model': model.state_dict(),
'epoch': epoch
}
torch.save(checkpoint_dict, filename)
return save_checkpoint
def load_checkpoint(optimizer, model, filename):
checkpoint_dict = torch.load(filename)
epoch = checkpoint_dict['epoch']
model.load_state_dict(checkpoint_dict['model'])
if optimizer is not None:
optimizer.load_state_dict(checkpoint_dict['optimizer'])
return epoch
"""# Build Training Loop
Build the complete training loop from the training and validation step functions.
<br>
Training loop logic:
--Setup--
Instantiate the model.
Instantiate the training and validation steps, the loss function and the optimizer.
Move the model to the GPU.
--Epoch--
After the validation phase of each epoch completes, set the model back to training mode.
Shuffle the training set at every epoch and reset the epoch loss and accuracy.
--Iteration--
For every iteration, build mini-batch X_train, Y_train tensors and move them to the GPU.
Take 1 train step with the X_train, Y_train minibatch tensors.
Aggregate accuracy and loss across the iterations, but only record them after each epoch.
--Epoch--
Compute and record the validation accuracy for the whole epoch to track learning progress.
Print the training metrics after each epoch.
"""
# get training set size to calculate # iterations and minibatch indices
train_size = X_train.shape[0]
# pick minibatch size (of 32... always)
minibatch = 32
# set device to GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'{device} selected')
# instantiate model and move to GPU for training
model = FCN_Transformer(num_emotions=len(traffic_dict)).to(device)
print('Number of trainable params: ',sum(p.numel() for p in model.parameters()) )
# instantiate the checkpoint save function
save_checkpoint = make_save_checkpoint()
# instantiate the training step function
train_step = make_train_step(model, criterion, optimizer=optimizer)
# instantiate the validation loop function
validate = make_validate_fnc(model,criterion)
# instantiate lists to hold scalar performance metrics to plot later
train_losses=[]
valid_losses = []
# create training loop for one complete epoch (entire training set)
def train(optimizer, model, num_epochs, X_train, Y_train, X_valid, Y_valid):
for epoch in range(num_epochs):
# set model to train phase
model.train()
# shuffle entire training set in each epoch to randomize minibatch order
ind = np.random.permutation(train_size)
# shuffle the training set for each epoch:
X_train = X_train[ind,:,:,:]
Y_train = Y_train[ind]
# instantiate scalar values to keep track of progress after each epoch so we can stop training when appropriate
epoch_acc = 0
epoch_loss = 0
num_iterations = int(train_size / minibatch)
# create a loop for each minibatch of 32 samples:
for i in range(num_iterations):
# we have to track and update minibatch position for the current minibatch
# if we take a random batch position from a set, we almost certainly will skip some of the data in that set
# track minibatch position based on iteration number:
batch_start = i * minibatch
# ensure we don't go out of the bounds of our training set:
batch_end = min(batch_start + minibatch, train_size)
# ensure we don't have an index error
actual_batch_size = batch_end-batch_start
# get training minibatch with all channels and 1D feature dims
X = X_train[batch_start:batch_end,:,:,:]
# get training minibatch labels
Y = Y_train[batch_start:batch_end]
# instantiate training tensors
X_tensor = torch.tensor(X, device=device).float()
Y_tensor = torch.tensor(Y, dtype=torch.long,device=device)
# Pass input tensors thru 1 training step (fwd+backwards pass)
loss, acc = train_step(X_tensor,Y_tensor)
# aggregate batch accuracy to measure progress of entire epoch
epoch_acc += acc * actual_batch_size / train_size
epoch_loss += loss * actual_batch_size / train_size
# keep track of the iteration to see if the model's too slow
print('\r'+f'Epoch {epoch}: iteration {i}/{num_iterations}',end='')
# create tensors from validation set
X_valid_tensor = torch.tensor(X_valid,device=device).float()
Y_valid_tensor = torch.tensor(Y_valid,dtype=torch.long,device=device)
# calculate validation metrics to keep track of progress; don't need predictions now
valid_loss, valid_acc, _ = validate(X_valid_tensor,Y_valid_tensor)
# accumulate scalar performance metrics at each epoch to track and plot later
train_losses.append(epoch_loss)
valid_losses.append(valid_loss)
# Save checkpoint of the model
checkpoint_filename = './checkpoints/FCN_TransformerFINAL-{:03d}.pkl'.format(epoch)
save_checkpoint(optimizer, model, epoch, checkpoint_filename)
# keep track of each epoch's progress
print(f'\nEpoch {epoch} --- loss:{epoch_loss:.3f}, Epoch accuracy:{epoch_acc:.2f}%, Validation loss:{valid_loss:.3f}, Validation accuracy:{valid_acc:.2f}%')
# choose number of epochs higher than reasonable so we can manually stop training
num_epochs = 100
# train it!
train(optimizer, model, num_epochs, X_train, y_train, X_valid, y_valid)
"""# Check the Loss Curve's Behaviour
Let's see if we missed something egregious during training.
"""
plt.title('Loss Curve for Model')
plt.ylabel('Loss', fontsize=16)
plt.xlabel('Epoch', fontsize=16)
plt.plot(train_losses[:],'b')
plt.plot(valid_losses[:],'r')
plt.legend(['Training loss','Validation loss'])
plt.show()
"""
# Load a trained model for evaluation
"""
# pick load folder
load_folder = './checkpoints'
# pick the epoch to load
epoch = '060' # checkpoints are saved with a zero-padded 3-digit epoch number
model_name = f'FCN_TransformerFINAL-{epoch}.pkl'
# make full load path
load_path = os.path.join(load_folder, model_name)
## instantiate empty model and populate with params from binary
model = FCN_Transformer(len(traffic_dict))
load_checkpoint(optimizer, model, load_path)
print(f'Loaded model from {load_path}')
"""# Evaluate the Model on Hold-Out Test Set
"""
# reinitialize validation function with model from chosen checkpoint
validate = make_validate_fnc(model,criterion)
# Convert 4D test feature set array to tensor and move to GPU
X_test_tensor = torch.tensor(X_test,device=device).float()
# Convert 4D test label set array to tensor and move to GPU
y_test_tensor = torch.tensor(y_test,dtype=torch.long,device=device)
# Get the model's performance metrics using the validation function we defined
test_loss, test_acc, predicted_emotions = validate(X_test_tensor,y_test_tensor)
print(f'Test accuracy is {test_acc:.2f}%')
"""
# Analyze Performance on Test Set
"""
from sklearn.metrics import confusion_matrix
import seaborn as sn
# because model tested on GPU, move prediction tensor to CPU then convert to array
predicted_emotions = predicted_emotions.cpu().numpy()
# use labels from test set
emotions_groundtruth = y_test
# build confusion matrix and normalized confusion matrix
conf_matrix = confusion_matrix(emotions_groundtruth, predicted_emotions)
conf_matrix_norm = confusion_matrix(emotions_groundtruth, predicted_emotions,normalize='true')
# set labels for matrix axes from emotions
emotion_names = [emotion for emotion in traffic_dict.values()]
# make a confusion matrix with labels using a DataFrame
confmatrix_df = pd.DataFrame(conf_matrix, index=emotion_names, columns=emotion_names)
confmatrix_df_norm = pd.DataFrame(conf_matrix_norm, index=emotion_names, columns=emotion_names)
# plot confusion matrices
plt.figure(figsize=(16,6))
sn.set(font_scale=1.8) # emotion label and title size
plt.subplot(1,2,1)
plt.title('Confusion Matrix')
sn.heatmap(confmatrix_df, annot=True, annot_kws={"size": 18}) #annot_kws is value font
plt.subplot(1,2,2)
plt.title('Normalized Confusion Matrix')
sn.heatmap(confmatrix_df_norm, annot=True, annot_kws={"size": 13}) #annot_kws is value font
plt.show()

View File

@ -0,0 +1,14 @@
2013-11-09 SplitCap 2.1
* Support for reading PCAP data from stdin with "-r -", for example in order to run:
tcpdump -i eth0 -w - | mono SplitCap.exe -r -
2013-06-25 SplitCap 2.0
* Changed from "\\" to System.IO.Path.DirectorySeparatorChar in order to run better
on Linux and other non-Windows platforms.
Hint: use Mono framework to run SplitCap in Linux.
Installation in Ubuntu with: apt-get install libmono2.0-cil
* Added "-s seconds <s>" and "-s packets <c>" to split pcap files based on
time or packet count (much like editcap).

View File

@ -0,0 +1,19 @@
Copyright 2008-2011, Erik Hjelmvik <erik.hjelmvik[at]gmail.com>
SplitCap is available from http://www.netresec.com/?page=SplitCap
SOFTWARE LICENSE
SplitCap is licensed under the GNU General Public License Version 3.
http://www.gnu.org/licenses/gpl.html
SplitCap uses the assemblies PacketParser and PcapFileHandler, which both stem
from the NetworkMiner open source network forensics application available at:
http://networkminer.sourceforge.net/
The SplitCap open source project space is available on SourceForge:
http://sourceforge.net/projects/splitcap/
SplitCap was initially created as part of the Statistical Protocol IDentification
research project carried out by Erik Hjelmvik with fundings from .SE.
More info on .SE is available at: http://www.iis.se/en/

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,55 @@
# Author: @vinesmsuic
#
#
import os
import shutil
from tqdm import tqdm
import random
import argparse
def parser():
parser = argparse.ArgumentParser(description="Copying files")
parser.add_argument("--limit", type=int, required=False, default=-1, help="only copy a number of files each folder")
return parser.parse_args()
def main():
args = parser()
src_dir = os.path.join('2_Flow','AllLayers')
dst_dir = os.path.join('2_Flow_Processed','AllLayers')
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
folders = os.listdir(src_dir)
for folder in folders:
if not os.path.exists(os.path.join(dst_dir,folder)):
os.makedirs(os.path.join(dst_dir,folder))
if(os.path.isdir(os.path.join(dst_dir,folder))):
print("Now Processing Folder: ", folder)
copying_folders = os.listdir(os.path.join(src_dir,folder))
random.seed(72)
random.shuffle(copying_folders)
if(args.limit!= -1):
if(len(copying_folders) > args.limit):
copying_folders = copying_folders[:args.limit]
elif(len(copying_folders) < args.limit):
print("Folder "+str(folder), "does not have required "+str(args.limit) + "files. Folder only has "+str(len(copying_folders)) + " files.")
for f in tqdm(copying_folders):
full_file_name = os.path.join(src_dir, folder, f)
if os.path.isfile(full_file_name):
shutil.copy(full_file_name, os.path.join(dst_dir, folder))
if __name__ == '__main__':
main()

Binary file not shown.

View File

@ -0,0 +1,17 @@
foreach($f in gci 1_Pcap *.pcap)
{
echo "Now processing file : $f"
0_Tool\SplitCap_2-1\SplitCap -p 100000 -b 100000 -r $f.FullName -s flow -o 2_Flow\AllLayers\$($f.BaseName)-ALL
#0_Tool\SplitCap_2-1\SplitCap -p 100000 -b 100000 -r $f.FullName -s flow -o 2_Flow\L7\$($f.BaseName)-L7 -y L7
echo "Done Spliting! Now Clearing 0KB size files..."
# Delete pcap files length equal to 0
gci 2_Flow\AllLayers\$($f.BaseName)-ALL | ?{$_.Length -eq 0} | del
echo "-------------------------------------------------"
}
echo "Now Eliminating duplicate flows..."
# Eliminate duplicate Flows
0_Tool\finddupe -del 2_Flow\AllLayers
echo "-------------------------------------------------"
echo "Finished"

View File

Some files were not shown because too many files have changed in this diff.