160 lines
5.1 KiB
Python
160 lines
5.1 KiB
Python
|
# Author: @vinesmsuic
|
||
|
#
|
||
|
#
|
||
|
|
||
|
import dpkt
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
import os
|
||
|
from tqdm import tqdm
|
||
|
import argparse
|
||
|
import random
|
||
|
|
||
|
def parser():
    """Parse the command-line options of the packet extraction script.

    Options (all integers):
        --packet  required; number of packets to keep per capture file.
        --byte    required; number of bytes each packet is trimmed/padded to.
        --limit   optional, defaults to -1; maximum number of directory
                  entries considered per folder (main() treats -1 as
                  "no limit").

    Returns:
        argparse.Namespace: exposes ``packet``, ``byte`` and ``limit``.
    """
    cli = argparse.ArgumentParser(
        description="Selecting Parameter of Packets and Bytes."
    )

    # Every option is an integer, so only the per-option settings are listed.
    option_table = (
        ("--packet", {"required": True,
                      "help": "number of required packets"}),
        ("--byte", {"required": True,
                    "help": "number of trimmed byte"}),
        ("--limit", {"required": False, "default": -1,
                     "help": "only extract packets from the largest N flows"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, type=int, **settings)

    return cli.parse_args()
|
||
|
|
||
|
# Sanitization
|
||
|
def zero_mask_packet(eth_packet):
    """Anonymise a parsed Ethernet frame in place by zeroing its addresses.

    The frame's MAC source/destination are always replaced with
    00:00:00:00:00:00. The network-layer source/destination are zeroed only
    when the payload is an IPv4 (class name ``IP``) or IPv6 (class name
    ``IP6``) object. Any other payload is left untouched, for example raw
    bytes (which cannot take attribute assignment) or a protocol whose
    address fields are not 4 or 16 bytes wide.

    Args:
        eth_packet: frame object exposing ``src``, ``dst`` and ``data``
            (created in this file with ``dpkt.ethernet.Ethernet``).

    Returns:
        The same ``eth_packet`` object, modified in place.
    """
    zero_mac = bytes(6)
    eth_packet.src = zero_mac
    eth_packet.dst = zero_mac

    # Width of the all-zero address depends on the IP version; dispatching on
    # the class name keeps the check used by the original code.
    zero_address_by_class = {
        'IP': bytes(4),     # IPv4 -> 0.0.0.0
        'IP6': bytes(16),   # IPv6 -> ::
    }
    payload = eth_packet.data
    zero_address = zero_address_by_class.get(type(payload).__name__)

    # Non-IP payloads are skipped instead of being treated as IPv4.
    if zero_address is not None:
        payload.src = zero_address
        payload.dst = zero_address

    return eth_packet
|
||
|
|
||
|
# Extract Information
|
||
|
def get_packets(pcap, packetnum, target_length):
    """Return exactly ``packetnum`` sanitised, fixed-length packets.

    Each record is parsed as an Ethernet frame, has its addresses zeroed by
    ``zero_mask_packet``, is serialised back to bytes and is trimmed or
    zero-padded to ``target_length`` by ``trimming``. Reading stops as soon
    as ``packetnum`` packets have been collected. If the capture holds fewer
    packets, the result is completed with all-zero packets.

    Args:
        pcap: iterable yielding ``(timestamp, buffer)`` pairs.
        packetnum: number of packets to return; a value <= 0 yields an
            empty list without reading the capture.
        target_length: byte length of every returned packet.

    Returns:
        list: ``packetnum`` byte strings, each ``target_length`` bytes long.
    """
    packetlist = []

    # Guarding here fixes the original behaviour for packetnum <= 0, where
    # the "stop" test ran only after appending and therefore never matched,
    # so the whole capture was returned.
    if packetnum > 0:
        for _timestamp, buf in pcap:
            eth_packet = zero_mask_packet(dpkt.ethernet.Ethernet(buf))
            packetlist.append(
                trimming(bytes(eth_packet), target_length=target_length)
            )
            if len(packetlist) == packetnum:
                break

    # If the capture has fewer packets than required, pad with whole packets
    # of zeros (range() of a non-positive number is empty, so no padding is
    # added when nothing is missing).
    missing = packetnum - len(packetlist)
    packetlist.extend(bytes(target_length) for _ in range(missing))

    return packetlist
|
||
|
|
||
|
|
||
|
def trimming(byte, target_length):
    """Force a byte sequence to ``target_length`` bytes.

    Args:
        byte: the packet content as a bytes-like sequence.
        target_length: the desired length.

    Returns:
        ``byte`` followed by zero bytes when it is too short, the first
        ``target_length`` bytes when it is too long, and ``byte`` itself
        when the length already matches.
    """
    shortfall = target_length - len(byte)

    # Too short: append the missing number of zero bytes.
    if shortfall > 0:
        return byte + bytearray(shortfall)

    # Too long: keep only the leading part.
    if shortfall < 0:
        return byte[:target_length]

    # Already the right size: nothing to do.
    return byte
|
||
|
|
||
|
|
||
|
def packet_from_file(file, packetnum, target_length):
    """Read a pcap file and return its packets via ``get_packets``.

    Args:
        file: path of the pcap file, opened in binary mode.
        packetnum: number of packets to return.
        target_length: byte length each packet is trimmed/padded to.

    Returns:
        The list produced by ``get_packets`` for this capture.
    """
    with open(file, 'rb') as capture_file:
        # The reader is consumed while the file is still open.
        return get_packets(
            dpkt.pcap.Reader(capture_file),
            packetnum=packetnum,
            target_length=target_length,
        )
|
||
|
|
||
|
|
||
|
def main():
    """Convert every flow folder into a pickled DataFrame of packet bytes.

    For each sub-folder of ``2_Flow/AllLayers``:
      1. list its entries and shuffle them with a fixed seed (72);
      2. if ``--limit`` is not -1, keep at most that many entries, or print
         a notice when the folder has fewer;
      3. extract ``--packet`` packets of ``--byte`` bytes from every
         remaining ``.pcap`` file;
      4. save a DataFrame with columns ``Path`` and ``Bytes`` to
         ``3_Packet/<folder>-p<packet>-b<byte>-l<abs(limit)>.pkl``.

    Entries of ``2_Flow/AllLayers`` that are not directories are skipped.
    """
    args = parser()

    directory = os.path.join('2_Flow', 'AllLayers')
    output_directory = '3_Packet'

    for folder in os.listdir(directory):
        folder_path = os.path.join(directory, folder)
        if not os.path.isdir(folder_path):
            continue

        print("Now Processing Folder: ", folder)

        searching_folders = os.listdir(folder_path)

        # TODO: optionally sort the entries by file size (largest first)
        # instead of shuffling, e.g. behind an ``--sort`` option.
        # Re-seeding per folder makes the shuffle depend only on the listing.
        random.seed(72)
        random.shuffle(searching_folders)

        if args.limit != -1:
            if len(searching_folders) > args.limit:
                searching_folders = searching_folders[:args.limit]
            elif len(searching_folders) < args.limit:
                print(
                    f"Folder {folder} does not have required {args.limit} "
                    f"files. Folder only has {len(searching_folders)} files."
                )

        # Collect rows in a list and build the frame once: DataFrame.append
        # was removed in pandas 2.0 and copied the frame on every call.
        rows = []
        for f in tqdm(searching_folders):
            if not f.endswith(".pcap"):
                continue
            path_to_file = os.path.join(folder_path, f)
            packets = packet_from_file(
                path_to_file, packetnum=args.packet, target_length=args.byte
            )
            rows.append({'Path': path_to_file, 'Bytes': packets})

        folder_df = pd.DataFrame(rows, columns=['Path', 'Bytes'])

        print("Row entries of "+str(folder)+": ", folder_df.shape[0])

        save_path = os.path.join(
            output_directory,
            f"{folder}-p{args.packet}-b{args.byte}-l{abs(args.limit)}.pkl",
        )

        # to_pickle does not create missing directories; make sure the
        # output folder exists so the extracted data is not lost.
        os.makedirs(output_directory, exist_ok=True)
        folder_df.to_pickle(save_path)
        print("Saved to file: ", save_path)
        print("-"*20)
|
||
|
|
||
|
# Run the extraction only when this file is executed as a script, so that
# importing it (e.g. to reuse the helpers) has no side effects.
if __name__ == "__main__":
    main()
|