Graduation_Project/LYZ/b.py

360 lines
15 KiB
Python
Raw Normal View History

2024-06-29 14:23:18 +08:00
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os, glob
import warnings; warnings.filterwarnings('ignore') #matplot lib complains about librosa
# Sampling rate (Hz) used for loading audio and extracting features;
# 48 kHz is the native rate of the RAVDESS recordings.
sample_rate = 48_000
# Mel Spectrograms are not directly used as a feature in this model
# Mel Spectrograms are used in calculating MFCCs, which are a higher-level representation of pitch transition
# MFCCs work better - left the mel spectrogram function here in case anyone wants to experiment
def feature_melspectrogram(
    waveform,
    sample_rate,
    fft=1024,
    winlen=512,
    window='hamming',
    hop=256,
    mels=128,
):
    """Return the decibel-scaled mel spectrogram of ``waveform``.

    Args:
        waveform: audio samples, forwarded to librosa as ``y``.
        sample_rate: sampling rate of ``waveform`` in Hz.
        fft: FFT size (``n_fft``).
        winlen: analysis window length (``win_length``).
        window: name of the window function.
        hop: hop length between successive STFT frames.
        mels: number of mel bands.

    Returns:
        The mel power spectrogram converted to decibels, referenced to its
        own maximum value.
    """
    # Imported locally: the file's import block does not import librosa, so
    # the module-level name would be undefined when this function runs.
    import librosa

    # The upper frequency bound is the Nyquist frequency (sample_rate / 2),
    # i.e. the whole representable band is kept.
    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_fft=fft,
        win_length=winlen,
        window=window,
        hop_length=hop,
        n_mels=mels,
        fmax=sample_rate / 2,
    )

    # Convert from power (amplitude**2) to decibels; the original author notes
    # the network does not converge on raw power spectrograms.
    return librosa.power_to_db(mel_power, ref=np.max)
def feature_mfcc(
    waveform,
    sample_rate,
    n_mfcc=40,
    fft=1024,
    winlen=512,
    window='hamming',
    mels=128,
):
    """Compute the MFCCs of ``waveform`` over all STFT frames.

    Args:
        waveform: audio samples, forwarded to librosa as ``y``.
        sample_rate: sampling rate of ``waveform`` in Hz.
        n_mfcc: number of cepstral coefficients to return.
        fft: FFT size (``n_fft``).
        winlen: analysis window length (``win_length``).
        window: name of the window function.
        mels: number of mel bands the coefficients are derived from.

    Returns:
        The MFCC matrix produced by ``librosa.feature.mfcc``.
    """
    # Imported locally: the file's import block does not import librosa, so
    # the module-level name would be undefined when this function runs.
    import librosa

    # The hop length is deliberately left at librosa's default: the original
    # author tried hop=256, which only increased the number of time steps and
    # was not helpful.
    return librosa.feature.mfcc(
        y=waveform,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        n_fft=fft,
        win_length=winlen,
        window=window,
        n_mels=mels,
        fmax=sample_rate / 2,  # Nyquist frequency
    )
def get_features(waveforms, features, samplerate):
    """Append the MFCCs of every waveform to ``features`` and return it.

    Args:
        waveforms: sized collection of waveforms to process.
        features: list that receives one MFCC matrix per waveform; it is
            extended in place.
        samplerate: sampling rate (Hz) of the waveforms. It is forwarded to
            ``feature_mfcc``; previously the argument was ignored and the
            module-level ``sample_rate`` was used instead.

    Returns:
        The same ``features`` list that was passed in.
    """
    total = len(waveforms)
    for file_count, waveform in enumerate(waveforms, start=1):
        features.append(feature_mfcc(waveform, samplerate))
        # '\r' rewrites the same console line to show running progress
        print('\r'+f' Processed {file_count}/{total} waveforms',end='')
    return features
def get_waveforms(file):
    """Load one audio file as a fixed-length, zero-padded waveform.

    Three seconds of audio are read starting 0.5 s into the file (skipping
    the leading silence) at the module-level ``sample_rate``.

    Args:
        file: path of the audio file, forwarded to ``librosa.load``.

    Returns:
        A 1-D float array of exactly ``3 * sample_rate`` samples; shorter
        recordings are padded with trailing zeros.
    """
    # Imported locally: the file's import block does not import librosa, so
    # the module-level name would be undefined when this function runs.
    import librosa

    clip_seconds = 3
    # The sample rate returned by librosa.load is not needed.
    samples, _ = librosa.load(file, duration=clip_seconds, offset=0.5, sr=sample_rate)

    # Fixed-size buffer so that every waveform has the same length.
    padded = np.zeros(int(sample_rate * clip_seconds))
    # Clamp the copy so a decode that is a few samples too long cannot make
    # the slice assignment fail.
    usable = min(len(samples), len(padded))
    padded[:usable] = samples[:usable]
    return padded
# Emotion labels keyed by their 0-based class index (as a string).
# NOTE(review): presumably the RAVDESS emotion codes were shifted so that
# 'surprised' lands on index 0 -- confirm against the label-loading code.
emotions_dict = {
    str(index): label
    for index, label in enumerate((
        'surprised',
        'neutral',
        'calm',
        'happy',
        'sad',
        'angry',
        'fearful',
        'disgust',
    ))
}
# Extra attribute codes mapped to their labels ('normal' / 'strong'),
# kept available for further experiments.
emotion_attributes = dict(zip(('01', '02'), ('normal', 'strong')))
class parallel_all_you_want(nn.Module):
    """Emotion classifier with two parallel 2D-CNN branches and a transformer branch.

    The same input feature map is fed to three branches whose embeddings are
    concatenated and passed to one linear layer:

    * two identical (but independently parameterised) 3-layer Conv2d stacks,
      each flattened to a 512-dim embedding;
    * a 4-layer transformer encoder over the time-maxpooled input, averaged
      over time to a 40-dim embedding.

    The layer sizes are fixed for inputs shaped like ``(N, 1, 40, 282)``
    (batch, channel, MFCC coefficient, time step): ``d_model=40`` requires 40
    rows, and the linear layer expects ``512 * 2 + 40 == 1064`` features.
    """

    def __init__(self, num_emotions):
        """Build all layers; ``num_emotions`` is the number of output classes."""
        super().__init__()

        ################ TRANSFORMER BLOCK #############################
        # Maxpool only along time (rectangular 1x4 kernel): 1x40x282 -> 1x40x70.
        self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1, 4], stride=[1, 4])

        # One encoder layer (multi-head self-attention + feedforward) as in
        # "Attention is All You Need": embedding dim 40, 4 attention heads,
        # feedforward network 40 --> 512 --> 40.
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=40,           # embedding dim == number of MFCC coefficients
            nhead=4,              # attention heads per multi-head attention
            dim_feedforward=512,  # hidden width of the feedforward network
            dropout=0.4,
            activation='relu',
        )
        # 4 stacked encoder layers (the paper uses 6).
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)

        ############### PARALLEL 2D CONVOLUTION BLOCKS ############
        # Each block: (1,40,282) --> (16,20,141) --> (32,5,35) --> (64,1,8).
        # Built in the same order as before so that parameter initialisation
        # under a fixed seed and state_dict keys are unchanged.
        self.conv2Dblock1 = self._make_conv_block()
        self.conv2Dblock2 = self._make_conv_block()

        ################# FINAL LINEAR BLOCK ####################
        # Each conv block yields a 64*1*8 == 512 embedding, the transformer a
        # 40-dim one: 512*2 + 40 == 1064 input features --> num_emotions logits.
        self.fc1_linear = nn.Linear(512 * 2 + 40, num_emotions)
        # Softmax over the class dimension (dim 1 of the (N, num_emotions) logits).
        self.softmax_out = nn.Softmax(dim=1)

    @staticmethod
    def _make_conv_block():
        """Return one conv stack: 3 x (Conv3x3 -> BatchNorm -> ReLU -> MaxPool -> Dropout)."""
        layers = []
        # (in_channels, out_channels, maxpool kernel/stride) for each stage
        for in_ch, out_ch, pool in ((1, 16, 2), (16, 32, 4), (32, 64, 4)):
            layers += [
                # 3x3 kernel, stride 1, zero-pad 1: spatial size is preserved
                nn.Conv2d(
                    in_channels=in_ch,
                    out_channels=out_ch,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                ),
                nn.BatchNorm2d(out_ch),  # normalise before the activation
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=pool, stride=pool),
                nn.Dropout(p=0.3),       # zero 30% of activations in training
            ]
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run one forward pass through the 2 conv branches and the transformer branch.

        Args:
            x: input tensor of shape (N, 1, freq, time), e.g. (N, 1, 40, 282).

        Returns:
            Tuple ``(output_logits, output_softmax)``: the raw logits (for a
            cross-entropy loss) and their softmax probabilities (for
            prediction), both of shape (N, num_emotions).
        """
        ############ parallel Conv2D blocks: 3 convolutional layers each ############
        # Flatten each 64x1x8 feature map to a 512-dim vector, keeping the
        # batch dimension.
        conv2d_embedding1 = torch.flatten(self.conv2Dblock1(x), start_dim=1)
        conv2d_embedding2 = torch.flatten(self.conv2Dblock2(x), start_dim=1)

        ########## 4-encoder-layer transformer block ##############
        # maxpool along time: (N,1,40,282) --> (N,1,40,70)
        x_maxpool = self.transformer_maxpool(x)
        # drop the channel dim: (N,1,40,70) --> (N,40,70)
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)
        # batch*freq*time --> time*batch*freq, the (sequence, batch, embedding)
        # layout the encoder expects with its default batch_first=False
        transformer_input = x_maxpool_reduced.permute(2, 0, 1)
        transformer_output = self.transformer_encoder(transformer_input)
        # average over time (dim 0): (70,N,40) --> (N,40)
        transformer_embedding = torch.mean(transformer_output, dim=0)

        ############# concatenate the three embeddings: (N, 512+512+40) ######
        complete_embedding = torch.cat(
            [conv2d_embedding1, conv2d_embedding2, transformer_embedding],
            dim=1,
        )

        ######### final FC linear layer and softmax #########################
        output_logits = self.fc1_linear(complete_embedding)
        output_softmax = self.softmax_out(output_logits)
        return output_logits, output_softmax
"""## Network tensor analysis
We zero-pad 1 the input feature map to each convolutional layer to get back from the layer the same shape tensor as we input: zero-pad 1 adds 2 to each of (H, W) dims, and the 3x3, stride 1 kernels cuts off (kernel - stride == 2) dims from each of (H,W). **Zero-pad 1 --> 3x3 stride 1 kernel effectively throws away the zero pads to get same input/output shape from each conv2D block.**
At the end of the first convolutional layer in each block we have a maxpool kernel of size 2x2, stride 2, which keeps 1 of the 4 pixels in its window (the maximum). For the input feature map the maxpool kernel will progress 40/2 = 20 times over the rows and 282/2 = 141 times over the columns. **Nonoverlapping maxpool kernel reduces each output dim to input dim/kernel size (rounded down).** With the output channels expanded to 16, this makes an output feature map of (16x20x141).
The next two convolutional layers in each block have a maxpool kernel size 4x4, stride 4. Same math as above, maxpool reduces each dim/4. 2nd conv layer takes (16x20x141) --> (32x5x35). 3rd and final conv layer takes (32x5x35) --> (64x1x8).
**Note that in (N,C,H,W) format, for MFCCs H = MFCC (pitch), W = time step.**
**Complete flow through each convolutional block (C,H,W):**
Layer 1 ---> 1x40x282 --> PAD-1 --> 1x42x284 --> FILTER --> 16x40x282 --> MAXPOOL 2x2 stride 2 --> 16x20x141
Layer 2 ---> 16x20x141 --> PAD-1 --> 16x22x143 --> FILTER --> 32x20x141 --> MAXPOOL 4x4 stride 4 --> 32x5x35
Layer 3 ---> 32x5x35 --> PAD-1 --> 32x7x37 --> FILTER --> 64x5x35 --> MAXPOOL 4x4 stride 4 --> 64x1x8
Flatten ---> 64x1x8 --> Final convolutional embedding length 512 1D array
**Complete flow through transformer encoder block (C,H,W):**
Maxpool 1x4 stride 1x4 ---> 1x40x282 --> 1x40x70
Drop channel ---> 1x40x70 --> 40x70 (H,W)
Change dims ---> 40x70 --> 70x40 (W,H)
4*Transformer encoder ---> 70x40 --> 70x40 (W,H)
Time average ---> 70x40 --> 1x40 --> Final transformer embedding length 40 1D array
**FC Linear network (C,H,W):**
Concatenate ---> 512+512+40 --> 1064
FC Linear layer ---> 1064 --> Final linear logits output length 8 1D array
Softmax layer: 8 ----> 1 predicted emotion / max probability class
We can confirm our network's tensor shapes and flow using the excellent torchsummary package which provides a PyTorch implementation of Keras' model.summary method:
"""
from torchsummary import summary

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Instantiate the model for the 8 emotions and move it to the selected device.
model = parallel_all_you_want(len(emotions_dict)).to(device)
# Dummy batch of 2 feature maps shaped (channel=1, freq=40, time=282).
data = torch.randint(255, size=(2, 1, 40, 282)).float()
print("data: ", data.type())
# Smoke-test one forward pass. The input must live on the same device as the
# model's weights; passing the CPU tensor to a CUDA model raises a RuntimeError.
model(data.to(device))
# include input feature map dims in call to summary()
summary(model, input_size=(1,40,282))