360 lines
15 KiB
Python
360 lines
15 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import pandas as pd
|
|
import os, glob
|
|
import warnings; warnings.filterwarnings('ignore') #matplot lib complains about librosa
|
|
|
|
|
|
# Sampling rate in Hz shared by audio loading and feature extraction below.
# 48 kHz is the native rate of the RAVDESS recordings.
sample_rate = 48_000
|
|
|
|
# Mel spectrograms are not fed to the model in this file. They are the
# intermediate representation MFCCs are derived from, and MFCCs worked better;
# this helper is kept for anyone who wants to experiment.
def feature_melspectrogram(
    waveform,
    sample_rate,
    fft=1024,
    winlen=512,
    window='hamming',
    hop=256,
    mels=128,
):
    """Return the decibel-scaled mel spectrogram of ``waveform``.

    NOTE(review): ``librosa`` is not imported in the visible part of this
    file -- confirm it is in scope before calling this function.

    Args:
        waveform: Audio samples, forwarded to librosa as ``y``.
        sample_rate: Sampling rate of ``waveform`` in Hz. This parameter
            shadows the module-level constant of the same name.
        fft: FFT size (``n_fft``).
        winlen: Analysis window length in samples (``win_length``).
        window: Window function name (``window``).
        hop: Hop between successive frames in samples (``hop_length``).
        mels: Number of mel bands (``n_mels``).

    Returns:
        The mel power spectrogram converted to decibels relative to its own
        maximum (``ref=np.max``). No averaging over frames is performed.
    """
    # STFT framing options, grouped so the librosa call below stays short.
    stft_options = dict(n_fft=fft, win_length=winlen, window=window, hop_length=hop)

    # The upper frequency bound is the Nyquist frequency (sample_rate / 2).
    power_spectrogram = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_mels=mels,
        fmax=sample_rate / 2,
        **stft_options,
    )

    # Convert power (amplitude**2) to decibels; per the original author the
    # network does not converge on raw power spectrograms.
    return librosa.power_to_db(power_spectrogram, ref=np.max)
|
|
|
|
def feature_mfcc(
    waveform,
    sample_rate,
    n_mfcc=40,
    fft=1024,
    winlen=512,
    window='hamming',
    mels=128,
):
    """Return the MFCC matrix of ``waveform`` computed over all STFT frames.

    NOTE(review): ``librosa`` is not imported in the visible part of this
    file -- confirm it is in scope before calling this function.

    Args:
        waveform: Audio samples, forwarded to librosa as ``y``.
        sample_rate: Sampling rate of ``waveform`` in Hz. This parameter
            shadows the module-level constant of the same name.
        n_mfcc: Number of cepstral coefficients to return.
        fft: FFT size (``n_fft``).
        winlen: Analysis window length in samples (``win_length``).
        window: Window function name (``window``).
        mels: Number of mel bands the coefficients are derived from
            (``n_mels``).

    Returns:
        Whatever ``librosa.feature.mfcc`` returns for these settings
        (presumably an ``(n_mfcc, frames)`` array -- confirm against librosa).
    """
    # hop_length is deliberately not passed, so librosa's default applies.
    # The original author tried hop=256; it increased the number of time
    # steps and was not helpful.
    mfcc_options = {
        'n_mfcc': n_mfcc,
        'n_fft': fft,
        'win_length': winlen,
        'window': window,
        'n_mels': mels,
        # Upper frequency bound is the Nyquist frequency.
        'fmax': sample_rate / 2,
    }
    return librosa.feature.mfcc(y=waveform, sr=sample_rate, **mfcc_options)
|
|
|
|
def get_features(waveforms, features, samplerate):
    """Compute MFCCs for every waveform and append them to ``features``.

    Args:
        waveforms: Sized collection of waveforms (``len()`` is used for the
            progress display); each item is passed to ``feature_mfcc``.
        features: List that receives one MFCC result per waveform. It is
            mutated in place and also returned.
        samplerate: Sampling rate in Hz forwarded to ``feature_mfcc``.
            Previously this argument was ignored in favour of the
            module-level ``sample_rate``.

    Returns:
        The same ``features`` list, after appending.
    """
    total = len(waveforms)

    # Process each waveform individually; file_count doubles as the progress counter.
    for file_count, waveform in enumerate(waveforms, start=1):
        features.append(feature_mfcc(waveform, samplerate))
        # '\r' rewrites the same console line instead of scrolling.
        print('\r'+f' Processed {file_count}/{total} waveforms',end='')

    return features
|
|
|
|
def get_waveforms(file):
    """Load one audio file and return it as a fixed-length, zero-padded array.

    Reads up to 3 seconds starting 0.5 s into the file (skipping the leading
    silence, per the original author), at the module-level ``sample_rate``.

    NOTE(review): ``librosa`` is not imported in the visible part of this
    file -- confirm it is in scope before calling this function.

    Args:
        file: Path (or other source accepted by ``librosa.load``) of the
            audio file.

    Returns:
        A 1-D NumPy array of length ``sample_rate * 3`` holding the loaded
        samples followed by zeros.
    """
    clip_seconds = 3

    # The sample rate returned by librosa.load is not needed; it was requested explicitly.
    samples, _ = librosa.load(file, duration=clip_seconds, offset=0.5, sr=sample_rate)

    # Fixed-size buffer so every waveform has the same length regardless of
    # how much audio the file actually contained.
    padded = np.zeros(int(sample_rate * clip_seconds))
    padded[:len(samples)] = samples

    return padded
|
|
|
|
# RAVDESS emotion labels, keyed by the class index as a string.
# Indices are shifted to be 0-based for PyTorch, which places 'surprised' at '0'.
emotions_dict = {
    str(index): label
    for index, label in enumerate(
        (
            'surprised',
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
        )
    )
}
|
|
|
|
# Extra RAVDESS attribute codes kept for experimentation
# (presumably the dataset's intensity field -- confirm against the filename spec).
emotion_attributes = {'01': 'normal', '02': 'strong'}
|
|
|
|
class parallel_all_you_want(nn.Module):
    """Two parallel 2D-CNN branches plus a transformer-encoder branch.

    All three branches read the same input feature map. Their embeddings are
    concatenated and passed through one linear layer to produce class logits.

    The hard-coded layer sizes assume an input of shape ``(N, 1, 40, 282)``
    (batch, channel, MFCC coefficient, time step):

    * each CNN branch:    (1, 40, 282) -> (16, 20, 141) -> (32, 5, 35)
      -> (64, 1, 8), flattened to 512 values;
    * transformer branch: (1, 40, 282) -> maxpool [1, 4] -> (1, 40, 70)
      -> 70 time steps of 40 features -> mean over time -> 40 values;
    * linear layer:       512 * 2 + 40 = 1064 inputs -> ``num_emotions`` logits.

    Args:
        num_emotions: Number of output classes.
    """

    def __init__(self, num_emotions):
        super().__init__()

        ################ TRANSFORMER BLOCK #############################
        # Pool only along the time axis before the transformer: (40, 282) -> (40, 70).
        # Per the original author a rectangular kernel worked better for the
        # rectangular input feature map.
        self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1, 4], stride=[1, 4])

        # One encoder layer (self-attention + feedforward) as in
        # "Attention Is All You Need".
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=40,  # feature size of each time step (the 40-row axis of the input)
            nhead=4,  # 4 attention heads, 40 / 4 = 10 dims per head
            dim_feedforward=512,  # feedforward network: 40 --> 512 --> 40
            dropout=0.4,
            activation='relu',  # ReLU: avoid saturation/tame gradient/reduce compute time
        )

        # Stack of 4 encoder layers (the paper uses 6).
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)

        ############### PARALLEL 2D CONVOLUTION BLOCKS ############
        # Two structurally identical branches with independent weights.
        # They are created in this order so parameter initialisation consumes
        # the RNG exactly as the hand-written version did.
        self.conv2Dblock1 = self._make_conv_block()
        self.conv2Dblock2 = self._make_conv_block()

        ################# FINAL LINEAR BLOCK ####################
        # Concatenated embedding: 512 (conv 1) + 512 (conv 2) + 40 (transformer).
        self.fc1_linear = nn.Linear(512 * 2 + 40, num_emotions)

        # Softmax over the class dimension (dim 1 of the (N, num_emotions) logits).
        self.softmax_out = nn.Softmax(dim=1)

    @staticmethod
    def _make_conv_block():
        """Build one CNN branch: three conv -> batchnorm -> ReLU -> maxpool -> dropout stages."""
        # (in_channels, out_channels, maxpool kernel/stride) for each stage.
        stage_specs = ((1, 16, 2), (16, 32, 4), (32, 64, 4))

        layers = []
        for in_channels, out_channels, pool in stage_specs:
            layers += [
                # 3x3 kernel, stride 1, zero-pad 1: spatial size is preserved.
                nn.Conv2d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                ),
                nn.BatchNorm2d(out_channels),  # normalise before the activation
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=pool, stride=pool),
                nn.Dropout(p=0.3),  # zero 30% of the activations during training
            ]
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run one forward pass through both CNN branches and the transformer branch.

        Args:
            x: Input tensor in (N, channel, freq, time) layout; the layer
                sizes assume (N, 1, 40, 282).

        Returns:
            Tuple ``(output_logits, output_softmax)``, both of shape
            ``(N, num_emotions)``. The logits are what a cross-entropy loss
            needs; the softmax values are class probabilities for prediction.
        """
        ############ Parallel Conv2D blocks: 3 convolutional stages each ############
        # Flatten everything except the batch dimension: (N, 64, 1, 8) -> (N, 512).
        conv2d_embedding1 = torch.flatten(self.conv2Dblock1(x), start_dim=1)
        conv2d_embedding2 = torch.flatten(self.conv2Dblock2(x), start_dim=1)

        ########## 4-encoder-layer Transformer block ##############
        # Pool along time: (N, 1, 40, 282) -> (N, 1, 40, 70).
        x_maxpool = self.transformer_maxpool(x)

        # Drop the channel dimension: (N, 1, 40, 70) -> (N, 40, 70).
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)

        # The encoder (batch_first left at its default) expects
        # (time, batch, embedding): (N, 40, 70) -> (70, N, 40).
        transformer_input = x_maxpool_reduced.permute(2, 0, 1)

        transformer_output = self.transformer_encoder(transformer_input)

        # Average over the time dimension (dim 0): (70, N, 40) -> (N, 40).
        transformer_embedding = torch.mean(transformer_output, dim=0)

        ############# Concatenate the three embeddings: (N, 512 + 512 + 40) ######
        complete_embedding = torch.cat(
            [conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim=1
        )

        ######### Final linear layer and softmax #########################
        output_logits = self.fc1_linear(complete_embedding)
        output_softmax = self.softmax_out(output_logits)

        return output_logits, output_softmax
|
|
|
|
"""## Network tensor analysis
|
|
We zero-pad the input feature map of each convolutional layer by 1 so that the layer returns a tensor of the same shape as its input: a zero-pad of 1 adds 2 to each of the (H, W) dims, and a 3x3, stride 1 kernel removes (kernel - stride == 2) from each of (H, W). **Zero-pad 1 --> 3x3 stride 1 kernel effectively throws away the zero pads, so every convolution keeps the same input/output (H, W) shape.**
|
|
|
|
At the end of the first convolutional layer in each block we have a maxpool kernel of size 2x2, stride 2, which keeps 1 of the 4 pixels in its window. For the input feature map the maxpool kernel will progress 128/2 = 64 times over the rows and 563/2 = 281 times (rounded down) over the columns. **A non-overlapping maxpool kernel reduces each output dim to input dim/kernel size.** The convolution has already expanded the channels to 16 before pooling, so the output feature map is (16x64x281).
|
|
|
|
The next two convolutional layers in each block have a maxpool kernel size 8x8, stride 8. Same math as above, maxpool reduces each dim/8. 2nd conv layer takes (16x64x281) --> (32x8x35). 3rd and final conv layer takes (32x8x35) --> (64x1x4).
|
|
|
|
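**Note that in (N,C,H,W) format, for MFCCs H = MFCC (pitch), W = time step.** The shapes worked through in this section assume a 1x128x563 input with 8x8 maxpools in layers 2 and 3; the model defined above is sized for a 1x40x282 MFCC input with 4x4 maxpools, so each convolutional block ends at 64x1x8 (512 values), the transformer embedding has length 40, and the linear layer takes 512*2+40 = 1064 inputs.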
|
|
|
|
**Complete flow through each convolutional block (C,H,W):**
|
|
|
|
Layer 1 ---> 1x128x563 --> PAD-1 --> 1x130x565 --> FILTER --> 16x128x563 --> MAXPOOL 2x2 stride 2 --> 16x64x281
|
|
|
|
Layer 2 ---> 16x64x281 --> PAD-1 --> 16x66x283 --> FILTER --> 32x64x281 --> MAXPOOL 8x8 stride 8 --> 32x8x35
|
|
|
|
Layer 3 ---> 32x8x35 --> PAD-1 --> 32x10x37 --> FILTER --> 64x8x35 --> MAXPOOL 8x8 stride 8 --> 64x1x4
|
|
|
|
Flatten ---> 64x1x4 --> Final convolutional embedding length 256 1D array
|
|
|
|
|
|
**Complete flow through transformer encoder block (C,H,W):**
|
|
|
|
Maxpool 2x4 stride 2x4 ---> 1x128x563 --> 1x64x140
|
|
|
|
Drop channel ---> 1x64x140 --> 64x140 (H,W)
|
|
|
|
Change dims ---> 64x140 --> 140x64 (W,H)
|
|
|
|
4*Transformer encoder ---> 140x64 --> 140x64 (W,H); the encoder preserves the sequence shape

Time average ---> 140x64 --> 1x64 --> Final transformer embedding length 64 1D array
|
|
|
|
**FC Linear network (C,H,W):**
|
|
|
|
Concatenate ---> 256 + 256 + 64 --> 576
|
|
|
|
FC Linear layer ---> 576 --> Final linear logits output length 8 1D array
|
|
|
|
Softmax layer: 8 ----> 1 predicted emotion / max probability class
|
|
|
|
|
|
We can confirm our network's tensor shapes and flow using the excellent torchsummary package which provides a PyTorch implementation of Keras' model.summary method:
|
|
"""
|
|
|
|
from torchsummary import summary
|
|
|
|
# Use the GPU when one is available. The model and every tensor fed to it
# must live on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model with one output per emotion and move it to `device`.
model = parallel_all_you_want(len(emotions_dict)).to(device)

# Smoke test: push a random batch of 2 inputs shaped (1, 40, 282) through the
# model. The batch is moved to `device` as well; leaving it on the CPU makes
# the forward pass fail with a device mismatch whenever CUDA is selected.
data = torch.randint(255, size=(2, 1, 40, 282)).float().to(device)
print("data: ", data.type())
model(data)

# Print a layer-by-layer summary; the input size excludes the batch dimension.
summary(model, input_size=(1,40,282))
|
|
|
|
|