# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os, glob
import warnings; warnings.filterwarnings('ignore') # matplotlib complains about librosa

# RAVDESS native sample rate is 48k
sample_rate = 48000

# Mel spectrograms are not directly used as a feature in this model
# Mel spectrograms are used in calculating MFCCs, which are a higher-level representation of pitch transition
# MFCCs work better - the mel spectrogram function is left here in case anyone wants to experiment
def feature_melspectrogram(
    waveform,
    sample_rate,
    fft=1024,
    winlen=512,
    window='hamming',
    hop=256,
    mels=128,
    ):
    # Produce the mel spectrogram for all STFT frames
    # fmax = sample_rate/2 (the Nyquist frequency) is used as the upper frequency bound
    melspectrogram = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_fft=fft,
        win_length=winlen,
        window=window,
        hop_length=hop,
        n_mels=mels,
        fmax=sample_rate/2)

    # convert from power (amplitude**2) to decibels
    # necessary for network to learn - doesn't converge with raw power spectrograms
    melspectrogram = librosa.power_to_db(melspectrogram, ref=np.max)

    return melspectrogram

def feature_mfcc(
    waveform,
    sample_rate,
    n_mfcc=40,
    fft=1024,
    winlen=512,
    window='hamming',
    #hop=256, # increases # of time steps; was not helpful
    mels=128
    ):
    # Compute the MFCCs for all STFT frames
    # n_mfcc = 40 --> 40 coefficients per frame, computed from 128 mel filterbanks (n_mels)
    mfc_coefficients = librosa.feature.mfcc(
        y=waveform,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        n_fft=fft,
        win_length=winlen,
        window=window,
        #hop_length=hop,
        n_mels=mels,
        fmax=sample_rate/2
        )

    return mfc_coefficients

def get_features(waveforms, features, samplerate):
    # initialize counter to track progress
    file_count = 0

    # process each waveform individually to get its MFCCs
    for waveform in waveforms:
        mfccs = feature_mfcc(waveform, samplerate)
        features.append(mfccs)
        file_count += 1
        # print progress
        print('\r'+f' Processed {file_count}/{len(waveforms)} waveforms',end='')

    # return all features from list of waveforms
    return features

def get_waveforms(file):
    # load an individual sample audio file
    # read the full 3 seconds of the file, cut off the first 0.5s of silence; native sample rate = 48k
    # don't need to store the sample rate that librosa.load returns
    waveform, _ = librosa.load(file, duration=3, offset=0.5, sr=sample_rate)

    # make sure waveform vectors are homogeneous by zero-padding to exactly 3 seconds
    waveform_homo = np.zeros((int(sample_rate*3),))
    waveform_homo[:len(waveform)] = waveform

    # return a single file's waveform
    return waveform_homo

# RAVDESS dataset emotions
# emotion labels are 0-indexed for PyTorch: RAVDESS codes 01-07 keep their value, code 08 (surprised) wraps to 0
emotions_dict = {
    '0':'surprised',
    '1':'neutral',
    '2':'calm',
    '3':'happy',
    '4':'sad',
    '5':'angry',
    '6':'fearful',
    '7':'disgust'
}

# Additional attributes from RAVDESS to play with
emotion_attributes = {
    '01': 'normal',
    '02': 'strong'
}
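# --- Illustrative sketch (not part of the original code): building the feature set ---
# A minimal example of how the helpers above could be wired together. The dataset
# location and the 'Actor_*' folder layout are assumptions; the emotion label is
# parsed from the standard RAVDESS filename convention (third hyphen-separated
# field, e.g. '03-01-06-01-02-01-12.wav' --> code 06), with code 08 (surprised)
# wrapping to label 0 to match emotions_dict.
def load_data_example(data_path='RAVDESS_PATH'):  # hypothetical dataset root
    waveforms, emotions = [], []
    for file in glob.glob(os.path.join(data_path, 'Actor_*', '*.wav')):
        # parse the 0-indexed emotion label from the filename
        emotion_code = int(os.path.basename(file).split('-')[2])
        emotions.append(emotion_code % 8)  # 08 (surprised) --> 0
        # load the fixed-length 3-second waveform for this file
        waveforms.append(get_waveforms(file))
    # compute a 40-coefficient MFCC matrix for every waveform
    features = get_features(waveforms, [], sample_rate)
    return np.stack(features), np.array(emotions)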
class parallel_all_you_want(nn.Module):
    # Define all layers present in the network
    def __init__(self,num_emotions):
        super().__init__()

        ################ TRANSFORMER BLOCK #############################
        # maxpool the input feature map/tensor to the transformer
        # a rectangular kernel worked better here for the rectangular input spectrogram feature map/tensor
        self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1,4], stride=[1,4])

        # define single transformer encoder layer
        # self-attention + feedforward network from "Attention is All You Need" paper
        # 4 attention heads per encoder layer, each layer with a 40-->512-->40 feedforward network
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=40, # input feature (frequency) dim after maxpooling 40*282 -> 40*70 (freq*time)
            nhead=4, # 4 attention heads in each multi-head self-attention layer in each encoder block
            dim_feedforward=512, # 2 linear layers in each encoder block's feedforward network: dim 40-->512-->40
            dropout=0.4,
            activation='relu' # ReLU: avoid saturation/tame gradient/reduce compute time
        )

        # I'm using 4 instead of the 6 identical stacked encoder layers used in the Attention is All You Need paper
        # Complete transformer block contains 4 full transformer encoder layers (each w/ multihead self-attention+feedforward)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)

        ############### 1ST PARALLEL 2D CONVOLUTION BLOCK ############
        # 3 sequential conv2D layers: (1,40,282) --> (16,20,141) --> (32,5,35) --> (64,1,8)
        self.conv2Dblock1 = nn.Sequential(

            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels=1, # input volume depth == input channel dim == 1
                out_channels=16, # expand output feature map volume's depth to 16
                kernel_size=3, # typical 3*3 stride 1 kernel
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(), # feature map --> activation map
            nn.MaxPool2d(kernel_size=2, stride=2), # typical maxpool kernel size
            nn.Dropout(p=0.3), # randomly zero 30% of 1st layer's output feature map in training

            # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
            nn.Conv2d(
                in_channels=16,
                out_channels=32, # expand output feature map volume's depth to 32
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4), # increase maxpool kernel for subsequent filters
            nn.Dropout(p=0.3),

            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels=32,
                out_channels=64, # expand output feature map volume's depth to 64
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
        )

        ############### 2ND PARALLEL 2D CONVOLUTION BLOCK ############
        # 3 sequential conv2D layers: (1,40,282) --> (16,20,141) --> (32,5,35) --> (64,1,8)
        self.conv2Dblock2 = nn.Sequential(

            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels=1, # input volume depth == input channel dim == 1
                out_channels=16, # expand output feature map volume's depth to 16
                kernel_size=3, # typical 3*3 stride 1 kernel
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(), # feature map --> activation map
            nn.MaxPool2d(kernel_size=2, stride=2), # typical maxpool kernel size
            nn.Dropout(p=0.3), # randomly zero 30% of 1st layer's output feature map in training

            # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
            nn.Conv2d(
                in_channels=16,
                out_channels=32, # expand output feature map volume's depth to 32
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4), # increase maxpool kernel for subsequent filters
            nn.Dropout(p=0.3),

            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels=32,
                out_channels=64, # expand output feature map volume's depth to 64
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
        )

        ################# FINAL LINEAR BLOCK ####################
        # Linear softmax layer to take the final concatenated embedding tensor
        #    from the parallel 2D convolutional and transformer blocks, output 8 logits
        # Each full convolution block outputs a (64*1*8) feature map flattened to a dim 512 1D array
        # Full transformer block outputs a 40*70 feature map, which we time-average to a dim 40 1D array
        # 512*2+40 == 1064 input features --> 8 output emotions
        self.fc1_linear = nn.Linear(512*2+40,num_emotions)

        ### Softmax layer for the 8 output logits from final FC linear layer
        self.softmax_out = nn.Softmax(dim=1) # dim==1 is the class/logits dimension
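    # Shape summary of one forward pass for a (N, 1, 40, 282) MFCC input:
    #   conv2Dblock1: (N, 1, 40, 282) --> (N, 64, 1, 8) --> flatten --> (N, 512)
    #   conv2Dblock2: (N, 1, 40, 282) --> (N, 64, 1, 8) --> flatten --> (N, 512)
    #   transformer:  (N, 1, 40, 282) --> maxpool --> (N, 40, 70) --> permute --> (70, N, 40)
    #                 --> 4 encoder layers --> (70, N, 40) --> time average --> (N, 40)
    #   concatenate:  (N, 512 + 512 + 40) == (N, 1064) --> fc1_linear --> (N, 8) logits --> softmax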
    # define one complete parallel fwd pass of input feature tensor thru 2*conv+1*transformer blocks
    def forward(self,x):

        ############ 1st parallel Conv2D block: 3 convolutional layers ############################
        # create final feature embedding from 1st convolutional block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding1 = self.conv2Dblock1(x) # x == N/batch * channel * freq * time

        # flatten final 64*1*8 feature map from convolutional layers to a length 512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim=1)

        ############ 2nd parallel Conv2D block: 3 convolutional layers #############################
        # create final feature embedding from 2nd convolutional block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding2 = self.conv2Dblock2(x) # x == N/batch * channel * freq * time

        # flatten final 64*1*8 feature map from convolutional layers to a length 512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim=1)

        ########## 4-encoder-layer Transformer block w/ 40-->512-->40 feedfwd network ##############
        # maxpool input feature map: 1*40*282 w/ 1*4 kernel --> 1*40*70
        x_maxpool = self.transformer_maxpool(x)

        # remove channel dim: 1*40*70 --> 40*70
        x_maxpool_reduced = torch.squeeze(x_maxpool,1)

        # convert maxpooled feature map format: batch * freq * time ---> time * batch * freq
        # because the transformer encoder layer requires a tensor in format: time * batch * embedding (freq)
        x = x_maxpool_reduced.permute(2,0,1)

        # finally, pass reduced input feature map x into transformer encoder layers
        transformer_output = self.transformer_encoder(x)

        # create final feature embedding from transformer layer by taking the mean in the time dimension (now the 0th dim)
        # transformer outputs a 70*40 (time*freq embedding) sequence per sample; average over time
        transformer_embedding = torch.mean(transformer_output, dim=0) # dim 70*40 --> 40

        ############# concatenate freq embeddings from convolutional and transformer blocks ######
        # concatenate embedding tensors output by parallel 2*conv and 1*transformer blocks
        complete_embedding = torch.cat([conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim=1)

        ######### final FC linear layer, need logits for loss #########################
        output_logits = self.fc1_linear(complete_embedding)

        ######### Final Softmax layer: use logits from FC linear, get softmax for prediction ######
        output_softmax = self.softmax_out(output_logits)

        # need output logits to compute cross entropy loss, need softmax probabilities to predict class
        return output_logits, output_softmax
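# --- Illustrative sketch (not part of the original code): one training step ---
# A minimal example showing why forward() returns both tensors: the raw logits feed
# nn.CrossEntropyLoss (which applies log-softmax internally), while the softmax output
# is only used at prediction time. The random batch and the Adam learning rate below
# are stand-in assumptions, not the notebook's actual data or hyperparameters.
def training_step_example():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = parallel_all_you_want(num_emotions=len(emotions_dict)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # assumed learning rate

    X = torch.randn(16, 1, 40, 282, device=device)                   # stand-in MFCC feature maps
    y = torch.randint(0, len(emotions_dict), (16,), device=device)   # stand-in emotion labels 0-7

    output_logits, output_softmax = model(X)
    loss = criterion(output_logits, y)   # the loss consumes the logits
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    predictions = torch.argmax(output_softmax, dim=1)  # class predictions use the softmax output
    accuracy = (predictions == y).float().mean().item()
    return loss.item(), accuracy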
"""## Network tensor analysis

We zero-pad 1 the input feature map to each convolutional layer to get back from the layer a tensor of the same shape as the input: zero-pad 1 adds 2 to each of the (H, W) dims, and the 3x3, stride 1 kernel cuts off (kernel - stride == 2) from each of (H, W).

**Zero-pad 1 --> 3x3 stride 1 kernel effectively throws away the zero pads, giving the same input/output shape from each conv2D layer.**

At the end of the first convolutional layer in each block we have a maxpool kernel of size 2x2, stride 2, which keeps 1 of the 4 pixels in its window. For the input feature map the maxpool kernel progresses 40/2 = 20 times over the rows and 282/2 = 141 times over the columns. **A nonoverlapping maxpool kernel reduces each output dim to input dim / kernel size.** We then expand the output channels to 16, making an output feature map of (16x20x141).

The next two convolutional layers in each block have a maxpool kernel of size 4x4, stride 4. Same math as above: maxpool divides each dim by 4 (rounding down). The 2nd conv layer takes (16x20x141) --> (32x5x35). The 3rd and final conv layer takes (32x5x35) --> (64x1x8).

**Note that in (N,C,H,W) format, for MFCCs H = MFCC coefficient (pitch), W = time step.**

**Complete flow through each convolutional block (C,H,W):**

Layer 1 ---> 1x40x282 --> PAD-1 --> 1x42x284 --> FILTER --> 16x40x282 --> MAXPOOL 2x2 stride 2 --> 16x20x141

Layer 2 ---> 16x20x141 --> PAD-1 --> 16x22x143 --> FILTER --> 32x20x141 --> MAXPOOL 4x4 stride 4 --> 32x5x35

Layer 3 ---> 32x5x35 --> PAD-1 --> 32x7x37 --> FILTER --> 64x5x35 --> MAXPOOL 4x4 stride 4 --> 64x1x8

Flatten ---> 64x1x8 --> Final convolutional embedding: length 512 1D array

**Complete flow through the transformer encoder block (C,H,W):**

Maxpool 1x4 stride 1x4 ---> 1x40x282 --> 1x40x70

Drop channel ---> 1x40x70 --> 40x70 (H,W)

Change dims ---> 40x70 --> 70x40 (W,H)

4*Transformer encoder ---> 70x40 --> 70x40 (W,H, shape preserved)

Time average ---> 70x40 --> 40 --> Final transformer embedding: length 40 1D array

**FC linear network:**

Concatenate ---> 512 + 512 + 40 --> 1064

FC linear layer ---> 1064 --> Final logits output: length 8 1D array

Softmax layer: 8 logits ---> 1 predicted emotion (max probability class)

We can confirm our network's tensor shapes and flow using the excellent torchsummary package, which provides a PyTorch implementation of Keras' model.summary method:
"""

from torchsummary import summary

# need device to instantiate model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# instantiate model for 8 emotions and move it to the available device
model = parallel_all_you_want(len(emotions_dict)).to(device)

# run a dummy batch through the model to check the forward pass
data = torch.randint(255, size=(2, 1, 40, 282)).float().to(device)
print("data: ", data.type())
model(data)

# include input feature map dims in call to summary()
summary(model, input_size=(1,40,282))
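# --- Illustrative sketch (not part of the original code): single-file prediction ---
# A minimal example of end-to-end inference, assuming a trained model (the freshly
# instantiated model above would give random predictions) and a hypothetical path to
# a RAVDESS .wav clip. It reuses get_waveforms, feature_mfcc, and emotions_dict from above.
def predict_emotion_example(model, file='path/to/ravdess_clip.wav'):
    model.eval()
    # waveform --> 40 x T MFCC matrix --> (1, 1, 40, T) tensor for the (N, C, H, W) input format
    waveform = get_waveforms(file)
    mfccs = feature_mfcc(waveform, sample_rate)
    X = torch.from_numpy(mfccs).float().unsqueeze(0).unsqueeze(0).to(device)
    with torch.no_grad():
        _, output_softmax = model(X)
    # highest-probability class --> emotion name
    predicted_class = torch.argmax(output_softmax, dim=1).item()
    return emotions_dict[str(predicted_class)]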