# -*- coding: utf-8 -*-

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import librosa
import os, glob
import warnings; warnings.filterwarnings('ignore')  # matplotlib complains about librosa

# RAVDESS native sample rate is 48k
sample_rate = 48000

# Mel spectrograms are not directly used as a feature in this model
# They are an intermediate step in computing MFCCs, a more compact representation of the spectral envelope
# MFCCs worked better here - the mel spectrogram function is left in case anyone wants to experiment
def feature_melspectrogram(
    waveform,
    sample_rate,
    fft = 1024,
    winlen = 512,
    window='hamming',
    hop=256,
    mels=128,
    ):

    # Produce the mel spectrogram for all STFT frames
    # fmax is set to the Nyquist frequency (sample_rate/2)
    melspectrogram = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_fft=fft,
        win_length=winlen,
        window=window,
        hop_length=hop,
        n_mels=mels,
        fmax=sample_rate/2)

    # convert from power (amplitude**2) to decibels
    # necessary for the network to learn - it doesn't converge with raw power spectrograms
    melspectrogram = librosa.power_to_db(melspectrogram, ref=np.max)

    return melspectrogram
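
# Optional sanity check (a sketch, not part of the model's pipeline): compute the
# mel spectrogram of a synthetic 3 s chirp just to confirm the output shape.
# At 48 kHz with hop=256, a 3 s signal gives 1 + 144000 // 256 = 563 frames,
# so we expect a (128, 563) matrix.
_demo_wave = librosa.chirp(fmin=100, fmax=8000, sr=sample_rate, duration=3)
print('mel spectrogram shape:', feature_melspectrogram(_demo_wave, sample_rate).shape)  # expected: (128, 563)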

def feature_mfcc(
    waveform,
    sample_rate,
    n_mfcc = 40,
    fft = 1024,
    winlen = 512,
    window='hamming',
    #hop=256, # increases the number of time steps; was not helpful
    mels=128
    ):

    # Compute the MFCCs for all STFT frames
    # n_mfcc=40 coefficients are computed from 128 mel filterbanks (mels)
    mfc_coefficients=librosa.feature.mfcc(
        y=waveform,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        n_fft=fft,
        win_length=winlen,
        window=window,
        #hop_length=hop,
        n_mels=mels,
        fmax=sample_rate/2
        )

    return mfc_coefficients
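
# Quick sanity check (a sketch, not part of the pipeline): a 3 s waveform at 48 kHz
# with librosa's default hop_length of 512 should yield a (40, 1 + 144000 // 512)
# = (40, 282) MFCC matrix - the input shape the model below expects.
_dummy_wave = np.zeros(int(sample_rate * 3))
print('MFCC shape:', feature_mfcc(_dummy_wave, sample_rate).shape)  # expected: (40, 282)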

def get_features(waveforms, features, samplerate):

    # initialize counter to track progress
    file_count = 0

    # process each waveform individually to get its MFCCs
    for waveform in waveforms:
        mfccs = feature_mfcc(waveform, samplerate)
        features.append(mfccs)
        file_count += 1
        # print progress
        print('\r'+f' Processed {file_count}/{len(waveforms)} waveforms',end='')

    # return all features from the list of waveforms
    return features

def get_waveforms(file):

    # load an individual sample audio file
    # read the full 3 seconds of the file, cutting off the first 0.5s of silence; native sample rate = 48k
    # we don't need to store the sample rate that librosa.load returns
    waveform, _ = librosa.load(file, duration=3, offset=0.5, sr=sample_rate)

    # make sure waveform vectors are homogeneous in length by zero-padding out to exactly 3 seconds
    waveform_homo = np.zeros(int(sample_rate * 3))
    waveform_homo[:len(waveform)] = waveform

    # return a single file's waveform
    return waveform_homo
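
# Sketch of how these helpers could be wired together (the actual data loading
# loop lives outside this section; the directory layout below is an assumption
# about where the RAVDESS .wav files sit locally - adjust the glob to yours):
_ravdess_files = glob.glob(os.path.join('data', 'ravdess', 'Actor_*', '*.wav'))  # hypothetical path
if _ravdess_files:
    waveforms, features = [], []
    for file in _ravdess_files:
        waveforms.append(get_waveforms(file))
    features = get_features(waveforms, features, sample_rate)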

# RAVDESS dataset emotions
# shift emotion codes to be 0-indexed for PyTorch
emotions_dict = {
    '0':'surprised',
    '1':'neutral',
    '2':'calm',
    '3':'happy',
    '4':'sad',
    '5':'angry',
    '6':'fearful',
    '7':'disgust'
}

# Additional attributes from RAVDESS to play with
emotion_attributes = {
    '01': 'normal',
    '02': 'strong'
}
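
# Sketch of one way the 0-indexed labels above could be derived from RAVDESS
# file names (an assumption for illustration - the actual label extraction
# happens elsewhere in the notebook). RAVDESS names look like
# '03-01-06-01-02-01-12.wav', where the 3rd field is the emotion code (01-08)
# and the 4th is the intensity (01 normal, 02 strong).
def parse_ravdess_filename(path):
    fields = os.path.basename(path).split('.')[0].split('-')
    emotion = int(fields[2]) % 8               # 08 'surprised' wraps to 0, matching emotions_dict
    intensity = emotion_attributes[fields[3]]  # '01' -> 'normal', '02' -> 'strong'
    return emotion, intensity

print(parse_ravdess_filename('03-01-06-01-02-01-12.wav'))  # (6, 'normal') -> 'fearful'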

class parallel_all_you_want(nn.Module):
    # Define all layers present in the network
    def __init__(self,num_emotions):
        super().__init__()

        ################ TRANSFORMER BLOCK #############################
        # maxpool the input feature map/tensor to the transformer
        # a rectangular kernel worked better here for the rectangular input spectrogram feature map/tensor
        self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1,4], stride=[1,4])

        # define single transformer encoder layer
        # self-attention + feedforward network from "Attention is All You Need" paper
        # 4 attention heads per layer, each layer with a 40-->512-->40 feedforward network
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=40, # input feature (frequency) dim after maxpooling 40*282 -> 40*70 (freq*time)
            nhead=4, # 4 attention heads in each multi-head self-attention layer in each encoder block
            dim_feedforward=512, # 2 linear layers in each encoder block's feedforward network: dim 40-->512-->40
            dropout=0.4,
            activation='relu' # ReLU: avoid saturation/tame gradients/reduce compute time
        )

        # I'm using 4 instead of the 6 identical stacked encoder layers used in the Attention is All You Need paper
        # Complete transformer block contains 4 full transformer encoder layers (each w/ multihead self-attention+feedforward)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)

        ############### 1ST PARALLEL 2D CONVOLUTION BLOCK ############
        # 3 sequential conv2D layers: (1,40,282) --> (16,20,141) --> (32,5,35) --> (64,1,8)
        self.conv2Dblock1 = nn.Sequential(

            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels=1, # input volume depth == input channel dim == 1
                out_channels=16, # expand output feature map volume's depth to 16
                kernel_size=3, # typical 3*3 stride 1 kernel
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(), # feature map --> activation map
            nn.MaxPool2d(kernel_size=2, stride=2), # typical maxpool kernel size
            nn.Dropout(p=0.3), # randomly zero 30% of 1st layer's output feature map in training

            # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
            nn.Conv2d(
                in_channels=16,
                out_channels=32, # expand output feature map volume's depth to 32
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4), # increase maxpool kernel for subsequent filters
            nn.Dropout(p=0.3),

            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels=32,
                out_channels=64, # expand output feature map volume's depth to 64
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
        )

        ############### 2ND PARALLEL 2D CONVOLUTION BLOCK ############
        # 3 sequential conv2D layers: (1,40,282) --> (16,20,141) --> (32,5,35) --> (64,1,8)
        self.conv2Dblock2 = nn.Sequential(

            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels=1, # input volume depth == input channel dim == 1
                out_channels=16, # expand output feature map volume's depth to 16
                kernel_size=3, # typical 3*3 stride 1 kernel
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(16), # batch normalize the output feature map before activation
            nn.ReLU(), # feature map --> activation map
            nn.MaxPool2d(kernel_size=2, stride=2), # typical maxpool kernel size
            nn.Dropout(p=0.3), # randomly zero 30% of 1st layer's output feature map in training

            # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
            nn.Conv2d(
                in_channels=16,
                out_channels=32, # expand output feature map volume's depth to 32
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4), # increase maxpool kernel for subsequent filters
            nn.Dropout(p=0.3),

            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels=32,
                out_channels=64, # expand output feature map volume's depth to 64
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
        )

        ################# FINAL LINEAR BLOCK ####################
        # Linear layer to take the final concatenated embedding tensor
        # from the parallel 2D convolutional and transformer blocks and output 8 logits
        # Each full convolution block outputs a (64*1*8) embedding flattened to a length-512 1D array
        # The full transformer block outputs a 40*70 feature map, which we time-average to a length-40 1D array
        # 512*2+40 == 1064 input features --> 8 output emotions
        self.fc1_linear = nn.Linear(512*2+40,num_emotions)

        ### Softmax layer for the 8 output logits from the final FC linear layer
        self.softmax_out = nn.Softmax(dim=1) # dim==1 is the emotion/class dimension of the (N, 8) logits

    # define one complete parallel fwd pass of the input feature tensor through the 2*conv + 1*transformer blocks
    def forward(self,x):

        ############ 1st parallel Conv2D block: 3 convolutional layers ############################
        # create final feature embedding from 1st convolutional block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding1 = self.conv2Dblock1(x) # x == N/batch * channel * freq * time

        # flatten final 64*1*8 feature map from convolutional layers to a length-512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim=1)

        ############ 2nd parallel Conv2D block: 3 convolutional layers #############################
        # create final feature embedding from 2nd convolutional block
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding2 = self.conv2Dblock2(x) # x == N/batch * channel * freq * time

        # flatten final 64*1*8 feature map from convolutional layers to a length-512 1D array
        # skip the 1st (N/batch) dimension when flattening
        conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim=1)

        ########## 4-encoder-layer Transformer block w/ 40-->512-->40 feedfwd network ##############
        # maxpool input feature map: 1*40*282 w/ 1*4 kernel --> 1*40*70
        x_maxpool = self.transformer_maxpool(x)

        # remove channel dim: 1*40*70 --> 40*70
        x_maxpool_reduced = torch.squeeze(x_maxpool,1)

        # convert maxpooled feature map format: batch * freq * time ---> time * batch * freq
        # because the transformer encoder layer requires a tensor in format: time * batch * embedding (freq)
        x = x_maxpool_reduced.permute(2,0,1)

        # finally, pass the reduced input feature map x through the transformer encoder layers
        transformer_output = self.transformer_encoder(x)

        # create final feature embedding from the transformer block by taking the mean in the time dimension (now the 0th dim)
        # the transformer outputs a time * batch * freq (70 * N * 40) tensor; take the mean over time, i.e. the time average
        transformer_embedding = torch.mean(transformer_output, dim=0) # (70, N, 40) --> (N, 40)

        ############# concatenate freq embeddings from convolutional and transformer blocks ######
        # concatenate embedding tensors output by the parallel 2*conv and 1*transformer blocks
        complete_embedding = torch.cat([conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim=1)

        ######### final FC linear layer, need logits for loss #########################
        output_logits = self.fc1_linear(complete_embedding)

        ######### Final Softmax layer: use logits from FC linear, get softmax for prediction ######
        output_softmax = self.softmax_out(output_logits)

        # need output logits to compute cross entropy loss, need softmax probabilities to predict class
        return output_logits, output_softmax
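
# Minimal sketch of how the two outputs could be used in a training step (an
# assumption for illustration - the real training loop and optimizer settings
# live elsewhere in the notebook). nn.CrossEntropyLoss applies its own
# log-softmax, so it takes the raw logits; the softmax output is only used to
# pick the predicted class.
def train_step_sketch(model, criterion, optimizer, X_batch, y_batch):
    # X_batch: (N, 1, 40, 282) float tensor of MFCC feature maps
    # y_batch: (N,) long tensor of 0-7 emotion labels
    output_logits, output_softmax = model(X_batch)
    loss = criterion(output_logits, y_batch)            # cross entropy on the logits
    predictions = torch.argmax(output_softmax, dim=1)   # predicted emotion per sample
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item(), predictions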

"""## Network Tensor Analysis

We zero-pad the input feature map to each convolutional layer by 1 so that the layer returns a tensor of the same shape we fed it: padding of 1 adds 2 to each of the (H, W) dims, and the 3x3, stride-1 kernel trims (kernel - stride == 2) from each of (H, W). **Zero-pad 1 --> 3x3 stride 1 kernel effectively consumes the zero pads, giving the same input/output shape for each conv2D layer.**

At the end of the first convolutional layer in each block we have a maxpool kernel of size 2x2, stride 2, which keeps 1 of the 4 pixels in its window. For the input feature map the maxpool kernel steps 40/2 = 20 times over the rows and 282/2 = 141 times over the columns. **A non-overlapping maxpool kernel reduces each output dim to input dim / kernel size.** We also expand the output channels to 16, giving an output feature map of (16x20x141).

The next two convolutional layers in each block use a maxpool kernel of size 4x4, stride 4. By the same math, maxpool reduces each dim by a factor of 4 (flooring any remainder). The 2nd conv layer takes (16x20x141) --> (32x5x35). The 3rd and final conv layer takes (32x5x35) --> (64x1x8).

**Note that in (N,C,H,W) format, for MFCCs H = MFCC coefficient (pitch) and W = time step.**

**Complete flow through each convolutional block (C,H,W):**

Layer 1 ---> 1x40x282 --> PAD-1 --> 1x42x284 --> FILTER --> 16x40x282 --> MAXPOOL 2x2 stride 2 --> 16x20x141

Layer 2 ---> 16x20x141 --> PAD-1 --> 16x22x143 --> FILTER --> 32x20x141 --> MAXPOOL 4x4 stride 4 --> 32x5x35

Layer 3 ---> 32x5x35 --> PAD-1 --> 32x7x37 --> FILTER --> 64x5x35 --> MAXPOOL 4x4 stride 4 --> 64x1x8

Flatten ---> 64x1x8 --> Final convolutional embedding: length-512 1D array

**Complete flow through the transformer encoder block (C,H,W):**

Maxpool 1x4 stride 1x4 ---> 1x40x282 --> 1x40x70

Drop channel ---> 1x40x70 --> 40x70 (H,W)

Change dims ---> 40x70 --> 70x40 (W,H)

4*Transformer encoder ---> 70x40 --> 70x40 (W,H)

Time average ---> 70x40 --> 1x40 --> Final transformer embedding: length-40 1D array

**FC Linear network (C,H,W):**

Concatenate ---> 512 + 512 + 40 --> 1064

FC Linear layer ---> 1064 --> Final linear logits output: length-8 1D array

Softmax layer: 8 ----> 1 predicted emotion / max probability class

We can confirm our network's tensor shapes and flow using the excellent torchsummary package, which provides a PyTorch implementation of Keras' model.summary method:
"""

from torchsummary import summary

# need device to instantiate model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# instantiate model for 8 emotions and move it to the available device for the summary
model = parallel_all_you_want(len(emotions_dict)).to(device)

# run a random dummy batch of 2 MFCC feature maps through the model to test the forward pass
data = torch.randint(255, size=(2, 1, 40, 282)).float().to(device)
print("data: ", data.type())
model(data)

# include input feature map dims in call to summary()
summary(model, input_size=(1,40,282))
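
# Rough manual cross-check of the two parallel branches' output shapes
# (a sketch; it reuses the dummy `data` batch defined above):
with torch.no_grad():
    _conv_out = model.conv2Dblock1(data)                                          # expect (2, 64, 1, 8)
    _trans_in = torch.squeeze(model.transformer_maxpool(data), 1).permute(2, 0, 1)
    _trans_out = model.transformer_encoder(_trans_in)                             # expect (70, 2, 40)
print('conv block output:', _conv_out.shape)
print('transformer output:', _trans_out.shape)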