# PMEmo: A Dataset for Music Emotion Recognition
Hui Zhang, Kejun Zhang, Yehang Yin, BaiXi Xing, Lingyun Sun, Shouqian Sun

## Baselines in Static Emotion Recognition
This notebook evaluates: 
* standard regressors from scikit-learn on the static audio features.
* standard regressors from scikit-learn on the lyric features.
* standard regressors from scikit-learn on the static EDA features.
* multimodal emotion recognition based on fusion features.

In [17]:
import pandas as pd
import os
import numpy as np
from math import sqrt

from sklearn.linear_model import Lasso, ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD

### Loading Data

In [4]:
# Directory (relative to the working directory) that the load_static_* helpers
# below read 'static_features.csv' and 'static_annotations.csv' from.
DATASET_DIR = 'dataset'

def load_static_features():
    """Read the static audio feature table.

    Returns:
        pandas.DataFrame: contents of ``static_features.csv`` located in
        ``DATASET_DIR``, with the first CSV column used as the index.
    """
    csv_path = os.path.join(DATASET_DIR, 'static_features.csv')
    return pd.read_csv(csv_path, index_col=0)

def load_static_features_and_valence():
    """Load the static features with the mean valence annotation appended.

    Returns:
        pandas.DataFrame: the static feature table joined (on its index) with
        the 'Valence(mean)' column of ``static_annotations.csv``; rows that
        contain any missing value are dropped.
    """
    feature_table = load_static_features()
    annotations_path = os.path.join(DATASET_DIR, 'static_annotations.csv')
    # Only the id column and the valence label are read from the annotation file.
    valence_labels = pd.read_csv(
        annotations_path,
        index_col=0,
        usecols=['musicId', 'Valence(mean)'],
    )
    labelled = feature_table.join(valence_labels)
    return labelled.dropna()

def load_static_features_and_arousal():
    """Load the static features with the mean arousal annotation appended.

    Returns:
        pandas.DataFrame: the static feature table joined (on its index) with
        the 'Arousal(mean)' column of ``static_annotations.csv``; rows that
        contain any missing value are dropped.
    """
    feature_table = load_static_features()
    annotations_path = os.path.join(DATASET_DIR, 'static_annotations.csv')
    # Only the id column and the arousal label are read from the annotation file.
    arousal_labels = pd.read_csv(
        annotations_path,
        index_col=0,
        usecols=['musicId', 'Arousal(mean)'],
    )
    labelled = feature_table.join(arousal_labels)
    return labelled.dropna()

def load_audio_dataset(data):
    """Split a feature/label table into two arrays.

    Args:
        data: DataFrame whose last column holds the label and whose other
            columns hold the features.

    Returns:
        tuple: ``(features, labels)`` taken from ``.values`` of the feature
        columns and of the last column respectively.
    """
    column_names = data.columns
    feature_block = data[column_names[:-1]]
    label_series = data[column_names[-1]]
    # The features are returned unscaled; no StandardScaler is applied here.
    return feature_block.values, label_series.values

### Metric and Multiple Regressors

In [5]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return sqrt(mse)


# Seed shared by the estimators that draw random numbers while fitting, so
# that re-running the notebook reproduces the same scores.
RANDOM_STATE = 0

# Display name -> unfitted estimator. Every entry is evaluated by
# cross_val_regression; the names become the column headers of the score table.
regressors = {
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Ridge': Ridge(),
    'kNN': KNeighborsRegressor(),
    'SVRrbf': SVR(kernel='rbf', gamma='scale'),
    'SVRpoly': SVR(kernel='poly', gamma='scale'),
    'SVRlinear': SVR(kernel='linear', gamma='scale'),
    # Tree-based models are randomised during fitting; pin the seed.
    'DT': DecisionTreeRegressor(max_depth=5, random_state=RANDOM_STATE),
    'RF': RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1,
                                random_state=RANDOM_STATE),
    # Disabled baselines, kept for reference:
#     'MLP': MLPRegressor(hidden_layer_sizes=(200,50), max_iter=2000),
#     'AdaBoost': AdaBoostRegressor(n_estimators=10),
}

In [6]:
from tqdm import tqdm_notebook
import IPython.display as ipd

def cross_val_regression(regressors, features, labels, preprocessfunc):
    """Score every regressor with 10-fold cross-validation.

    Each regressor is placed at the end of a pipeline that starts with the
    given preprocessing steps, and the pipeline is passed to
    ``cross_validate`` with ``cv=10`` and the ``rmse`` scorer.

    Args:
        regressors: mapping of display name -> unfitted estimator.
        features: feature data handed to ``cross_validate``.
        labels: regression targets handed to ``cross_validate``.
        preprocessfunc: iterable of transformers that are unpacked in front
            of the regressor in the pipeline.

    Returns:
        pandas.DataFrame: a float table with a single 'RMSE' row. It has one
        column per regressor (test RMSE averaged over the folds), followed by
        'Mean' and 'std', which summarise the regressor columns of that row.
    """
    # The scorer does not depend on the regressor, so build it once.
    scorer = {'rmse': make_scorer(rmse)}

    fold_means = {}
    for reg_name, reg in tqdm_notebook(regressors.items(), desc='regressors'):
        model = make_pipeline(*preprocessfunc, reg)
        cv_result = cross_validate(model, features, labels, scoring=scorer,
                                   cv=10, return_train_score=False)
        fold_means[reg_name] = cv_result['test_rmse'].mean()

    # Build the table in one go with an explicit float dtype instead of
    # filling an (object-dtype) empty frame cell by cell.
    scores = pd.DataFrame([fold_means], index=['RMSE'],
                          columns=list(regressors.keys()), dtype=float)

    # Compute both summaries before adding either column, so that 'std' is
    # not influenced by the 'Mean' column.
    mean_rmse = scores.mean(axis=1)
    std_rmse = scores.std(axis=1)

    scores['Mean'] = mean_rmse
    scores['std'] = std_rmse
    return scores

def format_scores(scores):
    def highlight(s):
        is_min = s == min(s)
#         is_max = s == max(s)
#         is_max_or_min = (is_min | is_max)
        return ['background-color: yellow' if v else '' for v in is_min]
    scores = scores.style.apply(highlight, axis=1, subset=pd.IndexSlice[:, :scores.columns[-2]])
    return scores.format('{:.3f}')

### Multiple Regressors on Audio Features

Evaluating regressors on 6373-dim audio features.

In [5]:
# Preprocessing steps placed in front of every regressor: standardise the features.
prefunc = [StandardScaler()]

# --- Arousal: audio features + 'Arousal(mean)' label ---
print('In Arousal dimension...')
data_a = load_static_features_and_arousal()
features_a, labels_a = load_audio_dataset(data_a)

scores_a_a = cross_val_regression(regressors, features_a, labels_a, prefunc)
ipd.display(format_scores(scores_a_a))

# --- Valence: audio features + 'Valence(mean)' label ---
print('In Valence dimension...')
data_v = load_static_features_and_valence()
features_v, labels_v = load_audio_dataset(data_v)

scores_a_v = cross_val_regression(regressors, features_v, labels_v, prefunc)
ipd.display(format_scores(scores_a_v))

In Arousal dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.184,0.184,0.14,0.136,0.119,0.227,0.11,0.129,0.154,0.154,0.038


In Valence dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.162,0.162,0.163,0.135,0.121,0.225,0.122,0.141,0.14,0.152,0.032


### Multiple Regressors on Lyric Features

Evaluating regressors on lyric features extracted using Bag-of-Words representation:

* tokenizing, with stop-word removal and stemming.
* counting the occurrences of tokens in each document.
* normalizing and weighting using tf-idf (term-frequency inverse-document-frequency) function.

In [15]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer additionally stems every token."""

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems its tokens.

        The returned callable takes one document and returns the list of
        tokens from the parent analyzer, each passed through an English
        Snowball stemmer created with ``ignore_stopwords=True``.
        """
        base_analyzer = super().build_analyzer()
        english_stemmer = SnowballStemmer("english", ignore_stopwords=True)

        def stemmed_analyzer(doc):
            return [english_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

In [11]:
# The lyric table is read from the working directory (not from DATASET_DIR);
# its first CSV column becomes the index.
lyric_dataset = pd.read_csv('lrc_dataset.csv', index_col=0)
# Raw lyric text, one document per row; vectorised inside the pipeline.
X = lyric_dataset['lrc_text']
# tf_idf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
# sel = TruncatedSVD(n_components=5000)
stemmed_tf_idf = StemmedTfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
# The vectoriser is the only preprocessing step in front of each regressor.
prefunc = [stemmed_tf_idf]

print('In Arousal dimension...')
y_a = lyric_dataset['Arousal(mean)']
scores_l_a = cross_val_regression(regressors, X, y_a, prefunc)
ipd.display(format_scores(scores_l_a))

print('In Valence dimension...')
y_v = lyric_dataset['Valence(mean)']
scores_l_v = cross_val_regression(regressors, X, y_v, prefunc)
ipd.display(format_scores(scores_l_v))

In Arousal dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.183,0.183,0.17,0.18,0.176,0.183,0.173,0.188,0.182,0.18,0.006


In Valence dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.16,0.16,0.132,0.145,0.146,0.16,0.135,0.146,0.159,0.149,0.011


### Multiple Regressors on EDA Features

Evaluating regressors on EDA features.

In [42]:
# The EDA table is read from the working directory (not from DATASET_DIR).
eda_dataset = pd.read_csv('eda_dataset.csv')
# Collapse to one row per musicId by averaging all other columns.
# NOTE(review): presumably there are several rows (one per subject) per song - confirm.
eda_dataset = eda_dataset.groupby(by=['musicId'], as_index=False).mean()

In [9]:
# Positional selection: skip the first two and the last two columns.
# NOTE(review): presumably these are musicId/subjectId and the two
# annotation columns, which the fusion cells drop by name - confirm.
eda_features = eda_dataset[eda_dataset.columns[2:-2]].astype(float)
# Standardise the features in front of every regressor.
prefunc = [StandardScaler()]

print('In Arousal dimension...')
scores_eda_a = cross_val_regression(regressors, eda_features, eda_dataset['Arousal(mean)'], prefunc)
ipd.display(format_scores(scores_eda_a))

print('In Valence dimension...')
scores_eda_v = cross_val_regression(regressors, eda_features, eda_dataset['Valence(mean)'], prefunc)
ipd.display(format_scores(scores_eda_v))

In Arousal dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.184,0.184,0.195,0.197,0.2,0.205,0.202,0.203,0.185,0.195,0.008


In Valence dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.162,0.162,0.171,0.173,0.175,0.177,0.174,0.175,0.163,0.17,0.006


### Multimodal Emotion Recognition Based on Fusion Features

Evaluating multimodal fusion methods using early-fusion-by-feature-concatenation (EFFC), which means concatenating the audio and text features into a single feature vector and training a single regression model.

####  Audio + Lyric Fusion

In [19]:
# Lyric table (text + annotations) indexed by its first CSV column, joined
# with the static audio features on the index.
lyric_dataset = pd.read_csv('lrc_dataset.csv', index_col=0)
audio_features = load_static_features()
# DataFrame.join keeps every lyric row; unlike the other fusion cells no
# dropna() is applied here.
fusion_dataset = lyric_dataset.join(audio_features)
# fusion_dataset['lrc_text'] = fusion_dataset['lrc_text'].astype(str)

# Everything except the two annotation columns is used as model input.
# NOTE: fusion_dataset / fusion_features / arousal / valence are re-assigned
# by the other fusion sections of this notebook.
fusion_features = fusion_dataset.drop(columns=['Arousal(mean)', 'Valence(mean)'])
arousal = fusion_dataset['Arousal(mean)']
valence = fusion_dataset['Valence(mean)']

In [23]:
# Per-modality preprocessing: stemmed tf-idf on the lyric text column and
# standardisation on the columns at positions 1..6373.
# NOTE(review): the slice assumes 'lrc_text' is column 0 and is followed by
# the 6373 audio features - confirm against the CSV schemas.
ct = ColumnTransformer(
    [('stemmedtf-idf', StemmedTfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english'), 'lrc_text'),
     ("norm2", StandardScaler(), slice(1, 6374))])

# scores_f_a / scores_f_v are re-used (overwritten) by the other fusion sections.
print('In Arousal dimension...')
scores_f_a = cross_val_regression(regressors, fusion_features, arousal, [ct])
ipd.display(format_scores(scores_f_a))

print('In Valence dimension...')
scores_f_v = cross_val_regression(regressors, fusion_features, valence, [ct])
ipd.display(format_scores(scores_f_v))

In Arousal dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.183,0.183,0.135,0.136,0.117,0.396,0.105,0.138,0.162,0.173,0.088


In Valence dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.16,0.16,0.159,0.13,0.12,0.197,0.121,0.154,0.143,0.149,0.024


####  Audio + EDA Fusion

In [7]:
# EDA table averaged to one row per musicId, and the static audio features.
eda_dataset = pd.read_csv('eda_dataset.csv').groupby(by=['musicId'], as_index=False).mean()
audio_features = load_static_features()

# Merge the two modalities on musicId and drop rows with any missing value.
fusion_dataset = pd.merge(eda_dataset, audio_features, on=['musicId']).dropna()
# fusion_dataset['lrc_text'] = fusion_dataset['lrc_text'].astype(str)
# Remove the id columns and the two annotation columns; the rest is model input.
fusion_features = fusion_dataset.drop(columns=['musicId', 'subjectId', 'Arousal(mean)', 'Valence(mean)'])
arousal = fusion_dataset['Arousal(mean)']
valence = fusion_dataset['Valence(mean)']

In [8]:
# All fused columns are numeric here, so a single StandardScaler is the only
# preprocessing step in front of every regressor.
prefunc = [StandardScaler()]

# scores_f_a / scores_f_v are re-used (overwritten) by the other fusion sections.
print('In Arousal dimension...')
scores_f_a = cross_val_regression(regressors, fusion_features, arousal, prefunc)
ipd.display(format_scores(scores_f_a))

print('In Valence dimension...')
scores_f_v = cross_val_regression(regressors, fusion_features, valence, prefunc)
ipd.display(format_scores(scores_f_v))

In Arousal dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.184,0.184,0.139,0.136,0.119,0.23,0.107,0.13,0.157,0.154,0.039


In Valence dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.162,0.162,0.173,0.135,0.121,0.197,0.121,0.14,0.142,0.15,0.025


####  EDA + Lyric  Fusion

In [19]:
# EDA table averaged to one row per musicId.
eda_dataset = pd.read_csv('eda_dataset.csv').groupby(by=['musicId'], as_index=False).mean()
# Lyric table without its annotation columns; the labels are taken from the
# EDA table after the merge.
lyric_text = pd.read_csv('lrc_dataset.csv').drop(columns=['Arousal(mean)', 'Valence(mean)'])

# Merge the two modalities on musicId and drop rows with any missing value.
fusion_dataset = pd.merge(eda_dataset, lyric_text, on=['musicId']).dropna()
# fusion_dataset['lrc_text'] = fusion_dataset['lrc_text'].astype(str)
# Remove the id columns and the two annotation columns; the rest is model input.
fusion_features = fusion_dataset.drop(columns=['musicId', 'subjectId', 'Arousal(mean)', 'Valence(mean)'])
arousal = fusion_dataset['Arousal(mean)']
valence = fusion_dataset['Valence(mean)']

In [29]:
# Per-modality preprocessing: stemmed tf-idf on the lyric text column and
# standardisation on the columns at positions 0..127.
# NOTE(review): the slice assumes the first 128 columns of fusion_features
# are the EDA features - confirm against the CSV schema.
ct = ColumnTransformer(
    [('stemmedtf-idf', StemmedTfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english'), 'lrc_text'),
     ("norm2", StandardScaler(), slice(0, 128))])

# scores_f_a / scores_f_v are re-used (overwritten) by the other fusion sections.
print('In Arousal dimension...')
scores_f_a = cross_val_regression(regressors, fusion_features, arousal, [ct])
ipd.display(format_scores(scores_f_a))

print('In Valence dimension...')
scores_f_v = cross_val_regression(regressors, fusion_features, valence, [ct])
ipd.display(format_scores(scores_f_v))

In Arousal dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.184,0.184,0.183,0.193,0.185,0.183,0.188,0.194,0.183,0.186,0.004


In Valence dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.16,0.16,0.146,0.169,0.161,0.159,0.149,0.156,0.159,0.158,0.007


####  Audio + EDA + Lyric  Fusion

In [21]:
# EDA table averaged to one row per musicId, lyric table without its
# annotation columns, and the static audio features.
eda_dataset = pd.read_csv('eda_dataset.csv').groupby(by=['musicId'], as_index=False).mean()
lyric_text = pd.read_csv('lrc_dataset.csv').drop(columns=['Arousal(mean)', 'Valence(mean)'])
audio_features = load_static_features()

# Merge order matters for the positional slice used in the next cell:
# EDA columns first, then audio columns, then the lyric columns.
fusion_dataset = pd.merge(eda_dataset, audio_features, on=['musicId'])
fusion_dataset = pd.merge(fusion_dataset, lyric_text, on=['musicId']).dropna()
# fusion_dataset['lrc_text'] = fusion_dataset['lrc_text'].astype(str)
# Remove the id columns and the two annotation columns; the rest is model input.
fusion_features = fusion_dataset.drop(columns=['musicId', 'subjectId', 'Arousal(mean)', 'Valence(mean)'])
arousal = fusion_dataset['Arousal(mean)']
valence = fusion_dataset['Valence(mean)']

In [32]:
# Per-modality preprocessing: stemmed tf-idf on the lyric text column and
# standardisation on the first 128+6373 columns.
# NOTE(review): the slice assumes 128 EDA columns followed by the 6373 audio
# features, with 'lrc_text' after them - confirm against the CSV schemas.
ct = ColumnTransformer(
    [('stemmedtf-idf', StemmedTfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english'), 'lrc_text'),
     ("norm2", StandardScaler(), slice(0, 128+6373))])

# scores_f_a / scores_f_v are re-used (overwritten) by the other fusion sections.
print('In Arousal dimension...')
scores_f_a = cross_val_regression(regressors, fusion_features, arousal, [ct])
ipd.display(format_scores(scores_f_a))

print('In Valence dimension...')
scores_f_v = cross_val_regression(regressors, fusion_features, valence, [ct])
ipd.display(format_scores(scores_f_v))

In Arousal dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.184,0.184,0.145,0.136,0.117,0.266,0.109,0.133,0.164,0.16,0.048


In Valence dimension...


HBox(children=(IntProgress(value=0, description='regressors', max=9), HTML(value='')))




Unnamed: 0,Lasso,ElasticNet,Ridge,kNN,SVRrbf,SVRpoly,SVRlinear,DT,RF,Mean,std
RMSE,0.16,0.16,0.148,0.13,0.119,0.366,0.118,0.155,0.145,0.167,0.076
