banner, Spotify Logo

🎶 Spotify Recommendation Playlist Based on the Emotion of a User's Most Recently Played Songs - (DSI Capstone Project)

This project serves as the final capstone project for the MiSK Data Science immersive program.

Background

This project aims to recommend songs to a user based on the mood of their most recently played songs on Spotify. Two datasets are obtained: the user's 50 most recently played tracks, and a set of tracks collected from playlists created by Spotify and other Spotify users.
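As a rough illustration of how the user's half of the data could be collected with the spotipy client (the helper name `get_recent_tracks` and the column names are assumptions, not the project's actual code):

```python
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# Spotify app credentials are assumed to be set via the SPOTIPY_CLIENT_ID,
# SPOTIPY_CLIENT_SECRET and SPOTIPY_REDIRECT_URI environment variables.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope="user-read-recently-played"))

def get_recent_tracks(limit=50):
    """Return the user's most recently played tracks as a DataFrame (illustrative helper)."""
    items = sp.current_user_recently_played(limit=limit)["items"]
    rows = [{
        "track_id": item["track"]["id"],
        "track_name": item["track"]["name"],
        "artist": item["track"]["artists"][0]["name"],
        "played_at": item["played_at"],
    } for item in items]
    return pd.DataFrame(rows)

user_df = get_recent_tracks()
```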

To determine the mood of a song, two variables are observed for each track. The first is the valence of the track from the Spotify API audio features; the other is the lyrics of the track. These two variables are chosen because either one on its own is usually not enough to determine the mood of a song, which can be considered specific to the user, especially in cases where the lyrics do not match the audio mood (valence) of the track. A popular example of such a song is Take a Walk by Passion Pit, an upbeat, happy-sounding song with sad lyrics. On the flip side, a popular sad/mellow song with hopeful/happy lyrics is Don't Panic by Coldplay.

Spotify Audio Features:

Spotify uses a series of different features to classify tracks.
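A minimal sketch of pulling those audio features (including valence) for a batch of tracks, reusing the spotipy client `sp` and `user_df` from the sketch above (the batching and column selection are assumptions):

```python
import pandas as pd

def get_audio_features(sp, track_ids):
    """Fetch Spotify audio features (valence, energy, danceability, ...) for a list of track ids."""
    features = []
    for i in range(0, len(track_ids), 100):  # the endpoint accepts at most 100 ids per call
        features.extend(sp.audio_features(track_ids[i:i + 100]))
    return pd.DataFrame(features)

audio_df = get_audio_features(sp, user_df["track_id"].tolist())
print(audio_df[["valence", "energy", "danceability", "tempo", "loudness"]].head())
```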

Lyrics:

Lyrics are not available through the Spotify API, so multiple methods were used to obtain the lyrics of the tracks.
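One possible way to pull lyrics, sketched with the lyricsgenius wrapper around the Genius API (the token placeholder and the `fetch_lyrics` helper are assumptions; the project also used BeautifulSoup4 and lyrics_extractor as alternative methods):

```python
import lyricsgenius

# A Genius API access token is assumed to be available.
genius = lyricsgenius.Genius("YOUR_GENIUS_TOKEN", timeout=15, retries=3)

def fetch_lyrics(title, artist):
    """Look a track up on Genius and return its lyrics, or None if not found."""
    try:
        song = genius.search_song(title, artist)
        return song.lyrics if song else None
    except Exception:
        return None

user_df["lyrics"] = [fetch_lyrics(t, a)
                     for t, a in zip(user_df["track_name"], user_df["artist"])]
```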

Importing required libraries
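A representative set of imports for this kind of notebook (the exact list in the original notebook may differ):

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import spotipy
from spotipy.oauth2 import SpotifyOAuth

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.impute import KNNImputer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
```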

Loading datasets

The dataset contains 12 columns that we can use for exploratory analysis. Since we have already cleaned the dataset, there are not a lot of observed NaN values. However, we will check to see:
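A quick check along those lines, assuming the user and Spotify tracks are concatenated into a single DataFrame `df`:

```python
# Count missing values per column and show only the columns that have any.
missing = df.isna().sum()
print(missing[missing > 0])

# The same information as a percentage of all rows.
print((missing[missing > 0] / len(df) * 100).round(2))
```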

We can see that there are only two columns with missing values (noting that the first 50 rows are the user's recently listened tracks). played_at is null for the Spotify dataset, as it does not exist there, so we will keep those NaN values. On the other hand, mood is null for the user's songs, as this is information we don't currently have and will be predicting.

Audio feature analysis

By observing the histograms, we can see that the audio features with the least variance between observations are speechiness and instrumentalness. This tells us that the majority of our observations have around the same speechiness and instrumentalness, so these are features we probably won't be able to get a lot of information out of. On the other hand, valence, tempo, energy, and danceability have the most variance between observations, so these are the features we should study to differentiate between the tracks and also find similarities.

We can see that the highest correlations are between energy and loudness and between valence and energy, while there is an anti-correlation between acousticness and energy and between acousticness and loudness. Both of these results are logical: energetic songs tend to be loud and have positive valence, while acoustic tracks are less likely to be loud and energetic.
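These correlations can be visualised with a simple heatmap over the numeric audio features (the column list mirrors the Spotify API field names):

```python
import matplotlib.pyplot as plt
import seaborn as sns

audio_cols = ["danceability", "energy", "loudness", "speechiness", "acousticness",
              "instrumentalness", "liveness", "valence", "tempo"]

plt.figure(figsize=(8, 6))
sns.heatmap(df[audio_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.title("Correlation between audio features")
plt.tight_layout()
plt.show()
```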

If we exclude Various Artists, SuicideboyS is the most frequently occurring artist in our dataset; however, City Morgue has the most energetic songs and appears 25 times in our dataset.

Even though we have a high correlation between loudness and energy, we can see that the artist with the loudest songs is Little Mix, who appear in the dataset 17 times. If we observe our outputs, we can also see that Katy Perry combines energy and loudness the most consistently, appearing second in both.

Here we compare the user's tendency to listen to songs with high energy and danceability against the Spotify dataset. The songs in the Spotify dataset generally have both high energy and high danceability, while the user (red cross) listens to songs that are somewhat higher in energy than in danceability. Overall, the Spotify dataset has tracks with higher danceability and energy, while the user's tracks sit at an intermediate level of both.

Noting that after encoding the mood labels:

True to our previous deduction, speechiness has the least correlation with the mood of a song. We can also note that no single audio feature stands out with the highest correlation with the mood of a song.
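A sketch of that encoding step and of checking each feature's correlation with the mood, reusing `audio_cols` from the heatmap sketch above (the mood labels come from this README, but the integer codes assigned to them are an assumption):

```python
# Map mood labels to integers so they can be used in numeric correlations
# (and later by the KNN imputer). The ordering chosen here is an assumption.
mood_map = {"Angry": 0, "Sad": 1, "Calm": 2, "Happy": 3, "Energy": 4}
df["mood_encoded"] = df["mood"].map(mood_map)

# Correlation of each audio feature with the encoded mood, on labelled rows only.
labelled = df.dropna(subset=["mood_encoded"])
print(labelled[audio_cols + ["mood_encoded"]].corr()["mood_encoded"].sort_values())
```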

NLP on lyrics

Surprisingly, both datasets share the same most common words. This could indicate that the songs the user is listening to are very similar to the data collected from Spotify. Another factor could be that these words are generally among the most common in song lyrics. We can see from the following article that most of the words generated in our wordcloud match.
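The word clouds can be reproduced roughly as follows (a cleaned lyrics column named `lyrics_clean` is assumed):

```python
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Join all cleaned lyrics into one string and build a word cloud from it.
text = " ".join(df["lyrics_clean"].dropna())
wc = WordCloud(width=800, height=400, background_color="white",
               stopwords=STOPWORDS, max_words=100).generate(text)

plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
```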

It's hard to conclude the sentiment of the lyrics overall by looking at their wordclouds. We will assign a sentiment to each song ranging from negative to neutral to positive. To do this we will perform sentiment analysis using popular lexicons.

These sentiments will help us identify the mood better:

We won't be looking at the subjectivity that TextBlob outputs alongside polarity, as it's not relevant to song lyrics.

Polarity is a float that lies in [-1, 1]: -1 indicates negative sentiment and +1 indicates positive sentiment. However, observing the results shows that TextBlob does not always give the expected output, so we'll also use VaderSentiment, which gives a more detailed breakdown, and compare the two.
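A side-by-side sketch of scoring the lyrics with both lexicons (the ±0.05 cut-offs used to bucket the scores into negative/neutral/positive, and the column names, are assumptions):

```python
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def textblob_sentiment(lyrics, neutral_band=0.05):
    """Bucket TextBlob's polarity score in [-1, 1] into a sentiment label."""
    polarity = TextBlob(lyrics).sentiment.polarity
    if polarity > neutral_band:
        return "positive"
    if polarity < -neutral_band:
        return "negative"
    return "neutral"

def vader_sentiment(lyrics, neutral_band=0.05):
    """Bucket VADER's compound score in [-1, 1] into a sentiment label."""
    compound = analyzer.polarity_scores(lyrics)["compound"]
    if compound >= neutral_band:
        return "positive"
    if compound <= -neutral_band:
        return "negative"
    return "neutral"

df["tb_sentiment"] = df["lyrics_clean"].fillna("").apply(textblob_sentiment)
df["vader_sentiment"] = df["lyrics_clean"].fillna("").apply(vader_sentiment)
```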

Vader performs much better at predicting negative sentiment; however, it also overestimates neutral (and sometimes negative) lyrics as positive. After observing and comparing, we see that Vader is better at predicting Energy songs as positive and Angry songs as negative, which are our extremes at both ends, while TextBlob is better at predicting Calm songs as neutral. Both perform about the same for Happy and Sad songs. However, if we take into consideration that the mood does not always match the sentiment of the lyrics, it can sometimes make sense for the mood not to match the sentiment.

For example, Taylor Swift's 'Bad Blood' has a mood of Energy, but the lyrics have a negative sentiment for both Vader and TextBlob. This can be because the lyrics contain words such as 'bad', 'sad', and 'mad', which are recognized as negative sentiment words.

By observing and comparing the two lexicons, we can conclude that Vader performs better than TextBlob except on Calm songs, but that is because TextBlob tends to classify most songs as neutral, so it has a higher chance of classifying Calm songs as neutral.

So we will be looking at both the valence of the song and its sentiment to recommend songs to the user. Note that valence is defined by Spotify as a measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track: tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

So, by looking at both the audio feature and sentiment we can better determine the mood of the user's most recently listened to songs. Using our labelled data from the Spotify dataset, we can better predict the mood of the user's recently played songs.

Predict value of moods for user

We will impute the NaN values in the mood column of the user's dataset by comparing the audio feature values and sentiment of each track with those of similar tracks that are already labelled in the Spotify dataset. We chose imputation instead of a machine learning classifier because imputers are powerful, easy to use, and served our purpose better.

The KNN imputer is a multivariate imputer, which essentially means it can take multiple features into account when imputing the missing values. However, the imputer only works on numeric variables, so we will first map our mood column to integer values as an identifier for each mood. The idea in kNN methods is to identify 'k' samples in the dataset that are similar or close in the feature space, and then use these 'k' samples to estimate the value of the missing data points. Each sample's missing values are imputed using the mean value of the 'k' neighbours found in the dataset. [1]

We notice that some of the imputed values are floats between two integers. This means that the particular track belongs to more than one mood. However, for our application we consider a track to have only one mood, so we will cast the result to int. Now we will decode the moods and give them back their labels.
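A minimal sketch of this imputation step with scikit-learn's KNNImputer, reusing `mood_map` from the encoding sketch above (the feature list, the sentiment encoding, and k=5 are assumptions):

```python
from sklearn.impute import KNNImputer

# Numeric features used to find the k nearest labelled tracks; the lyric sentiment
# is assumed to be encoded numerically (e.g. negative=-1, neutral=0, positive=1).
feature_cols = ["valence", "energy", "danceability", "tempo",
                "sentiment_encoded", "mood_encoded"]

imputer = KNNImputer(n_neighbors=5)
imputed = imputer.fit_transform(df[feature_cols])

# Imputed moods can land between two integer codes; keep a single mood per track
# by truncating to int, then decode back to the original labels.
df["mood_encoded"] = imputed[:, feature_cols.index("mood_encoded")].astype(int)
df["mood"] = df["mood_encoded"].map({v: k for k, v in mood_map.items()})
```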

Just by observing the results, we can conclude that the imputer performed very well. Take the song Rhinestone Eyes by Gorillaz as an example from the user's dataset: its lyrics were identified as positive (although I would personally categorize them as negative), but the audio features ultimately led the imputer to classify it as Angry, which is also where I would personally place this song.

Recommending Songs to the User

Now that we have labelled the 50 most recently listened-to songs, we can observe which mood occurs most often for the user and then recommend similar songs that fall under that mood. We will be using content-based filtering, as this method uses only information about the description and attributes of the items the user has previously consumed to model the user's preferences. In other words, these algorithms try to recommend items that are similar to those the user liked in the past (or is examining in the present). In particular, various candidate items are compared with items previously rated by the user and the best-matching items are recommended. [1]

How do we use a matrix for recommendation? The answer is similarity. We now need to calculate the similarity of one lyric to another, as well as one set of audio features to another. How do we do that? We can use different metrics such as cosine similarity or Euclidean distance. For our song recommendation system, we are going to use cosine similarity, and in particular its implementation from Scikit-learn.
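A compact sketch of that similarity computation, combining a TF-IDF representation of the lyrics with scaled audio features (the feature weights, column names, and the `recommend` helper are assumptions):

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# TF-IDF representation of the lyrics.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
lyrics_matrix = tfidf.fit_transform(df["lyrics_clean"].fillna("")).toarray()

# Scale the audio features so tempo and loudness don't dominate the distances.
audio_matrix = MinMaxScaler().fit_transform(df[["valence", "energy", "danceability", "tempo"]])

# Concatenate both views and compute pairwise cosine similarity between tracks.
combined = np.hstack([lyrics_matrix, audio_matrix])
similarity = cosine_similarity(combined)

def recommend(track_index, n=10):
    """Return the indices of the n tracks most similar to the given track (illustrative helper)."""
    ranked = similarity[track_index].argsort()[::-1]
    return [i for i in ranked if i != track_index][:n]
```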

The recommendations made for each given user input are accurate, apart from a few tracks that do not match the criteria. In general, the model can recommend songs depending on the mood of the user's most recently listened-to artist, track, or the overall mood of all the recently played songs.

Conclusion

By using the Spotify API, we were able to scrape data from the user's recently played songs on Spotify, as well as playlists created by Spotify and other Spotify users. This resulted in our two main datasets. The Spotify API was also used to obtain the audio features of each track, which give us more information on the track and ultimately allow us to predict its mood. Additionally, we scraped lyrics for each track to further help us determine the moods of the tracks; this was done using the Genius API, the BeautifulSoup4 library, and the lyrics_extractor library. After we obtained our data, we did some data cleaning and preparation, which included removing rows with NaN values in the lyrics and normalizing the loudness in the audio features. This stage also included some text pre-processing on the lyrics to make them easier to apply NLP to.

We did some EDA to help us understand our datasets better and be able to move on to the next steps. NLP was performed on the lyrics of the songs to determine the sentiment of each song, which would help in determining its mood. Although the NLP libraries used (TextBlob and Vader) did not perform as well as expected, it was concluded that in some cases it is fine for there to be a disparity between the sentiment of the lyrics and the mood of the song.

With all our data ready and cleaned, there was one step left before we could give the user recommendations. Using KNN imputation, we were able to assign a mood to each of the user's songs. Since KNN imputation is multivariate, we were able to use our audio features and our lyrics sentiment to see which other tracks were closest across all these features and thus predict the mood of the songs. Before resorting to imputation, multiple machine learning models were tried, but none of them performed well enough (none reached an accuracy above 0.5). Some deep learning models were also tried, as seen in this article, but they did not perform any better than the machine learning algorithms. So, in the end, imputation was the best choice for its ease of use, and it also labelled the songs accurately.

Finally, we were able to determine which mood was the most prevalent amongst the user's most recently listened-to songs and recommend some similar songs from our dataset. We also observed their most listened-to artist and recommended some similar artists, and their most listened-to song and recommended some similar songs. The recommender system utilized content-based filtering, as we have the user's personal data and preferences. We used both the TF-IDF vectorizer and the count vectorizer, which are ways to convert a given set of strings into a frequency representation that makes it easier to find similarities. [1] Although it mostly gave similar songs, it also sometimes recommended songs that did not match the criteria.

In conclusion, the project delivers a recommender system that can impute the mood of a user's recently played songs and then recommend songs that sound similar and have a similar mood.