# import tayor_swift_spotify.csv with pandas
import pandas as pd

# read csv file
df = pd.read_csv('taylor_swift_spotify.csv', index_col=0)


df.head()


df.tail()


# how many columns are there?
print('there are', df.shape[1], 'columns')

# the columns are?
print('the columns are:', df.columns.values)

there are 17 columns
the columns are: ['name' 'album' 'release_date' 'track_number' 'id' 'uri' 'acousticness'
 'danceability' 'energy' 'instrumentalness' 'liveness' 'loudness'
 'speechiness' 'tempo' 'valence' 'popularity' 'duration_ms']


# let's create a new dataframe that has only metadata, so the first 5 columns
df_metadata = df.iloc[:, :5]

df_metadata.head()


# let's create a new dataframe that has only song characteristics, so the last 11 columns
df_characteristics = df.iloc[:, 6:]
df_characteristics.head()


# you can also do it like this and creating more sub-df in one line
df_metadata, df_characteristics = df.iloc[:, :5], df.iloc[:, 6:]


# print the unique values of the column 'album'
for album in df_metadata['album'].unique():
    print(album)

1989 (Taylor's Version) [Deluxe]
1989 (Taylor's Version)
Speak Now (Taylor's Version)
Midnights (The Til Dawn Edition)
Midnights (3am Edition)
Midnights
Red (Taylor's Version)
Fearless (Taylor's Version)
evermore (deluxe version)
evermore
folklore: the long pond studio sessions (from the Disney+ special) [deluxe edition]
folklore (deluxe version)
folklore
Lover
reputation
reputation Stadium Tour Surprise Song Playlist
1989 (Deluxe Edition)
1989
Red (Deluxe Edition)
Red
Speak Now World Tour Live
Speak Now (Deluxe Edition)
Speak Now
Fearless Platinum Edition
Fearless
Live From Clear Channel Stripped 2008
Taylor Swift


# sort the dataframe by the column 'album' and year
df_metadata.sort_values(by=['album', 'release_date'], inplace=True)


df_metadata


# print the albums now
for album in df_metadata['album'].unique():
    print(album)

1989
1989 (Deluxe Edition)
1989 (Taylor's Version)
1989 (Taylor's Version) [Deluxe]
Fearless
Fearless (Taylor's Version)
Fearless Platinum Edition
Live From Clear Channel Stripped 2008
Lover
Midnights
Midnights (3am Edition)
Midnights (The Til Dawn Edition)
Red
Red (Deluxe Edition)
Red (Taylor's Version)
Speak Now
Speak Now (Deluxe Edition)
Speak Now (Taylor's Version)
Speak Now World Tour Live
Taylor Swift
evermore
evermore (deluxe version)
folklore
folklore (deluxe version)
folklore: the long pond studio sessions (from the Disney+ special) [deluxe edition]
reputation
reputation Stadium Tour Surprise Song Playlist


# now let's find the unique values of the column albums
# let's write a code that read the list that you have just printed, read the first word of the list and save the last album that starts with that word. 
# Then, the algorith restart from the first album that does not start with that word, and so on.

albums = []

first_word = df_metadata['album'].unique()[0].split()[0]
for i, album in enumerate(df_metadata['album'].unique()):
    # if the album does not start with the first word of the album name save the previous album
    if not album.startswith(first_word):
        albums.append(df_metadata['album'].unique()[i-1])
        # update the first word
        first_word = album.split()[0]


albums

["1989 (Taylor's Version) [Deluxe]",
 'Fearless Platinum Edition',
 'Live From Clear Channel Stripped 2008',
 'Lover',
 'Midnights (The Til Dawn Edition)',
 "Red (Taylor's Version)",
 'Speak Now World Tour Live',
 'Taylor Swift',
 'evermore (deluxe version)',
 'folklore: the long pond studio sessions (from the Disney+ special) [deluxe edition]']


# now let's creat a sub-df that contains songs only from the albums that we have just found
df_metadata_unique_albums = df_metadata[df_metadata['album'].isin(albums)]

df


df_metadata_unique_albums.head(25)


# check if the albums are correct
for album in df_metadata_unique_albums['album'].unique():
    print(album)

1989 (Taylor's Version) [Deluxe]
Fearless Platinum Edition
Live From Clear Channel Stripped 2008
Lover
Midnights (The Til Dawn Edition)
Red (Taylor's Version)
Speak Now World Tour Live
Taylor Swift
evermore (deluxe version)
folklore: the long pond studio sessions (from the Disney+ special) [deluxe edition]


# Let's define energy as the sum of the squares of "acousticness" and "danceability" under square root.
# create a new column called 'energy' that contains the energy of each song, use apply() and lambda function
df_characteristics['popness'] = df_characteristics.apply(lambda row: (row['acousticness']**2 + row['danceability']**2)**0.5, axis=1)

df_characteristics.head()


# another way to do it is to use the .apply() method without lambda function
def popness(row):
    '''
    This function returns the energy of a song
    
    Parameters:
    -----------
    row: pandas.Series
        a row of a pandas dataframe
        
        
    Returns:
    -------
    float
    the energy of a song
                
    '''
    return (row['acousticness']**2 + row['danceability']**2)**0.5


df_characteristics['popness'] = df_characteristics.apply(popness, axis=1)


df_characteristics.head()


# can you move the column 'energy' to the last column of the dataframe?
cols = list(df_characteristics.columns.values)
print(cols)

cols.insert(12, cols.pop(cols.index('energy')))
print(cols)

df_characteristics = df_characteristics[cols]

['acousticness', 'danceability', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'popularity', 'duration_ms', 'popness', 'energy']
['acousticness', 'danceability', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'popularity', 'duration_ms', 'popness', 'energy']


df_characteristics.head()


# from this let's drop popularity and duration_ms
df_characteristics.drop(['popularity', 'duration_ms'], axis=1, inplace=True)
df_characteristics.head()


df_characteristics['energy'] = df_characteristics['Energy']


# merge df_metadata and df_characteristics
new_df = pd.merge(df_metadata_unique_albums, df_characteristics, left_index=True, right_index=True)
new_df


# Print the shape of the three dataframes
print('df shape:', df.shape)
print('df_metadata unique albums shape:', df_metadata_unique_albums.shape)
print('df_characteristics shape:', df_characteristics.shape)

print('new_df shape:', new_df.shape)

df shape: (530, 17)
df_metadata unique albums shape: (202, 5)
df_characteristics shape: (530, 11)
new_df shape: (202, 16)


new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 202 entries, 0 to 242
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              202 non-null    object 
 1   album             202 non-null    object 
 2   release_date      202 non-null    object 
 3   track_number      202 non-null    int64  
 4   id                202 non-null    object 
 5   acousticness      202 non-null    float64
 6   danceability      202 non-null    float64
 7   instrumentalness  202 non-null    float64
 8   liveness          202 non-null    float64
 9   loudness          202 non-null    float64
 10  speechiness       202 non-null    float64
 11  tempo             202 non-null    float64
 12  valence           202 non-null    float64
 13  popness           202 non-null    float64
 14  energy            202 non-null    float64
dtypes: float64(10), int64(1), object(4)
memory usage: 25.2+ KB


df_metadata_unique_albums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 202 entries, 0 to 242
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          202 non-null    object
 1   album         202 non-null    object
 2   release_date  202 non-null    object
 3   track_number  202 non-null    int64 
 4   id            202 non-null    object
dtypes: int64(1), object(4)
memory usage: 17.6+ KB


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 530 entries, 0 to 529
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              530 non-null    object 
 1   album             530 non-null    object 
 2   release_date      530 non-null    object 
 3   track_number      530 non-null    int64  
 4   id                530 non-null    object 
 5   uri               530 non-null    object 
 6   acousticness      530 non-null    float64
 7   danceability      530 non-null    float64
 8   energy            530 non-null    float64
 9   instrumentalness  530 non-null    float64
 10  liveness          530 non-null    float64
 11  loudness          530 non-null    float64
 12  speechiness       530 non-null    float64
 13  tempo             530 non-null    float64
 14  valence           530 non-null    float64
 15  popularity        530 non-null    int64  
 16  duration_ms       530 non-null    int64  
dtypes: float64(9), int64(3), object(5)
memory usage: 90.7+ KB


# how do I check if I merged the dataframes correctly?

# for every row in new_df, check if the values of the columns  are the same as the values in df

# columns in common between new_df and df
common_columns = new_df.columns.intersection(df.columns)
common_columns

Index(['name', 'album', 'release_date', 'track_number', 'id', 'acousticness',
       'danceability', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'energy'],
      dtype='object')


for i in new_df.index.values:
    for col in common_columns:

        if new_df[col][i] != df[col][i]:
            print('row', i, 'column', col, 'is different') 
            print('new_df:', new_df[col][i])     
            print('df:', df[col][i])

	name	album	release_date	track_number	id	uri	acousticness	danceability	energy	instrumentalness	liveness	loudness	speechiness	tempo	valence	popularity	duration_ms
0	Welcome To New York (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	1	4WUepByoeqcedHoYhSNHRt	spotify:track:4WUepByoeqcedHoYhSNHRt	0.009420	0.757	0.610	0.000037	0.3670	-4.840	0.0327	116.998	0.685	72	212600
1	Blank Space (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	2	0108kcWLnn2HlH2kedi1gn	spotify:track:0108kcWLnn2HlH2kedi1gn	0.088500	0.733	0.733	0.000000	0.1680	-5.376	0.0670	96.057	0.701	73	231833
2	Style (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	3	3Vpk1hfMAQme8VJ0SNRSkd	spotify:track:3Vpk1hfMAQme8VJ0SNRSkd	0.000421	0.511	0.822	0.019700	0.0899	-4.785	0.0397	94.868	0.305	74	231000
3	Out Of The Woods (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	4	1OcSfkeCg9hRC2sFKB4IMJ	spotify:track:1OcSfkeCg9hRC2sFKB4IMJ	0.000537	0.545	0.885	0.000056	0.3850	-5.968	0.0447	92.021	0.206	73	235800
4	All You Had To Do Was Stay (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	5	2k0ZEeAqzvYMcx9Qt5aClQ	spotify:track:2k0ZEeAqzvYMcx9Qt5aClQ	0.000656	0.588	0.721	0.000000	0.1310	-5.579	0.0317	96.997	0.520	72	193289

	name	album	release_date	track_number	id	uri	acousticness	danceability	energy	instrumentalness	liveness	loudness	speechiness	tempo	valence	popularity	duration_ms
525	Our Song	Taylor Swift	2006-10-24	11	15DeqWWQB4dcEWzJg15VrN	spotify:track:15DeqWWQB4dcEWzJg15VrN	0.11100	0.668	0.672	0.000000	0.3290	-4.931	0.0303	89.011	0.539	74	201106
526	I'm Only Me When I'm With You	Taylor Swift	2006-10-24	12	0JIdBrXGSJXS72zjF9ss9u	spotify:track:0JIdBrXGSJXS72zjF9ss9u	0.00452	0.563	0.934	0.000807	0.1030	-3.629	0.0646	143.964	0.518	59	213053
527	Invisible	Taylor Swift	2006-10-24	13	5OOd01o2YS1QFwdpVLds3r	spotify:track:5OOd01o2YS1QFwdpVLds3r	0.63700	0.612	0.394	0.000000	0.1470	-5.723	0.0243	96.001	0.233	56	203226
528	A Perfectly Good Heart	Taylor Swift	2006-10-24	14	1spLfUJxtyVyiKKTegQ2r4	spotify:track:1spLfUJxtyVyiKKTegQ2r4	0.00349	0.483	0.751	0.000000	0.1280	-5.726	0.0365	156.092	0.268	54	220146
529	Teardrops on My Guitar - Pop Version	Taylor Swift	2006-10-24	15	4pJi1rVt9GNegU9kywjg4z	spotify:track:4pJi1rVt9GNegU9kywjg4z	0.04020	0.459	0.753	0.000000	0.0863	-3.827	0.0537	199.997	0.483	55	179066

	name	album	release_date	track_number	id
0	Welcome To New York (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	1	4WUepByoeqcedHoYhSNHRt
1	Blank Space (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	2	0108kcWLnn2HlH2kedi1gn
2	Style (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	3	3Vpk1hfMAQme8VJ0SNRSkd
3	Out Of The Woods (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	4	1OcSfkeCg9hRC2sFKB4IMJ
4	All You Had To Do Was Stay (Taylor's Version)	1989 (Taylor's Version) [Deluxe]	2023-10-27	5	2k0ZEeAqzvYMcx9Qt5aClQ

	acousticness	danceability	energy	instrumentalness	liveness	loudness	speechiness	tempo	valence	popularity	duration_ms
0	0.009420	0.757	0.610	0.000037	0.3670	-4.840	0.0327	116.998	0.685	72	212600
1	0.088500	0.733	0.733	0.000000	0.1680	-5.376	0.0670	96.057	0.701	73	231833
2	0.000421	0.511	0.822	0.019700	0.0899	-4.785	0.0397	94.868	0.305	74	231000
3	0.000537	0.545	0.885	0.000056	0.3850	-5.968	0.0447	92.021	0.206	73	235800
4	0.000656	0.588	0.721	0.000000	0.1310	-5.579	0.0317	96.997	0.520	72	193289

	name	album	release_date	track_number	id
374	Welcome To New York	1989	2014-10-27	1	6qnM0XXPZOINWA778uNqQ9
375	Blank Space	1989	2014-10-27	2	1p80LdxRV74UKvL8gnD7ky
376	Style	1989	2014-10-27	3	4lIxdJw6W3Fg4vUIYCB0S5
377	Out Of The Woods	1989	2014-10-27	4	5OndtwLGA9O6XHFcGm2H7r
378	All You Had To Do Was Stay	1989	2014-10-27	5	0dAb8TY433dl3ZfXYCLE19
...	...	...	...	...	...
350	Wonderland	reputation Stadium Tour Surprise Song Playlist	2017-11-09	42	0eTCaVOYLpJlSkbEi2jay4
351	White Horse	reputation Stadium Tour Surprise Song Playlist	2017-11-09	43	4RMfSYDFkcz0bKoK8fHDgR
352	I'm Only Me When I'm With You	reputation Stadium Tour Surprise Song Playlist	2017-11-09	44	6Py8IKZqDyU8aQVrjilbJw
353	Starlight	reputation Stadium Tour Surprise Song Playlist	2017-11-09	45	7txpzmA69zyk50K3ZyZSzC
354	I Know Places	reputation Stadium Tour Surprise Song Playlist	2017-11-09	46	3qGLcmO0XnnrcpAQGZwTvA

Advanced Data Manipulation with Pandas¶

Split the dataframe¶

Metadata df¶

Excercise¶

Modify columns, and apply functions¶

Merging dataframes¶