# Set-up parameters
threshold_year <- 2019        # Exclude older years from some analyses
top_artist <- "Taylor Swift"  # Exclude one artist from some analyses
save_csv <- TRUE              # Export stream data frame as csv
save_svg <- TRUE              # Export ggplots as svg files

# Required libraries
library(jsonlite)   # Working with JSON data
library(dplyr)      # Data transformation
library(tidyr)      # Data cleaning
library(lubridate)  # Handling dates and times
library(ggplot2)    # Plot graphs
library(knitr)      # Report formatting
if (save_svg) {
  library(svglite)  # Create SVG files (only needed when exporting plots)
}

# Colour palette shared by every plot in the report
pal <- c(
  "#3abdaa", "#7b2458", "#facd00", "#41658a",
  "#e63946", "#b2a3b5", "#264653"
)
Data preparation
Code
# Find JSON streaming history files
stream_files <- list.files("input_files/", pattern = "Streaming_History_Audio")

# Fail early with a clear message instead of an obscure fromJSON() error
if (length(stream_files) == 0) {
  stop(
    "No 'Streaming_History_Audio' files found in input_files/",
    call. = FALSE
  )
}

# Merge all streaming history files into a data frame
# Read and parse streaming data from the first JSON file
stream <- fromJSON(paste0("input_files/", stream_files[1]), flatten = TRUE)

# Merge the remaining files, if any.
# seq_along(x)[-1] is empty when there is a single file, whereas
# 2:length(x) would iterate over c(2, 1) and try to read a missing file.
for (i in seq_along(stream_files)[-1]) {
  stream_i <- fromJSON(paste0("input_files/", stream_files[i]), flatten = TRUE)
  stream <- merge(stream, stream_i, all = TRUE)
}
# Calculate total listening hours by year
by_year <- stream %>%
  group_by(year = as.factor(year(date))) %>%
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60,  # ms -> hours
    n_tracks = n(),                              # number of plays
    # Distinct track names; the previous n_distinct(n_tracks) counted the
    # scalar computed on the line above and therefore always returned 1
    n_songs = n_distinct(track_name),
    n_artists = n_distinct(artist_name)
  )

# Display table
by_year %>%
  transmute(
    Year = year,
    `Hours played` = round(h_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N artists` = n_artists
  ) %>%
  kable(caption = "Spotify listened by year")
Spotify listened by year
Year
Hours played
N tracks
N songs
N artists
2014
6.20
736
1
455
2015
32.38
3682
1
1576
2016
81.11
2717
1
1218
2017
212.79
4882
1
1338
2018
363.37
9056
1
1820
2019
381.43
7731
1
1564
2020
670.41
13860
1
2519
2021
838.28
19114
1
2530
2022
808.95
24118
1
3197
2023
1161.05
29850
1
2690
2024
1141.30
31826
1
3102
By device
Code
# Total listening hours per device and year
by_device <- stream %>%
  group_by(Device, year = as.factor(year(date))) %>%
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60,
    .groups = "drop_last"  # same grouping summarise() leaves by default
  )

# Stacked bar chart of listening hours by device and year
plot_device <- ggplot(by_device, aes(x = year, y = h_played, fill = Device)) +
  # Stack bars in reverse order
  geom_col(position = position_stack(reverse = TRUE)) +
  # Custom colour palette; devices that could not be classified are grey
  scale_fill_manual(values = pal, na.value = "grey80") +
  theme_minimal() +
  labs(title = "Hours listening to Spotify", y = "")

# Display plot
plot_device
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave("plot_device.svg", plot = plot_device, path = "output_files/")
}
Shuffle mode
Code
# Share of tracks played in shuffle mode, per year (threshold year onwards)
by_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  group_by(year = as.factor(year(date)), shuffle) %>%
  # Keep the grouping by year so the proportion below is computed within year
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    `Proportion of tracks` = n / sum(n),
    Shuffle = ifelse(shuffle, "Yes", "No")
  )

# Stacked bar chart of tracks played in shuffle mode by year
plot_shuffle <- ggplot(
  by_shuffle,
  aes(x = year, y = `Proportion of tracks`, fill = Shuffle)
) +
  geom_col() +
  scale_fill_manual(values = pal, na.value = "grey80") +
  # Show the y-axis as percentages
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(title = "Proportion of tracks listened in shuffle mode", y = "")

# Display plot
plot_shuffle
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave("plot_shuffle.svg", plot = plot_shuffle, path = "output_files/")
}
Track start and end
Code
# Create consistent palettes for the track start / end visualisations, so that
# a reason present in both plots (e.g. "Forward button") keeps the same colour.

# Sort unique end and start values
names_end <- sort(unique(stream$End), decreasing = TRUE)
names_start <- sort(unique(stream$Start), decreasing = TRUE)

# Palette for end values (seq_along() is safe when there are no values)
pal_end <- pal[seq_along(names_end)]
names(pal_end) <- names_end

# Colours of the reasons shared by start and end
pal_start_end <- pal_end[names_end %in% names_start]

# Reasons that only occur as start reasons get the colours not taken by the
# shared reasons
start_only <- names_start[!names_start %in% names_end]
free_colours <- pal[!pal %in% pal_start_end]

# Make a palette that is too short an explicit error instead of NA colours
if (length(names_end) > length(pal) ||
      length(start_only) > length(free_colours)) {
  stop("`pal` has too few colours for the start/end reasons", call. = FALSE)
}

# Use only as many free colours as needed so names and colours always align
pal_start <- free_colours[seq_along(start_only)]
names(pal_start) <- start_only

# Combine palettes
pal_start <- c(pal_start_end, pal_start)
Code
# Number of tracks by year, start reason and shuffle mode
# (threshold year onwards)
by_reason_start_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")
  ) %>%
  group_by(year = as.factor(year(date)), `Start`, Shuffle) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))

# Stacked bar chart showing how tracks started (shuffled vs non-shuffled)
plot_reason_start_shuffle <- ggplot(
  by_reason_start_shuffle,
  aes(x = year, y = n, fill = `Start`)
) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) +  # Split by shuffle mode
  scale_fill_manual(values = pal_start) +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(title = "How tracks started", y = "Proportion of tracks") +
  theme(legend.title = element_blank())

# Display plot
plot_reason_start_shuffle
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave(
    "plot_reason_start_shuffle.svg",
    plot = plot_reason_start_shuffle,
    path = "output_files/"
  )
}
Code
# Number of tracks by year, end reason and shuffle mode
# (threshold year onwards)
by_reason_end_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")
  ) %>%
  group_by(year = as.factor(year(date)), `End`, Shuffle) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))

# Stacked bar chart showing how tracks ended (shuffled vs non-shuffled)
plot_reason_end_shuffle <- ggplot(
  by_reason_end_shuffle,
  aes(x = year, y = n, fill = `End`)
) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) +  # Split by shuffle mode
  scale_fill_manual(values = pal_end) +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(title = "How tracks ended", y = "Proportion of tracks") +
  theme(legend.title = element_blank())

# Display plot
plot_reason_end_shuffle
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave(
    "plot_reason_end_shuffle.svg",
    plot = plot_reason_end_shuffle,
    path = "output_files/"
  )
}
Listening peaks
Hours per day calendar
Code
# Create a data frame with every day from 1 January of the first streamed year
# to 31 December of the last one
all_days <- data.frame(
  day_date = seq(
    as.Date(floor_date(min(stream$date), unit = "year")),
    as.Date(ceiling_date(max(stream$date), unit = "year") - 1),
    by = "days"
  )
) %>%
  mutate(day_n = row_number())

# First weekday of each year (0 = Monday ... 6 = Sunday) and days per year
first_week_day <- all_days %>%
  mutate(
    weekday = as.POSIXlt(day_date)$wday,
    weekday = ifelse(weekday == 0, 7, weekday),  # Sunday: 0 -> 7
    year = year(day_date)
  ) %>%
  group_by(year) %>%
  summarize(
    first_week_day = first(weekday) - 1,
    days_year = n()
  )

# Process streaming data by day; the right join keeps days with no plays
by_day_year <- stream %>%
  mutate(day_date = date(date)) %>%
  right_join(all_days, by = "day_date") %>%
  group_by(day_date, day_n) %>%
  # Calculate daily metrics
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60,  # Convert ms to hours
    # Count real plays only: days without plays contribute one all-NA row
    # from the join, which n() would count as a track
    n_tracks = sum(!is.na(ms_played)),
    computer = mean(Device == "Computer", na.rm = TRUE),
    shuffle = mean(shuffle, na.rm = TRUE),
    incognito_mode = mean(incognito_mode, na.rm = TRUE),
    offline = mean(offline, na.rm = TRUE),
    # Hours of the top artist only. This must be built from the per-track
    # ms_played: h_played has already been replaced by the daily total above.
    # fixed = TRUE matches the artist name literally rather than as a regex.
    h_top_artist = sum(
      ifelse(grepl(top_artist, artist_name, fixed = TRUE), ms_played, 0),
      na.rm = TRUE
    ) / 1000 / 60 / 60
  ) %>%
  # Add calendar columns
  mutate(
    h_played = ifelse(is.na(h_played), 0, h_played),  # days with no plays
    year = year(day_date),
    month = month(day_date),
    week = week(day_date),
    day_year = yday(day_date),
    weekday = as.POSIXlt(day_date)$wday,
    weekday = factor(ifelse(weekday == 0, 7, weekday), levels = c(1:7))
  ) %>%
  left_join(first_week_day, by = "year") %>%
  # Week row within the year, offset by the weekday the year starts on
  mutate(calendar_row = ceiling((first_week_day + day_year) / 7))

# Create calendar heatmap
plot_day_year <- by_day_year %>%
  ggplot(aes(x = weekday, y = -calendar_row, fill = h_played)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  ggtitle("Hours listened per day") +
  theme(legend.title = element_blank()) +
  ylab("")

# Display plot
plot_day_year
Code
# Save plot if save_svg is TRUE
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave(
    "plot_day_year.svg",
    plot = plot_day_year,
    width = 7,
    height = 8,
    path = "output_files/"
  )
}
Other calendars
Code
# Other calendars - not included in the automatic report

# Calendar heatmap: share of each day's tracks played on a computer
ggplot(by_day_year, aes(x = weekday, y = -calendar_row, fill = computer)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  labs(title = "% of tracks listened on the computer per day", y = "")
Code
# Calendar heatmap: share of each day's tracks played in incognito mode
ggplot(
  by_day_year,
  aes(x = weekday, y = -calendar_row, fill = incognito_mode)
) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  labs(title = "% of tracks listened in incognito mode", y = "")
Code
# Calendar heatmap: share of each day's tracks played offline
ggplot(by_day_year, aes(x = weekday, y = -calendar_row, fill = offline)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  labs(title = "% of tracks listened offline", y = "")
Code
# Calendar heatmap: daily listening hours for the configured top artist
ggplot(
  by_day_year,
  aes(x = weekday, y = -calendar_row, fill = h_top_artist)
) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  labs(title = paste0("Hours listened to ", top_artist, " per day"), y = "")
Top days
Code
# Number of times each track was listened to on each day
by_top_track_day <- stream %>%
  mutate(day_date = date(date)) %>%
  group_by(track_name, day_date) %>%
  summarise(n_tracks = n()) %>%
  ungroup() %>%
  arrange(desc(n_tracks))

# Get top 10 days by hours played
by_top_day <- stream %>%
  mutate(day_date = date(date)) %>%
  group_by(day_date) %>%
  summarise(
    n_tracks_all = n(),
    computer = mean(Device == "Computer"),
    shuffle = mean(shuffle),
    h_played = sum(ms_played) / 1000 / 60 / 60
  ) %>%
  ungroup() %>%
  arrange(desc(h_played)) %>%
  slice_max(h_played, n = 10)

# Join the per-track counts of those days and compute the share of distinct
# tracks that were played more than once on the day
by_top_day <- by_top_day %>%
  left_join(by_top_track_day, by = "day_date") %>%
  group_by(day_date) %>%
  summarize(mult_times = mean(n_tracks > 1)) %>%
  left_join(by_top_day, by = "day_date") %>%
  arrange(desc(h_played)) %>%
  relocate(day_date, h_played, n_tracks = n_tracks_all)

# Display table
by_top_day %>%
  transmute(
    Date = day_date,
    Hours = round(h_played, 2),
    `N tracks` = n_tracks,
    `% tracks replayed` = paste0(round(mult_times, 4) * 100, "%"),
    `% tracks on the computer` = paste0(round(computer, 4) * 100, "%"),
    # Previously this column repeated `computer` instead of `shuffle`
    `% tracks on shuffle` = paste0(round(shuffle, 4) * 100, "%")
  ) %>%
  kable(caption = "Top 10 days by listening hours")
Top 10 days by listening hours
Date
Hours
N tracks
% tracks replayed
% tracks on the computer
% tracks on shuffle
2021-03-11
15.59
223
33.94%
100%
100%
2019-02-03
13.76
276
40.1%
100%
100%
2021-03-23
12.27
199
8.74%
100%
100%
2021-01-03
11.83
264
38.1%
100%
100%
2019-01-06
11.68
223
49.65%
99.55%
99.55%
2020-06-15
11.59
209
4.52%
100%
100%
2021-10-09
11.43
285
4.01%
95.44%
95.44%
2019-06-23
11.43
191
37.78%
64.4%
64.4%
2024-09-19
11.26
279
2.96%
72.04%
72.04%
2024-10-30
11.05
227
8.02%
72.25%
72.25%
Minutes listened by hour of the day
Code
# Create a data frame with all possible hours for each day.
# lubridate::hour() returns 0-23, so the grid must use 0:23; with 1:24 every
# play between 00:00 and 00:59 was dropped by the join and hour 24 was
# always empty.
all_hours <- all_days %>%
  expand(day_date, hour = 0:23)

# Calculate average listening time by hour and year
by_hour_year <- stream %>%
  mutate(
    day_date = date(date),
    hour = hour(date)
  ) %>%
  # Right join keeps day/hour slots with no plays
  right_join(all_hours, by = c("day_date", "hour")) %>%
  group_by(day_date, hour) %>%
  mutate(ms_played = ifelse(is.na(ms_played), 0, ms_played)) %>%
  summarise(min_played = sum(ms_played / 1000 / 60)) %>%  # ms -> minutes
  group_by(year = year(day_date), hour) %>%
  summarise(min_played = mean(min_played))  # average over the year's days

# Create visualization of average listening time by hour and year
plot_hour_year <- by_hour_year %>%
  ggplot(aes(x = year, y = hour, fill = min_played)) +
  geom_tile() +
  theme_void() +
  facet_grid(cols = vars(year), rows = vars(hour), scales = "free") +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Minutes listened per hour of the day (on average)") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_hour_year
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave(
    "plot_hour_year.svg",
    plot = plot_hour_year,
    width = 7,
    height = 5,
    path = "output_files/"
  )
}
What have I listened to in Spotify?
Top artists
Code
# Calculate statistics by year (denominators for the annual shares below)
by_year <- stream %>%
  group_by(year = year(date)) %>%
  summarise(
    min_played_year = sum(ms_played) / 1000 / 60,
    n_tracks_year = n(),
    n_songs_year = n_distinct(track_name)
  )

# Calculate statistics by artist and year
by_artist_year <- stream %>%
  group_by(artist_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    .groups = "drop"
  ) %>%
  # Split multiple artists ("A, B") into separate rows (up to 5 artists)
  separate_wider_delim(
    artist_name, ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  left_join(by_year, by = "year") %>%
  filter(!is.na(artist_name)) %>%
  group_by(artist_name, year) %>%
  summarise(
    min_played = sum(min_played),
    n_tracks = sum(n_tracks),
    n_songs = sum(n_songs),
    # The annual totals are constant within an artist-year group; taking the
    # first value keeps exactly one output row per group. Dividing by the
    # whole column returned one duplicate row per input row.
    p_tracks = n_tracks / first(n_tracks_year),
    p_min_played = min_played / first(min_played_year),
    p_songs = n_songs / first(n_songs_year)
  ) %>%
  arrange(year, desc(n_tracks))

# Display the three most played artists of each year (ties are kept)
by_artist_year %>%
  group_by(Year = year) %>%
  slice_max(n_tracks, n = 3) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `% of the annual time` = paste0(round(p_min_played, 4) * 100, "%"),
    `% of the annual tracks` = paste0(round(p_tracks, 4) * 100, "%"),
    `% of the annual songs` = paste0(round(p_songs, 4) * 100, "%")
  ) %>%
  kable(caption = "Top 3 artists by year")
Top 3 artists by year
Year
Artist
Minutes played
N tracks
N songs
% of the annual time
% of the annual tracks
% of the annual songs
2014
Glee Cast
2.78
16
16
0.75%
2.17%
2.59%
2014
Queen
3.51
13
8
0.94%
1.77%
1.29%
2014
Katy Perry
3.47
9
9
0.93%
1.22%
1.46%
2014
Muse
3.08
9
5
0.83%
1.22%
0.81%
2015
Maroon 5
18.71
26
8
0.96%
0.71%
0.35%
2015
Adele
23.30
25
8
1.2%
0.68%
0.35%
2015
Berlin
1.39
25
23
0.07%
0.68%
1.01%
2016
Video Game Players
37.51
55
32
0.77%
2.02%
1.81%
2016
Rihanna
59.91
40
15
1.23%
1.47%
0.85%
2016
P!nk
122.44
39
21
2.52%
1.44%
1.19%
2017
Beyoncé
256.46
81
11
2.01%
1.66%
0.51%
2017
Katy Perry
201.12
71
12
1.58%
1.45%
0.56%
2017
Fifth Harmony
199.88
67
8
1.57%
1.37%
0.37%
2018
Nate Fifield
146.91
139
30
0.67%
1.53%
1%
2018
Britney Spears
273.43
120
7
1.25%
1.33%
0.23%
2018
Katy Perry
229.32
99
11
1.05%
1.09%
0.37%
2019
Taylor Swift
530.63
182
32
2.32%
2.35%
1.16%
2019
Katy Perry
239.93
72
16
1.05%
0.93%
0.58%
2019
Belinda Carlisle
281.75
70
3
1.23%
0.91%
0.11%
2020
Taylor Swift
1519.54
499
89
3.78%
3.6%
1.86%
2020
Lady Gaga
334.64
106
21
0.83%
0.76%
0.44%
2020
Katy Perry
307.91
100
11
0.77%
0.72%
0.23%
2021
Taylor Swift
2882.50
916
181
5.73%
4.79%
3.23%
2021
La Oreja de Van Gogh
1088.05
385
57
2.16%
2.01%
1.02%
2021
Lofi Fruits Music
594.88
355
146
1.18%
1.86%
2.61%
2022
Taylor Swift
3025.07
1140
175
6.23%
4.73%
2.88%
2022
La Oreja de Van Gogh
758.02
315
48
1.56%
1.31%
0.79%
2022
Britney Spears
561.11
234
16
1.16%
0.97%
0.26%
2023
Taylor Swift
5558.48
1974
213
7.98%
6.61%
3.71%
2023
Britney Spears
1197.98
446
30
1.72%
1.49%
0.52%
2023
Miley Cyrus
1116.70
415
35
1.6%
1.39%
0.61%
2024
Chappell Roan
1584.05
605
17
2.31%
1.9%
0.25%
2024
Taylor Swift
1469.42
605
112
2.15%
1.9%
1.65%
2024
Charli xcx
822.99
420
47
1.2%
1.32%
0.69%
Code
# Artists behind the 40 largest artist-year play counts (across all years,
# not 40 per year), with their totals over those rows
top_artist_year <- by_artist_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 40) %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played),
    n_songs_all = sum(n_songs)
  ) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all) %>%
  distinct()

# Calculate statistics by artist over all years
by_artist_all <- stream %>%
  # Split multiple artists ("A, B") into separate rows (up to 5 artists)
  separate_wider_delim(
    artist_name, ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  # Drop the empty artist slots before aggregating
  filter(!is.na(artist_name)) %>%
  group_by(artist_name) %>%
  summarise(
    min_played_all = sum(ms_played / 1000 / 60),
    n_tracks_all = n(),
    n_songs_all = n_distinct(track_name),
    # Share of all plays in the streaming history; the previous
    # n() / sum(n()) was always 1
    p_tracks_all = n() / nrow(stream)
  ) %>%
  arrange(desc(n_tracks_all))

# Get top 40 artists overall
top_artist_all <- by_artist_all %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all)

# Display table (head() avoids NA rows when there are fewer than 10 artists)
head(top_artist_all, 10) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played_all, 2),
    `N tracks` = n_tracks_all,
    `N songs` = n_songs_all
  ) %>%
  kable(caption = "Top 10 artists (all years)")
Top 10 artists (all years)
Artist
Minutes played
N tracks
N songs
Taylor Swift
15132.69
5376
327
Britney Spears
3653.19
1422
54
Lady Gaga
3033.80
1148
60
La Oreja de Van Gogh
2934.66
1139
84
Miley Cyrus
2867.66
1103
60
Rihanna
2829.78
955
46
Beyoncé
2423.74
907
82
Katy Perry
2320.41
896
39
Shakira
1929.30
871
54
Charli xcx
1720.10
830
58
Code
# Artists that are in either top list (overall or by artist-year)
top_artist_global <- union(
  top_artist_year$artist_name,
  top_artist_all$artist_name
)

# Yearly statistics of those artists, with their all-years totals attached
by_top_artist <- by_artist_year %>%
  filter(artist_name %in% top_artist_global) %>%
  left_join(by_artist_all, by = "artist_name") %>%
  arrange(desc(n_tracks))
Global - excluding 1 artist
Code
# Artist x year heatmap of play counts, leaving out the configured top artist
plot_top_artist_year_filter <- by_top_artist %>%
  filter(artist_name != top_artist) %>%
  ggplot(
    aes(x = year, y = reorder(artist_name, -n_tracks_all), fill = n_tracks)
  ) +
  geom_tile() +
  theme_void() +
  # One facet cell per artist/year; artists ordered by all-years play count
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(artist_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  labs(
    title = paste0(
      "Times listened to most played artists (excluding ", top_artist, ")"
    )
  ) +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_year_filter
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave(
    "plot_top_artist_year_filter.svg",
    plot = plot_top_artist_year_filter,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}
Global
Code
# Artist x year heatmap of play counts (top_artist is not filtered out)
plot_top_artist_year <- ggplot(
  by_top_artist,
  aes(x = year, y = reorder(artist_name, -n_tracks_all), fill = n_tracks)
) +
  geom_tile() +
  theme_void() +
  # One facet cell per artist/year; artists ordered by all-years play count
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(artist_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  labs(title = "Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_year
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave(
    "plot_top_artist_year.svg",
    plot = plot_top_artist_year,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}
Global + Top 10 by year
Code
# Names of the artists that reach the top 10 (by play count) in any year
top_artist_year_partial <- by_artist_year %>%
  group_by(year) %>%
  slice_max(n_tracks, n = 10) %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  select(artist_name) %>%
  distinct()

# Combine the different top-artist lists into one set of artist names:
# top 40 by all-years plays, plus top_artist_year, plus the yearly top 10
top_artist_all <- by_artist_year %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, n_tracks_all) %>%
  bind_rows(top_artist_year) %>%
  bind_rows(top_artist_year_partial) %>%
  select(artist_name) %>%
  distinct()
Code
# Share of annual plays for every selected top artist, ordered by year and
# then by decreasing share
by_top_artist_matrix <- by_artist_year %>%
  right_join(top_artist_all, by = "artist_name") %>%
  select(artist_name, year, p_tracks) %>%
  arrange(year, desc(p_tracks))

# Wide artist x year table; only used to derive the row and column order
# of the heatmap below
matrix <- tidyr::pivot_wider(
  data = by_top_artist_matrix,
  id_cols = artist_name,
  names_from = year,
  values_from = p_tracks
)

# Set ordering for visualization
row_order <- matrix$artist_name
col_order <- names(matrix)[-1]  # drop the artist_name column

# Heatmap of annual play share, without the configured top artist
plot_top_artist_matrix <- by_top_artist_matrix %>%
  mutate(
    n_row = row_number(),
    year = factor(year, levels = col_order),
    artist_name = factor(artist_name, levels = row_order)
  ) %>%
  filter(!artist_name %in% top_artist) %>%
  ggplot(aes(x = year, y = artist_name, fill = p_tracks)) +
  geom_tile(position = "identity") +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(artist_name),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  labs(title = "Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_matrix
# Total minutes played and number of plays for each track/artist pair,
# most played first
top_track <- stream %>%
  group_by(track_name, artist_name) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),  # Convert ms to minutes
    n_tracks = n()
  ) %>%
  arrange(desc(n_tracks)) %>%
  ungroup()

# Display table
top_track[1:10, ] %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top 10 tracks")
Top 10 tracks
Track
Artist
Minutes played
N tracks
I Kissed A Girl
Katy Perry
578.46
260
Party In The U.S.A.
Miley Cyrus
647.46
260
Heaven Is A Place On Earth
Belinda Carlisle
841.42
245
Toxic
Britney Spears
584.73
244
S&M
Rihanna
814.68
226
…Baby One More Time
Britney Spears
522.48
194
Wannabe
Spice Girls
373.32
183
TiK ToK
Kesha
411.41
173
Red Wine Supernova
Chappell Roan
430.19
164
If U Seek Amy
Britney Spears
423.96
163
Code
# Minutes played and number of plays per track name and year
by_track_year <- stream %>%
  group_by(track_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n()
  )

# Tracks behind the 30 largest track-year play counts, with their totals
# over those rows
top_track_year <- by_track_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 30) %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  ) %>%
  select(track_name, n_tracks_all, min_played_all) %>%
  distinct()

# Top 30 tracks over all years, extended with the track-year list above
top_track_all <- by_track_year %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  ) %>%
  slice_max(n_tracks_all, n = 30) %>%
  select(track_name, n_tracks_all, min_played_all) %>%
  bind_rows(top_track_year) %>%
  distinct()

# Track x year heatmap of play counts
plot_track_year <- by_track_year %>%
  right_join(top_track_all, by = "track_name") %>%
  arrange(n_tracks) %>%
  ggplot(
    aes(x = year, y = reorder(track_name, -n_tracks_all), fill = n_tracks)
  ) +
  geom_tile() +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(track_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  labs(title = "Times listened to most played tracks") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_track_year
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave(
    "plot_track_year.svg",
    plot = plot_track_year,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}
Manually selected tracks
Code
# Tracks that were most often started by explicitly selecting them
by_track_select <- stream %>%
  # Keep only manually selected plays that have a track name
  filter(Start == "Selected", !is.na(track_name)) %>%
  group_by(track_name, artist_name) %>%
  summarise(n_selected = n()) %>%
  ungroup() %>%
  # Attach the overall play counts of each track
  left_join(top_track, by = c("track_name", "artist_name")) %>%
  # Share of the track's plays that were manually selected
  mutate(p_selected = n_selected / n_tracks) %>%
  # Keep the 20 most selected tracks (ties are kept)
  slice_max(n_selected, n = 20) %>%
  arrange(desc(n_selected))

# Display table
by_track_select %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `Times Selected` = n_selected,
    `% Selected` = paste0(round(p_selected, 4) * 100, "%"),
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top selected tracks")
Top selected tracks
Track
Artist
Times Selected
% Selected
N tracks
Tití Me Preguntó
Bad Bunny
43
34.68%
124
Estrella Polar (with Juan Aguirre)
Pereza
42
33.87%
124
No Sigue Modas
Juan Magán
34
45.95%
74
Don’t Blame Me
Taylor Swift
29
19.08%
152
Heaven Is A Place On Earth
Belinda Carlisle
28
11.43%
245
Al Amanecer
Los Fresones Rebeldes
27
22.69%
119
Lisa
Young Miko
27
18.12%
149
Mi Reina
Henry Mendez
27
21.09%
128
Nobody Like U
4*TOWN (From Disney and Pixar’s Turning Red)
27
45%
60
No Go Go Go!
89ers
25
34.25%
73
Red Wine Supernova
Chappell Roan
25
15.24%
164
Right Round
The Treblemakers
25
36.76%
68
Rosas
Algo
25
21.37%
117
Sarà perché ti amo
DJ Matrix
25
28.41%
88
Shape of You
Ed Sheeran
25
29.76%
84
APRENDER A AMAR
NATHY PELUSO
24
27.59%
87
Bad Karma (feat. Joan Jett)
Miley Cyrus
24
28.57%
84
Boys
Charli xcx
24
23.53%
102
Tick Tick Boom (feat. BygTwo3)
Sage The Gemini
24
28.24%
85
Yo Solo Quiero Amor (From “Te Estoy Amando Locamente”)
Rigoberta Bandini
24
31.17%
77
In loop
Times played in one day
Code
# Display the ten track/day pairs with the most plays
# (by_top_track_day is already sorted by decreasing play count)
by_top_track_day[1:10, ] %>%
  transmute(
    Date = day_date,
    Track = track_name,
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top 10 songs by times played in one day")
Top 10 songs by times played in one day
Date
Track
N tracks
2024-06-05
La Ladrona - Remasterizado 2009
23
2024-03-29
Little Boxes
23
2023-08-10
Yo Solo Quiero Amor (From “Te Estoy Amando Locamente”)
23
2023-08-10
Hits 2000’ - Mashup
21
2024-04-14
Chicken Little
17
2023-10-26
Rikiti
17
2023-03-09
Aunque Pene
16
2023-05-15
Goodbye Earl
16
2021-10-11
Yo Invito
16
2024-05-22
APRENDER A AMAR
15
Code
# Find tracks that were played more than 5 times in a day
# and count on how many days this happened
by_top_track_loop <- by_top_track_day %>%
  filter(n_tracks > 5) %>%
  group_by(track_name) %>%
  summarise(n_days = n()) %>%
  ungroup() %>%
  arrange(desc(n_days)) %>%
  # Keep named tracks that were "in loop" on more than two days
  filter(n_days > 2, !is.na(track_name))

# Display table
by_top_track_loop %>%
  transmute(
    `Track` = track_name,
    `Days in loop (times played>5)` = n_days
  ) %>%
  kable(caption = "Tracks played more than 5 times several days")
Tracks played more than 5 times several days
Track
Days in loop (times played>5)
“Hetero”
4
APRENDER A AMAR
3
Chicken Little
3
Dios Está Aquí (Perreo Mix)
3
Like a Prayer - Live
3
Me quedaré solo
3
Niños furbito y niñas lo que sea
3
Sarà perché ti amo
3
Stupid With Love
3
Teenage Dirtbag
3
Welcome To New York (Taylor’s Version)
3
Guilty pleasures (incognito mode)
Per year
Code
# Bar chart of the share of tracks listened in incognito mode, per year
plot_incognito <- stream %>%
  group_by(year = as.factor(year(date))) %>%
  summarise(`Proportion of tracks` = mean(incognito_mode)) %>%
  ggplot(aes(x = year, y = `Proportion of tracks`)) +
  geom_col(fill = pal[1]) +
  # Show the y-axis as percentages
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(title = "Proportion of tracks listened in incognito mode", y = "")

# Display plot
plot_incognito
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave("plot_incognito.svg", plot = plot_incognito, path = "output_files/")
}
Top incognito tracks
Playlists and Library tracks
Set up
Code
# Read and parse playlist data from JSON file
playlist <- fromJSON("input_files/Playlist1.json", flatten = TRUE)[[1]]

# Extract the tracks of each playlist, tagged with the playlist name.
# seq_len() makes the loop a no-op when there are no playlists (1:nrow() would
# iterate over c(1, 0)); pieces are collected in a pre-allocated list and
# bound once instead of growing a data frame on every iteration.
playlist_tracks_list <- vector("list", nrow(playlist))
for (i in seq_len(nrow(playlist))) {
  playlist_tracks_i <- playlist$items[[i]]
  playlist_tracks_i$playlist_name <- playlist$name[i]
  playlist_tracks_list[[i]] <- playlist_tracks_i
}
playlist_tracks <- bind_rows(playlist_tracks_list)

# Read library data from JSON file
# NOTE(review): this object shares its name with base::library(); calls such
# as library(pkg) still resolve to the function, but a clearer name would
# help. It is kept because later chunks refer to `library`.
library <- fromJSON("input_files/YourLibrary.json", flatten = TRUE)[[1]]
User playlists
Code
# Parse the added date and give the track columns the same names as the
# streaming data
playlist_tracks <- playlist_tracks %>%
  mutate(
    added_date = as_datetime(addedDate, tz = "UTC"),
    artist_name = track.artistName,
    track_name = track.trackName,
    album_name = track.albumName
  )

# Number of playlist entries per track/artist pair, most frequent first
playlist_top_tracks <- playlist_tracks %>%
  group_by(track_name, artist_name) %>%
  summarise(n_playlists = n()) %>%
  arrange(desc(n_playlists)) %>%
  ungroup()

# Display table
playlist_top_tracks[1:10, ] %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Tracks included in more playlists")
Tracks included in more playlists
Track
Artist
N playlists
Heaven Is A Place On Earth
Belinda Carlisle
5
Rebel Rebel - 2016 Remaster
David Bowie
5
Ateo
C. Tangana
4
Barbie Girl
Aqua
4
Call It Fate, Call It Karma
The Strokes
4
Dulce y Bautizada
Samantha Hudson
4
El Fin del Mundo
La La Love You
4
I Follow Rivers - The Magician Remix
Lykke Li
4
I Need A Dollar
Aloe Blacc
4
Lisa
Young Miko
4
Code
# Per artist: playlist entries, distinct songs and distinct playlists,
# sorted by the number of playlists the artist appears in
playlist_top_artist <- playlist_tracks %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    n_playlists = n_distinct(playlist_name)
  ) %>%
  arrange(desc(n_playlists)) %>%
  ungroup()

# Display table
playlist_top_artist[1:10, ] %>%
  transmute(
    `Artist` = artist_name,
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Artists included in more playlists")
Artists included in more playlists
Artist
N tracks
N songs
N playlists
Taylor Swift
108
70
14
Miley Cyrus
16
10
9
Beyoncé
9
7
8
David Bowie
7
3
7
Madonna
9
9
7
Queen
9
6
7
Avril Lavigne
8
4
6
Bad Bunny
8
4
6
C. Tangana
9
4
6
La La Love You
6
3
6
User library
Code
# Give the library (saved songs) columns the same names as the streaming data
library_tracks <- library %>%
  mutate(
    artist_name = artist,
    track_name = track,
    album_name = album
  )

# Number of distinct saved songs per artist, largest first
library_top_artist <- library_tracks %>%
  group_by(artist_name) %>%
  summarise(n_songs = n_distinct(track_name)) %>%
  arrange(desc(n_songs)) %>%
  ungroup()

# Display table
library_top_artist[1:10, ] %>%
  transmute(
    `Artist` = artist_name,
    `N songs` = n_songs
  ) %>%
  kable(caption = "Artists with more songs in library")
Artists with more songs in library
Artist
N songs
Taylor Swift
40
Charli xcx
12
Miley Cyrus
11
Mother Mother
10
Beyoncé
9
Chappell Roan
9
Måneskin
9
Rigoberta Bandini
8
Amaral
7
Bizarrap
7
Saved in playlist or library
Code
# Track names saved in any playlist or in the library
saved_tracks <- unique(
  c(playlist_top_tracks$track_name, library_tracks$track_name)
)

# Artist names saved in any playlist or in the library
saved_artist <- unique(
  c(playlist_top_artist$artist_name, library_top_artist$artist_name)
)

# Per year, share of streamed tracks whose name is among the saved tracks
# (matching is by track name only)
plot_saved_track <- stream %>%
  mutate(
    `In my playlists` = ifelse(track_name %in% saved_tracks, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(
    title = "Were the tracks (streaming history) in the current playlists?",
    y = ""
  )

# Display plot
plot_saved_track
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave(
    "plot_saved_track.svg",
    plot = plot_saved_track,
    path = "output_files/"
  )
}
Code
# Per year, share of streamed tracks whose artist is among the saved artists
plot_saved_artist <- stream %>%
  mutate(
    `In my playlists` = ifelse(artist_name %in% saved_artist, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(
    title = "Were the artists (streaming history) in the current playlists?",
    y = ""
  )

# Display plot
plot_saved_artist
Code
# Save plot
# The plot is passed explicitly so the export does not depend on whichever
# plot happens to be ggplot2's last_plot() when this chunk runs.
if (save_svg) {
  ggsave(
    "plot_saved_artist.svg",
    plot = plot_saved_artist,
    path = "output_files/"
  )
}
Source Code
---
title: "Spotify Stream History Analysis"
author: "Natalia Ciria"
date: last-modified
format:
  html:
    embed-resources: true
    code-tools: true
    code-fold: true
    code-block-border-left: true
execute:
  warning: false
  message: false
---

## Set-up

```{r}
# Set-up parameters
threshold_year <- 2019 # Exclude older years from some analysis
top_artist <- "Taylor Swift" # Exclude one artist from some analysis
save_csv <- TRUE # Export stream data frame as csv
save_svg <- TRUE # Export ggplots as svg files

# Required libraries
library(jsonlite) # Working with JSON data
library(dplyr) # Data transformation
library(tidyr) # Data cleaning
library(lubridate) # Handling dates and times
library(ggplot2) # Plot graphs
library(knitr) # Report formatting
if (save_svg) library(svglite) # Create SVG files

# Colour palette
pal <- c(
  "#3abdaa", "#7b2458", "#facd00", "#41658a",
  "#e63946", "#b2a3b5", "#264653"
)

# Make sure the export folder exists before anything is written to it
if (save_csv || save_svg) {
  dir.create("output_files", showWarnings = FALSE)
}
```

## Data preparation

```{r}
# Find json streaming history files
stream_files <- list.files("input_files/", pattern = "Streaming_History_Audio")
if (length(stream_files) == 0) {
  stop("No Streaming_History_Audio files found in input_files/", call. = FALSE)
}

# Read and parse every streaming history JSON file
stream_list <- lapply(
  stream_files,
  function(f) fromJSON(paste0("input_files/", f), flatten = TRUE)
)

# Merge all streaming history files into a single data frame.
# Reduce() returns the only element unchanged when there is a single file,
# whereas a `2:length(stream_files)` loop would iterate over c(2, 1).
stream <- Reduce(function(x, y) merge(x, y, all = TRUE), stream_list)
```

```{r}
# Process Spotify streaming data
stream <- stream %>%
  mutate(
    # Convert timestamp and rename metadata columns
    date = as_datetime(ts, tz = "UTC"),
    artist_name = master_metadata_album_artist_name,
    track_name = master_metadata_track_name,
    album_name = master_metadata_album_album_name,
    # Categorize device types (platforms matching neither pattern stay NA
    # and are drawn with the `na.value` colour in the plots)
    Device = case_when(
      grepl("Android-tablet|Android OS|android|android_tv", platform) ~ "Phone",
      grepl("public_js|web_player|WebPlayer|chrome|Windows 10|windows", platform) ~ "Computer"
    ),
    # Categorize track end reasons
    End = case_when(
      reason_end %in% c("trackdone", "endplay") ~ "Track finished",
      reason_end == "logout" ~ "Spotify closed",
      reason_end == "playbtn" ~ "Play button",
      reason_end == "fwdbtn" ~ "Forward button",
      reason_end == "backbtn" ~ "Backward button",
      .default = "Other"
    ),
    # Categorize track start reasons
    Start = case_when(
      reason_start %in% c("clickrow", "click-row") ~ "Selected",
      reason_start == "trackdone" ~ "Track finished",
      reason_start == "persisted" ~ "Persisted",
      reason_start == "playbtn" ~ "Play button",
      reason_start == "fwdbtn" ~ "Forward button",
      reason_start == "backbtn" ~ "Backward button",
      .default = "Other"
    )
  )

# Save processed data if save_csv is TRUE
if (save_csv) write.csv(stream, "output_files/stream.csv")
```

## Annual Spotify hours

### Total

```{r}
# Calculate total listening hours by year
by_year <- stream %>%
  group_by(year = as.factor(year(date))) %>%
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60,
    n_tracks = n(),
    # Distinct songs are counted on the track name; counting distinct values
    # of the scalar `n_tracks` would always give 1
    n_songs = n_distinct(track_name),
    n_artists = n_distinct(artist_name)
  )

# Display table
by_year %>%
  transmute(
    Year = year,
    `Hours played` = round(h_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N artists` = n_artists
  ) %>%
  kable(caption = "Spotify listened by year")
```

### By device

```{r}
# Calculate total listening hours by year and device
by_device <- stream %>%
  group_by(Device, year = as.factor(year(date))) %>%
  summarise(h_played = sum(ms_played) / 1000 / 60 / 60)

# Create stacked bar chart of listening hours by device and year
plot_device <- by_device %>%
  ggplot(aes(x = year, y = h_played, fill = Device)) +
  geom_col(position = position_stack(reverse = TRUE)) + # Stack bars with reverse order
  scale_fill_manual(values = pal, na.value = "grey80") + # Custom color palette
  theme_minimal() +
  ggtitle("Hours listening to Spotify") +
  ylab("")

# Display plot
plot_device

# Save plot
if (save_svg) {
  ggsave("plot_device.svg", plot = plot_device, path = "output_files/")
}
```

## Shuffle mode

```{r}
# Calculate tracks played in shuffle mode by year
by_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>% # Filter data from threshold year onwards
  group_by(year = as.factor(year(date)), shuffle) %>%
  summarise(n = n()) %>%
  # Still grouped by year here, so sum(n) is the annual total
  mutate(
    `Proportion of tracks` = n / sum(n),
    Shuffle = ifelse(shuffle, "Yes", "No")
  )

# Create stacked bar chart of tracks played in shuffle mode by year
plot_shuffle <- by_shuffle %>%
  ggplot(aes(x = year, y = `Proportion of tracks`, fill = Shuffle)) +
  geom_col() +
  scale_fill_manual(values = pal, na.value = "grey80") +
  theme_minimal() +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) + # Convert y-axis to percentages
  ggtitle("Proportion of tracks listened in shuffle mode") +
  ylab("")

# Display plot
plot_shuffle

# Save plot
if (save_svg) {
  ggsave("plot_shuffle.svg", plot = plot_shuffle, path = "output_files/")
}
```

## Track start and end

```{r}
# Create consistent palettes for track start and end visualization
# Sort unique end and start values
names_end <- sort(unique(stream$End), decreasing = TRUE)
names_start <- sort(unique(stream$Start), decreasing = TRUE)

# Create palette for end values
pal_end <- pal[seq_along(names_end)]
names(pal_end) <- names_end

# Find overlapping colors between start and end
pal_start_end <- pal_end[names_end %in% names_start]

# Create palette for start values
# NOTE(review): this assumes the number of unused colours equals the number
# of start-only categories; names<- fails otherwise.
pal_start <- pal[!pal %in% pal_start_end]
names(pal_start) <- names_start[!names_start %in% names_end]

# Combine palettes
pal_start <- c(pal_start_end, pal_start)
```

```{r}
# Calculate number of tracks by year, start reason and shuffle mode
by_reason_start_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")) %>%
  group_by(year = as.factor(year(date)), `Start`, Shuffle) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))

# Create a stacked bar chart showing how tracks started (shuffled vs non-shuffled)
plot_reason_start_shuffle <- by_reason_start_shuffle %>%
  ggplot(aes(x = year, y = n, fill = `Start`)) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) + # Split by shuffle mode
  scale_fill_manual(values = pal_start) +
  theme_minimal() +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  ggtitle("How tracks started") +
  ylab("Proportion of tracks") +
  theme(legend.title = element_blank())

# Display plot
plot_reason_start_shuffle

# Save plot
if (save_svg) {
  ggsave(
    "plot_reason_start_shuffle.svg",
    plot = plot_reason_start_shuffle,
    path = "output_files/"
  )
}
```

```{r}
# Calculate number of tracks listened by year, end reason and shuffle mode
by_reason_end_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")) %>%
  group_by(year = as.factor(year(date)), `End`, Shuffle) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))

# Create a stacked bar chart showing how tracks ended (shuffled vs non-shuffled)
plot_reason_end_shuffle <- by_reason_end_shuffle %>%
  ggplot(aes(x = year, y = n, fill = `End`)) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) + # Split by shuffle mode
  scale_fill_manual(values = pal_end) +
  theme_minimal() +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  ggtitle("How tracks ended") +
  ylab("Proportion of tracks") +
  theme(legend.title = element_blank())

# Display plot
plot_reason_end_shuffle

# Save plot
if (save_svg) {
  ggsave(
    "plot_reason_end_shuffle.svg",
    plot = plot_reason_end_shuffle,
    path = "output_files/"
  )
}
```

## Listening peaks

### Hours per day calendar

```{r}
#| fig.height: 7.6
# Create a dataframe with all days in the date range
# (from 1 January of the first year to 31 December of the last year)
all_days <- data.frame(
  day_date = seq(
    as.Date(floor_date(min(stream$date), unit = "year")),
    as.Date(ceiling_date(max(stream$date), unit = "year") - 1),
    by = "days"
  )
) %>%
  mutate(day_n = row_number())

# Calculate first weekday of each year (Monday = 1 ... Sunday = 7)
first_week_day <- all_days %>%
  mutate(
    weekday = as.POSIXlt(day_date)$wday,
    weekday = ifelse(weekday == 0, 7, weekday),
    year = year(day_date)
  ) %>%
  group_by(year) %>%
  summarize(
    first_week_day = first(weekday) - 1,
    days_year = n()
  )

# Process streaming data by day
by_day_year <- stream %>%
  mutate(day_date = date(date)) %>%
  right_join(all_days, by = "day_date") %>%
  group_by(day_date, day_n) %>%
  # Calculate daily metrics
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60, # Convert ms to hours
    # Days without any stream only hold the filler row added by the join,
    # so count actual streams instead of rows
    n_tracks = sum(!is.na(ms_played)),
    computer = mean(Device == "Computer", na.rm = TRUE),
    shuffle = mean(shuffle, na.rm = TRUE),
    incognito_mode = mean(incognito_mode, na.rm = TRUE),
    offline = mean(offline, na.rm = TRUE),
    # Hours of the top artist: sum the play time of the matching rows
    # (h_played above is already the daily total, not a per-row value)
    h_top_artist = sum(
      ms_played[grepl(top_artist, artist_name)],
      na.rm = TRUE
    ) / 1000 / 60 / 60
  ) %>%
  # Add calendar columns
  mutate(
    h_played = ifelse(is.na(h_played), 0, h_played),
    year = year(day_date),
    month = month(day_date),
    week = week(day_date),
    day_year = yday(day_date),
    weekday = as.POSIXlt(day_date)$wday,
    weekday = factor(ifelse(weekday == 0, 7, weekday), levels = c(1:7))
  ) %>%
  left_join(first_week_day, by = "year") %>%
  mutate(
    calendar_row = ceiling((first_week_day + day_year) / 7)
  )

# Create calendar heatmap
plot_day_year <- by_day_year %>%
  ggplot(aes(x = weekday, y = -calendar_row, fill = h_played)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)]), na.value = "grey95") +
  ggtitle("Hours listened per day") +
  theme(legend.title = element_blank()) +
  ylab("")

# Display plot
plot_day_year

# Save plot if save_svg is TRUE
if (save_svg) {
  ggsave(
    "plot_day_year.svg",
    plot = plot_day_year,
    width = 7, height = 8,
    path = "output_files/"
  )
}
```

### Other calendars

```{r, output=FALSE}
# Other calendars - not included in the automatic report

# Plot for computer usage
by_day_year %>%
  ggplot(aes(x = weekday, y = -calendar_row, fill = computer)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)]), na.value = "grey95") +
  ggtitle("% of tracks listened on the computer per day") +
  ylab("")

# Plot for incognito mode usage
by_day_year %>%
  ggplot(aes(x = weekday, y = -calendar_row, fill = incognito_mode)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)]), na.value = "grey95") +
  ggtitle("% of tracks listened in incognito mode") +
  ylab("")

# Plot for offline mode
by_day_year %>%
  ggplot(aes(x = weekday, y = -calendar_row, fill = offline)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)]), na.value = "grey95") +
  ggtitle("% of tracks listened offline") +
  ylab("")

# Plot for top artist listening patterns
by_day_year %>%
  ggplot(aes(x = weekday, y = -calendar_row, fill = h_top_artist)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)]), na.value = "grey95") +
  ggtitle(paste0("Hours listened to ", top_artist, " per day")) +
  ylab("")
```

### Top days

```{r}
# Create summary of number of times a track was listened in a day
by_top_track_day <- stream %>%
  mutate(day_date = date(date)) %>%
  group_by(track_name, day_date) %>%
  summarise(n_tracks = n()) %>%
  ungroup() %>%
  arrange(desc(n_tracks))

# Get top 10 days by hours played
by_top_day <- stream %>%
  mutate(day_date = date(date)) %>%
  group_by(day_date) %>%
  summarise(
    n_tracks_all = n(),
    computer = mean(Device == "Computer"),
    shuffle = mean(shuffle),
    h_played = sum(ms_played) / 1000 / 60 / 60
  ) %>%
  ungroup() %>%
  arrange(desc(h_played)) %>%
  slice_max(h_played, n = 10)

# Join the data and calculate final metrics
# (mult_times = share of that day's distinct tracks played more than once)
by_top_day <- by_top_day %>%
  left_join(by_top_track_day, by = "day_date") %>%
  group_by(day_date) %>%
  summarize(mult_times = mean(n_tracks > 1)) %>%
  left_join(by_top_day, by = "day_date") %>%
  arrange(desc(h_played)) %>%
  relocate(day_date, h_played, n_tracks = n_tracks_all)

# Display table
by_top_day %>%
  transmute(
    Date = day_date,
    Hours = round(h_played, 2),
    `N tracks` = n_tracks,
    `% tracks replayed` = paste0(round(mult_times, 4) * 100, "%"),
    `% tracks on the computer` = paste0(round(computer, 4) * 100, "%"),
    # The shuffle column reports the shuffle share, not the computer share
    `% tracks on shuffle` = paste0(round(shuffle, 4) * 100, "%")
  ) %>%
  kable(caption = "Top 10 days by listening hours")
```

### Minutes listened by hour of the day

```{r}
# Create a dataframe with all possible hours for each day.
# lubridate::hour() returns 0-23, so the grid has to use the same range;
# with 1:24 the plays at midnight are dropped and hour 24 is always empty.
all_hours <- all_days %>%
  expand(day_date, hour = 0:23)

# Calculate average listening time by hour and year
by_hour_year <- stream %>%
  mutate(
    day_date = date(date),
    hour = hour(date)
  ) %>%
  right_join(all_hours, by = c("day_date", "hour")) %>%
  group_by(day_date, hour) %>%
  # Hours without any stream count as 0 minutes in the average
  mutate(ms_played = ifelse(is.na(ms_played), 0, ms_played)) %>%
  summarise(min_played = sum(ms_played / 1000 / 60)) %>%
  group_by(year = year(day_date), hour) %>%
  summarise(min_played = mean(min_played))

# Create visualization of average listening time by hour and year
plot_hour_year <- by_hour_year %>%
  ggplot(aes(x = year, y = hour, fill = min_played)) +
  geom_tile() +
  theme_void() +
  facet_grid(cols = vars(year), rows = vars(hour), scales = "free") +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Minutes listened per hour of the day (on average)") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_hour_year

# Save plot
if (save_svg) {
  ggsave(
    "plot_hour_year.svg",
    plot = plot_hour_year,
    width = 7, height = 5,
    path = "output_files/"
  )
}
```

## What have I listened to in Spotify?

### Top artists

```{r}
# Calculate statistics by year
# (replaces the earlier by_year table; year is numeric here)
by_year <- stream %>%
  group_by(year = year(date)) %>%
  summarise(
    min_played_year = sum(ms_played) / 1000 / 60,
    n_tracks_year = n(),
    n_songs_year = n_distinct(track_name)
  )

# Calculate statistics by artist and year
by_artist_year <- stream %>%
  group_by(artist_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    # Drop the grouping so artist_name can be split into several columns
    .groups = "drop"
  ) %>%
  # Split multiple artists into separate rows
  separate_wider_delim(
    artist_name, ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  left_join(by_year, by = "year") %>%
  filter(!is.na(artist_name)) %>%
  group_by(artist_name, year) %>%
  summarise(
    min_played = sum(min_played),
    n_tracks = sum(n_tracks),
    n_songs = sum(n_songs),
    # The annual totals are constant within a group; first() keeps the
    # result at one row per artist and year
    p_tracks = n_tracks / first(n_tracks_year),
    p_min_played = min_played / first(min_played_year),
    p_songs = n_songs / first(n_songs_year)
  ) %>%
  arrange(year, desc(n_tracks))

# Display table
by_artist_year %>%
  group_by(Year = year) %>%
  slice_max(n_tracks, n = 3) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `% of the annual time` = paste0(round(p_min_played, 4) * 100, "%"),
    `% of the annual tracks` = paste0(round(p_tracks, 4) * 100, "%"),
    `% of the annual songs` = paste0(round(p_songs, 4) * 100, "%")
  ) %>%
  kable(caption = "Top 3 artists by year")

# Get top 40 artists by year
top_artist_year <- by_artist_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 40) %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played),
    n_songs_all = sum(n_songs)
  ) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all) %>%
  distinct()

# Calculate statistics by artist
by_artist_all <- stream %>%
  # Split multiple artists into separate rows
  separate_wider_delim(
    artist_name, ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  group_by(artist_name) %>%
  summarise(
    min_played_all = sum(ms_played / 1000 / 60),
    n_tracks_all = n(),
    n_songs_all = n_distinct(track_name),
    # Share of all streams featuring the artist
    # (n() / sum(n()) inside a group is always 1)
    p_tracks_all = n() / nrow(stream)
  ) %>%
  filter(!is.na(artist_name)) %>%
  arrange(desc(n_tracks_all))

# Get top 40 artists overall
top_artist_all <- by_artist_all %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all)

# Display table
top_artist_all[1:10, ] %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played_all, 2),
    `N tracks` = n_tracks_all,
    `N songs` = n_songs_all
  ) %>%
  kable(caption = "Top 10 artists (all years)")

# Get top 40 artists overall or by year
top_artist_global <- unique(
  c(top_artist_year$artist_name, top_artist_all$artist_name)
)

# Join top artists data and filter top artists
by_top_artist <- by_artist_year %>%
  filter(artist_name %in% top_artist_global) %>%
  left_join(by_artist_all, by = "artist_name") %>%
  arrange(desc(n_tracks))
```

#### Global - excluding 1 artist

```{r}
#| fig.height: 9
# Create heatmap excluding top artist
plot_top_artist_year_filter <- by_top_artist %>%
  filter(!artist_name == top_artist) %>%
  ggplot(aes(x = year, y = reorder(artist_name, -n_tracks_all), fill = n_tracks)) +
  geom_tile() +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(artist_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle(paste0("Times listened to most played artists (excluding ", top_artist, ")")) +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_year_filter

# Save plot
if (save_svg) {
  ggsave(
    "plot_top_artist_year_filter.svg",
    plot = plot_top_artist_year_filter,
    width = 7, height = 9,
    path = "output_files/"
  )
}
```

#### Global

```{r}
#| fig.height: 9
# Create a heatmap (not filtering top_artist)
plot_top_artist_year <- by_top_artist %>%
  ggplot(aes(x = year, y = reorder(artist_name, -n_tracks_all), fill = n_tracks)) +
  geom_tile() +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(artist_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_year

# Save plot
if (save_svg) {
  ggsave(
    "plot_top_artist_year.svg",
    plot = plot_top_artist_year,
    width = 7, height = 9,
    path = "output_files/"
  )
}
```

#### Global + Top 10 by year

```{r}
# Get top 10 artists for each year
top_artist_year_partial <- by_artist_year %>%
  group_by(year) %>%
  slice_max(n_tracks, n = 10) %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  select(artist_name) %>%
  distinct()

# Combine different top artist lists
# (from here on top_artist_all only holds the artist names)
top_artist_all <- by_artist_year %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, n_tracks_all) %>%
  bind_rows(top_artist_year) %>%
  bind_rows(top_artist_year_partial) %>%
  select(artist_name) %>%
  distinct()
```

```{r}
#| fig.height: 15
# Prepare data for matrix visualization
by_top_artist_matrix <- by_artist_year %>%
  right_join(top_artist_all, by = "artist_name") %>%
  select(artist_name, year, p_tracks) %>%
  arrange(year, desc(p_tracks))

# Create wide format matrix
# (this is redundant, but I was stuck getting the artist order right)
# Named artist_matrix so it does not shadow base::matrix()
artist_matrix <- tidyr::pivot_wider(
  data = by_top_artist_matrix,
  id_cols = artist_name,
  names_from = year,
  values_from = p_tracks
)

# Set ordering for visualization
row_order <- artist_matrix$artist_name
col_order <- names(artist_matrix)[-1]

# Create heatmap visualization
plot_top_artist_matrix <- by_top_artist_matrix %>%
  mutate(
    n_row = row_number(),
    year = factor(year, levels = col_order),
    artist_name = factor(artist_name, levels = row_order)
  ) %>%
  filter(!artist_name %in% top_artist) %>%
  ggplot(aes(x = year, y = artist_name, fill = p_tracks)) +
  geom_tile(position = "identity") +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(artist_name),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_matrix

# Save plot
if (save_svg) {
  ggsave(
    "plot_top_artist_matrix.svg",
    plot = plot_top_artist_matrix,
    width = 8, height = 15,
    path = "output_files/"
  )
}
```

### Top tracks

#### Most listened

```{r}
# Calculate total play time and count for each track
top_track <- stream %>%
  group_by(track_name, artist_name) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60), # Convert ms to minutes
    n_tracks = n()
  ) %>%
  arrange(desc(n_tracks)) %>%
  ungroup()

# Display table
top_track[1:10, ] %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = `n_tracks`
  ) %>%
  kable(caption = "Top 10 tracks")
```

```{r}
#| fig.height: 10
# Calculate statistics by track and year
by_track_year <- stream %>%
  group_by(track_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n()
  )

# Get top 30 tracks by year
top_track_year <- by_track_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 30) %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  ) %>%
  select(track_name, n_tracks_all, min_played_all) %>%
  distinct()

# Get top 30 tracks overall
top_track_all <- by_track_year %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  ) %>%
  slice_max(n_tracks_all, n = 30) %>%
  select(track_name, n_tracks_all, min_played_all) %>%
  bind_rows(top_track_year) %>%
  distinct()

# Create heatmap visualization
plot_track_year <- by_track_year %>%
  right_join(top_track_all, by = "track_name") %>%
  arrange(n_tracks) %>%
  ggplot(aes(x = year, y = reorder(track_name, -n_tracks_all), fill = n_tracks)) +
  geom_tile() +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(track_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Times listened to most played tracks") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_track_year

# Save plot
if (save_svg) {
  ggsave(
    "plot_track_year.svg",
    plot = plot_track_year,
    width = 7, height = 9,
    path = "output_files/"
  )
}
```

#### Manually selected tracks

```{r}
# Filter and summarize selected tracks
by_track_select <- stream %>%
  # Get only selected tracks that have a name
  filter(Start == "Selected", !is.na(track_name)) %>%
  group_by(track_name, artist_name) %>%
  summarise(
    n_selected = n()
  ) %>%
  ungroup() %>%
  # Join with top tracks data
  left_join(top_track, by = c("track_name", "artist_name")) %>%
  # Calculate percentage of times track was selected
  mutate(
    p_selected = n_selected / n_tracks
  ) %>%
  # Get top 20 most selected tracks
  slice_max(n_selected, n = 20) %>%
  arrange(desc(n_selected))

# Display table
by_track_select %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `Times Selected` = n_selected,
    `% Selected` = paste0(round(p_selected, 4) * 100, "%"),
    `N tracks` = `n_tracks`
  ) %>%
  kable(caption = "Top selected tracks")
```

#### In loop

Times played in one day

```{r}
# Display table
by_top_track_day[1:10, ] %>%
  transmute(
    Date = day_date,
    Track = track_name,
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top 10 songs by times played in one day")
```

```{r}
# Find tracks that were played more than 5 times in a day
# and count how many days this happened
by_top_track_loop <- by_top_track_day %>%
  filter(n_tracks > 5) %>%
  group_by(track_name) %>%
  summarise(
    n_days = n()
  ) %>%
  ungroup() %>%
  arrange(desc(n_days)) %>%
  filter(n_days > 2, !is.na(track_name))

# Display table
by_top_track_loop %>%
  transmute(
    `Track` = track_name,
    `Days in loop (times played>5)` = n_days
  ) %>%
  kable(caption = "Tracks played more than 5 times several days")
```

### Guilty pleasures (incognito mode)

#### Per year

```{r}
# Create a plot showing proportion of tracks listened in incognito mode by year
plot_incognito <- stream %>%
  group_by(year = as.factor(year(date))) %>%
  summarise(`Proportion of tracks` = mean(incognito_mode)) %>%
  ggplot(aes(x = year, y = `Proportion of tracks`)) +
  geom_col(fill = pal[1]) +
  theme_minimal() +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  ggtitle("Proportion of tracks listened in incognito mode") +
  ylab("")

# Display plot
plot_incognito

# Save plot
if (save_svg) {
  ggsave("plot_incognito.svg", plot = plot_incognito, path = "output_files/")
}
```

#### Top incognito tracks

```{r, include=FALSE}
# Get tracks most frequently listened to in incognito mode
# This was trickier than expected, I ended up:
# - Filtering for tracks played in incognito mode more than 5 times
# - Calculating a score as (proportion in incognito) * (number of incognito plays)
# - Slicing top 10 tracks by score
by_incognito_track <- stream %>%
  group_by(track_name, artist_name) %>%
  summarise(
    p_incognito = mean(incognito_mode, na.rm = TRUE),
    n_incognito = sum(incognito_mode, na.rm = TRUE),
    score = p_incognito * n_incognito,
    n_tracks = n()
  ) %>%
  ungroup() %>%
  filter(n_incognito > 5) %>%
  slice_max(score, n = 10)

# Display table
by_incognito_track %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `Times in incognito` = n_incognito,
    `% Incognito` = paste0(round(p_incognito, 4) * 100, "%"),
    `Incognito score` = round(score, 2),
    `N tracks` = `n_tracks`
  ) %>%
  kable(caption = "Top tracks in incognito mode")
```

## Playlists and Library tracks

### Set up

```{r}
# Read and parse playlist data from JSON file
playlist <- fromJSON("input_files/Playlist1.json", flatten = TRUE)[[1]]

# Extract the tracks of each playlist and stack them in one data frame.
# Building a list first avoids growing a data frame inside a loop, and
# seq_len() is safe when there are no playlists at all.
playlist_tracks <- bind_rows(
  lapply(seq_len(nrow(playlist)), function(i) {
    playlist_tracks_i <- playlist$items[[i]]
    # Playlists without tracks have nothing to contribute
    if (NROW(playlist_tracks_i) == 0) {
      return(NULL)
    }
    playlist_tracks_i$playlist_name <- playlist$name[i]
    playlist_tracks_i
  })
)

# Read library data from JSON file
# (named user_library so it does not shadow base::library())
user_library <- fromJSON("input_files/YourLibrary.json", flatten = TRUE)[[1]]
```

### User playlists

```{r}
# Process playlist tracks data
playlist_tracks <- playlist_tracks %>%
  mutate(
    added_date = as_datetime(addedDate, tz = "UTC"),
    artist_name = track.artistName,
    track_name = track.trackName,
    album_name = track.albumName
  )

# Get most common tracks in playlists
playlist_top_tracks <- playlist_tracks %>%
  group_by(track_name, artist_name) %>%
  summarise(n_playlists = n()) %>%
  arrange(desc(n_playlists)) %>%
  ungroup()

# Display table
playlist_top_tracks[1:10, ] %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Tracks included in more playlists")
```

```{r}
# Get most common artists in playlists
playlist_top_artist <- playlist_tracks %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    n_playlists = n_distinct(playlist_name)
  ) %>%
  arrange(desc(n_playlists)) %>%
  ungroup()

# Display table
playlist_top_artist[1:10, ] %>%
  transmute(
    `Artist` = artist_name,
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Artists included in more playlists")
```

### User library

```{r}
# Get most common artists in library (saved songs)
library_tracks <- user_library %>%
  mutate(
    artist_name = artist,
    track_name = track,
    album_name = album
  )

library_top_artist <- library_tracks %>%
  group_by(artist_name) %>%
  summarise(n_songs = n_distinct(track_name)) %>%
  arrange(desc(n_songs)) %>%
  ungroup()

# Display table
library_top_artist[1:10, ] %>%
  transmute(
    `Artist` = artist_name,
    `N songs` = n_songs
  ) %>%
  kable(caption = "Artists with more songs in library")
```

### Saved in playlist or library

```{r}
# Combine tracks from playlists and library
saved_tracks <- unique(
  c(playlist_top_tracks$track_name, library_tracks$track_name)
)

# Combine artists from playlists and library
saved_artist <- unique(
  c(playlist_top_artist$artist_name, library_top_artist$artist_name)
)

# Plot proportion of tracks in playlists by year
plot_saved_track <- stream %>%
  mutate(`In my playlists` = ifelse(track_name %in% saved_tracks, "Yes", "No")) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  ggtitle("Were the tracks (streaming history) in the current playlists?") +
  ylab("")

# Display plot
plot_saved_track

# Save plot
if (save_svg) {
  ggsave("plot_saved_track.svg", plot = plot_saved_track, path = "output_files/")
}
```

```{r}
# Plot proportion of artists in playlists by year
plot_saved_artist <- stream %>%
  mutate(`In my playlists` = ifelse(artist_name %in% saved_artist, "Yes", "No")) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  ggtitle("Were the artists (streaming history) in the current playlists?") +
  ylab("")

# Display plot
plot_saved_artist

# Save plot
if (save_svg) {
  ggsave("plot_saved_artist.svg", plot = plot_saved_artist, path = "output_files/")
}
```