Spotify Stream History Analysis

Spotify Stream History Analysis

Author

Natalia Ciria

Published

December 29, 2024

Set-up

Code
# ---- Set-up parameters ----
# First year kept by the analyses that exclude older data
threshold_year <- 2019
# Artist excluded from (or highlighted in) some analyses
top_artist <- "Taylor Swift"
# Export the processed stream data frame as csv?
save_csv <- TRUE
# Export the ggplots as svg files?
save_svg <- TRUE

# ---- Required libraries ----
library(jsonlite)  # Working with JSON data
library(dplyr)     # Data transformation
library(tidyr)     # Data cleaning
library(lubridate) # Handling dates and times
library(ggplot2)   # Plot graphs
library(knitr)     # Report formatting
if (save_svg) {
  library(svglite) # Create SVG files (only needed when exporting plots)
}

# ---- Colour palette ----
pal <- c(
  "#3abdaa", "#7b2458", "#facd00", "#41658a",
  "#e63946", "#b2a3b5", "#264653"
)

Data preparation

Code
# Find json streaming history files
stream_files <- list.files("input_files/", pattern = "Streaming_History_Audio")

# Fail early with a clear message instead of an obscure fromJSON() error
if (length(stream_files) == 0) {
  stop(
    "No 'Streaming_History_Audio' files found in input_files/",
    call. = FALSE
  )
}

# Merge all streaming history files into a data frame:
# read and parse the first JSON file, then merge the remaining ones into it.
stream <- fromJSON(paste0("input_files/", stream_files[1]), flatten = TRUE)

# seq_along(...)[-1] is empty when there is a single file, whereas
# 2:length(stream_files) would count down (2, 1) and try to read a file
# that does not exist.
for (i in seq_along(stream_files)[-1]) {
  stream_i <- fromJSON(paste0("input_files/", stream_files[i]), flatten = TRUE)
  stream <- merge(stream, stream_i, all = TRUE)
}
Code
# Process Spotify streaming data: add a parsed timestamp, shorter metadata
# column names and human-readable categories (original columns are kept).
stream <- stream %>%
  # Timestamp as date-time (UTC) and short aliases for the metadata columns
  mutate(
    date = as_datetime(ts, tz = "UTC"),
    artist_name = master_metadata_album_artist_name,
    track_name = master_metadata_track_name,
    album_name = master_metadata_album_album_name
  ) %>%
  # Device type from the platform string; platforms matching neither pattern
  # stay NA
  mutate(
    Device = case_when(
      grepl("Android-tablet|Android OS|android|android_tv", platform) ~ "Phone",
      grepl("public_js|web_player|WebPlayer|chrome|Windows 10|windows", platform) ~ "Computer"
    )
  ) %>%
  # Reasons why a track ended / started, anything unlisted becomes "Other"
  mutate(
    End = case_when(
      reason_end %in% c("trackdone", "endplay") ~ "Track finished",
      reason_end == "logout" ~ "Spotify closed",
      reason_end == "playbtn" ~ "Play button",
      reason_end == "fwdbtn" ~ "Forward button",
      reason_end == "backbtn" ~ "Backward button",
      .default = "Other"
    ),
    Start = case_when(
      reason_start %in% c("clickrow", "click-row") ~ "Selected",
      reason_start == "trackdone" ~ "Track finished",
      reason_start == "persisted" ~ "Persisted",
      reason_start == "playbtn" ~ "Play button",
      reason_start == "fwdbtn" ~ "Forward button",
      reason_start == "backbtn" ~ "Backward button",
      .default = "Other"
    )
  )

# Export the processed data when save_csv is TRUE
if (save_csv) {
  write.csv(stream, "output_files/stream.csv")
}

Annual Spotify hours

Total

Code
# Calculate total listening hours and counts by year
by_year <- stream %>%
  group_by(year = as.factor(year(date))) %>%
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60, # ms -> hours
    n_tracks = n(),                             # streams (rows) in the year
    # Distinct track names. The previous n_distinct(n_tracks) counted the
    # distinct values of a single number, so it always returned 1.
    n_songs = n_distinct(track_name),
    n_artists = n_distinct(artist_name)
  )

# Display table
by_year %>%
  transmute(
    Year = year,
    `Hours played` = round(h_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N artists` = n_artists
  ) %>%
  kable(caption = "Spotify listened by year")
Spotify listened by year
Year Hours played N tracks N songs N artists
2014 6.20 736 1 455
2015 32.38 3682 1 1576
2016 81.11 2717 1 1218
2017 212.79 4882 1 1338
2018 363.37 9056 1 1820
2019 381.43 7731 1 1564
2020 670.41 13860 1 2519
2021 838.28 19114 1 2530
2022 808.95 24118 1 3197
2023 1161.05 29850 1 2690
2024 1141.30 31826 1 3102

By device

Code
# Hours listened per device type and year (ms_played converted to hours)
by_device <- stream %>%
  group_by(Device, year = as.factor(year(date))) %>%
  summarise(h_played = sum(ms_played) / 1000 / 60 / 60)

# Stacked bars: one bar per year, split by device.
# Devices that were not categorised (NA) are drawn in grey.
plot_device <- ggplot(by_device, aes(x = year, y = h_played, fill = Device)) +
  geom_col(position = position_stack(reverse = TRUE)) +
  scale_fill_manual(values = pal, na.value = "grey80") +
  theme_minimal() +
  labs(title = "Hours listening to Spotify", y = "")

# Display plot
plot_device

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave("plot_device.svg", plot = plot_device, path = "output_files/")
}

Shuffle mode

Code
# Share of tracks played with / without shuffle, per year
# (only years from threshold_year onwards)
by_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  group_by(year = as.factor(year(date)), shuffle) %>%
  summarise(n = n()) %>%
  # Still grouped by year here, so sum(n) is the yearly total
  mutate(
    `Proportion of tracks` = n / sum(n),
    Shuffle = ifelse(shuffle, "Yes", "No")
  )

# Stacked bar chart of the yearly proportions
plot_shuffle <- ggplot(
  by_shuffle,
  aes(x = year, y = `Proportion of tracks`, fill = Shuffle)
) +
  geom_col() +
  scale_fill_manual(values = pal, na.value = "grey80") +
  # Show the y-axis as percentages
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(title = "Proportion of tracks listened in shuffle mode", y = "")

# Display plot
plot_shuffle

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave("plot_shuffle.svg", plot = plot_shuffle, path = "output_files/")
}

Track start and end

Code
# Build two named palettes (pal_end, pal_start) in which a reason that exists
# both as start and as end reason gets the same colour in both plots.

# Distinct end / start reasons, in reverse alphabetical order
names_end <- sort(unique(stream$End), decreasing = TRUE)
names_start <- sort(unique(stream$Start), decreasing = TRUE)

# End palette: the first colours of pal, named by end reason
pal_end <- setNames(pal[1:length(names_end)], names_end)

# Colours of the reasons shared by both palettes
pal_start_end <- pal_end[names_end %in% names_start]

# Start-only reasons take the colours that the shared reasons do not use
pal_start <- setNames(
  pal[!(pal %in% pal_start_end)],
  names_start[!(names_start %in% names_end)]
)

# Start palette = shared colours followed by the start-only colours
pal_start <- c(pal_start_end, pal_start)
Code
# Number of tracks by year, start reason and shuffle mode
# (only years from threshold_year onwards)
by_reason_start_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")) %>%
  group_by(year = as.factor(year(date)), Start, Shuffle) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))

# Stacked bars scaled to 100% showing how tracks started,
# one panel per shuffle mode
plot_reason_start_shuffle <- ggplot(
  by_reason_start_shuffle,
  aes(x = year, y = n, fill = Start)
) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) +
  scale_fill_manual(values = pal_start) +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  theme(legend.title = element_blank()) +
  labs(title = "How tracks started", y = "Proportion of tracks")

# Display plot
plot_reason_start_shuffle

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave(
    "plot_reason_start_shuffle.svg",
    plot = plot_reason_start_shuffle,
    path = "output_files/"
  )
}
Code
# Number of tracks by year, end reason and shuffle mode
# (only years from threshold_year onwards)
by_reason_end_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")) %>%
  group_by(year = as.factor(year(date)), End, Shuffle) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))

# Stacked bars scaled to 100% showing how tracks ended,
# one panel per shuffle mode
plot_reason_end_shuffle <- ggplot(
  by_reason_end_shuffle,
  aes(x = year, y = n, fill = End)
) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) +
  scale_fill_manual(values = pal_end) +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  theme(legend.title = element_blank()) +
  labs(title = "How tracks ended", y = "Proportion of tracks")

# Display plot
plot_reason_end_shuffle

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave(
    "plot_reason_end_shuffle.svg",
    plot = plot_reason_end_shuffle,
    path = "output_files/"
  )
}

Listening peaks

Hours per day calendar

Code
# One row per calendar day, from 1 January of the first streamed year to
# 31 December of the last one, numbered consecutively in day_n
all_days <- data.frame(
  day_date = seq(
    from = as.Date(floor_date(min(stream$date), unit = "year")),
    to = as.Date(ceiling_date(max(stream$date), unit = "year") - 1),
    by = "days"
  )
) %>%
  mutate(day_n = row_number())

# Per year: offset of the first day within its week (Monday = 0 ... Sunday = 6)
# and the number of days in the year
first_week_day <- all_days %>%
  mutate(
    weekday = as.POSIXlt(day_date)$wday,          # 0 = Sunday ... 6 = Saturday
    weekday = ifelse(weekday == 0, 7, weekday),   # make Sunday the 7th day
    year = year(day_date)
  ) %>%
  group_by(year) %>%
  summarize(
    first_week_day = first(weekday) - 1,
    days_year = n()
  )

# Process streaming data by day. Days without any stream are kept (through the
# right join with all_days) so that the calendar has no holes.
by_day_year <- stream %>%
  mutate(day_date = as_date(date)) %>%
  right_join(all_days, by = "day_date") %>%
  group_by(day_date, day_n) %>%
  # Calculate daily metrics
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60,  # Convert ms to hours
    # Days without streams contribute one all-NA row from the join, which
    # n() would count as a track; count only rows that carry a stream.
    n_tracks = sum(!is.na(ms_played)),
    computer = mean(Device == "Computer", na.rm = TRUE),
    shuffle = mean(shuffle, na.rm = TRUE),
    incognito_mode = mean(incognito_mode, na.rm = TRUE),
    offline = mean(offline, na.rm = TRUE),
    # Hours of the top artist, from the per-track ms_played. The previous
    # version summed h_played, which at this point is already the daily total,
    # so it returned (number of top-artist tracks) x (hours of the day).
    h_top_artist = sum(
      ifelse(grepl(top_artist, artist_name), ms_played, 0),
      na.rm = TRUE
    ) / 1000 / 60 / 60
  ) %>%
  # Add calendar columns
  mutate(
    h_played = ifelse(is.na(h_played), 0, h_played), # no streams -> 0 hours
    year = year(day_date),
    month = month(day_date),
    week = week(day_date),
    day_year = yday(day_date),
    weekday = as.POSIXlt(day_date)$wday,             # 0 = Sunday
    # Monday-first week: Sunday becomes day 7
    weekday = factor(ifelse(weekday == 0, 7, weekday), levels = c(1:7))
  ) %>%
  left_join(first_week_day, by = "year") %>%
  mutate(
    # Week row of the day within its year, shifted by the weekday offset of
    # 1 January
    calendar_row = ceiling((first_week_day + day_year) / 7)
  )

# Calendar heatmap: one tile per day, columns = years, rows = months,
# colour = hours listened that day
plot_day_year <- ggplot(
  by_day_year,
  aes(x = weekday, y = -calendar_row, fill = h_played)
) +
  geom_tile() +
  facet_grid(rows = vars(month), cols = vars(year), scales = "free") +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  theme_void() +
  theme(legend.title = element_blank()) +
  labs(title = "Hours listened per day", y = "")

# Display plot
plot_day_year

Code
# Save plot if save_svg is TRUE. The plot object is passed explicitly so the
# export does not depend on which plot was displayed last (ggsave() defaults
# to last_plot()).
if (save_svg) {
  ggsave(
    "plot_day_year.svg",
    plot = plot_day_year,
    width = 7,
    height = 8,
    path = "output_files/"
  )
}

Other calendars

Code
# Other calendars - not included in the automatic report

# Calendar of the daily share of tracks played on the computer
ggplot(by_day_year, aes(x = weekday, y = -calendar_row, fill = computer)) +
  geom_tile() +
  facet_grid(rows = vars(month), cols = vars(year), scales = "free") +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  theme_void() +
  labs(title = "% of tracks listened on the computer per day", y = "")
Code
# Calendar of the daily share of tracks played in incognito mode
ggplot(by_day_year, aes(x = weekday, y = -calendar_row, fill = incognito_mode)) +
  geom_tile() +
  facet_grid(rows = vars(month), cols = vars(year), scales = "free") +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  theme_void() +
  labs(title = "% of tracks listened in incognito mode", y = "")
Code
# Calendar of the daily share of tracks played offline
ggplot(by_day_year, aes(x = weekday, y = -calendar_row, fill = offline)) +
  geom_tile() +
  facet_grid(rows = vars(month), cols = vars(year), scales = "free") +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  theme_void() +
  labs(title = "% of tracks listened offline", y = "")
Code
# Calendar of the daily h_top_artist value (listening to top_artist)
ggplot(by_day_year, aes(x = weekday, y = -calendar_row, fill = h_top_artist)) +
  geom_tile() +
  facet_grid(rows = vars(month), cols = vars(year), scales = "free") +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  theme_void() +
  labs(title = paste0("Hours listened to ",top_artist," per day"), y = "")

Top days

Code
# How many times each track name was played on each day,
# most repeated track/day combinations first
by_top_track_day <- stream %>%
  mutate(day_date = date(stream$date)) %>%
  group_by(track_name, day_date) %>%
  summarise(n_tracks = n()) %>%
  ungroup() %>%
  arrange(desc(n_tracks))


# The 10 days with most hours played, with the number of tracks and the share
# of tracks played on the computer / in shuffle mode
by_top_day <- stream %>%
  mutate(day_date = date(stream$date)) %>%
  group_by(day_date) %>%
  summarise(
    n_tracks_all = n(),
    computer = mean(Device == "Computer"),
    shuffle = mean(shuffle),
    h_played = sum(ms_played) / 1000 / 60 / 60
  ) %>%
  ungroup() %>%
  arrange(desc(h_played)) %>%
  slice_max(h_played, n = 10)

# Add mult_times: among the distinct track names played on each top day,
# the share that was played more than once that day
by_top_day <- by_top_day %>%
  left_join(by_top_track_day) %>%
  group_by(day_date) %>%
  summarize(mult_times = mean(n_tracks > 1)) %>%
  # Bring the daily metrics back and put the main columns first
  left_join(by_top_day) %>%
  arrange(desc(h_played)) %>%
  relocate(day_date, h_played, n_tracks = n_tracks_all)

# Display table
by_top_day %>%
  transmute(
    Date = day_date,
    Hours = round(h_played, 2),
    `N tracks` = n_tracks,
    `% tracks replayed` = paste0(round(mult_times, 4) * 100, "%"),
    `% tracks on the computer` = paste0(round(computer, 4) * 100, "%"),
    # Uses the shuffle share; this column previously repeated `computer`
    `% tracks on shuffle` = paste0(round(shuffle, 4) * 100, "%")
  ) %>%
  kable(caption = "Top 10 days by listening hours")
Top 10 days by listening hours
Date Hours N tracks % tracks replayed % tracks on the computer % tracks on shuffle
2021-03-11 15.59 223 33.94% 100% 100%
2019-02-03 13.76 276 40.1% 100% 100%
2021-03-23 12.27 199 8.74% 100% 100%
2021-01-03 11.83 264 38.1% 100% 100%
2019-01-06 11.68 223 49.65% 99.55% 99.55%
2020-06-15 11.59 209 4.52% 100% 100%
2021-10-09 11.43 285 4.01% 95.44% 95.44%
2019-06-23 11.43 191 37.78% 64.4% 64.4%
2024-09-19 11.26 279 2.96% 72.04% 72.04%
2024-10-30 11.05 227 8.02% 72.25% 72.25%

Minutes listened by hour of the day

Code
# Create a dataframe with all possible hours for each day.
# lubridate::hour() returns 0-23, so the grid must use 0:23: with 1:24 the
# streams of the midnight hour (0) were dropped by the right join below and a
# non-existent hour 24 was always empty.
all_hours <- all_days %>% expand(day_date, hour = 0:23)

# Calculate average listening time by hour and year
by_hour_year <- stream %>%
  mutate(
    day_date = as_date(date),
    hour = hour(date)
  ) %>%
  # Keep every day/hour combination, also those without streams
  right_join(all_hours, by = c("day_date", "hour")) %>%
  group_by(day_date, hour) %>%
  mutate(ms_played = ifelse(is.na(ms_played), 0, ms_played)) %>%
  # Minutes played in each hour of each day ...
  summarise(min_played = sum(ms_played / 1000 / 60)) %>%
  # ... averaged over the days of the year
  group_by(year = year(day_date), hour) %>%
  summarise(min_played = mean(min_played))

# Create visualization of average listening time by hour and year
plot_hour_year <- by_hour_year %>%
  ggplot(aes(x = year, y = hour, fill = min_played)) +
  geom_tile() +
  theme_void() +
  facet_grid(cols = vars(year), rows = vars(hour), scales = "free") +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Minutes listened per hour of the day (on average)") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_hour_year

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave(
    "plot_hour_year.svg",
    plot = plot_hour_year,
    width = 7,
    height = 5,
    path = "output_files/"
  )
}

What have I listened to in Spotify?

Top artists

Code
# Calculate statistics by year (denominators for the yearly shares below)
by_year <- stream %>%
  group_by(year = year(date)) %>%
  summarise(
    min_played_year = sum(ms_played) / 1000 / 60,
    n_tracks_year = n(),
    n_songs_year = n_distinct(track_name)
  )

# Calculate statistics by artist and year
by_artist_year <- stream %>%
  group_by(artist_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n(),
    n_songs = n_distinct(track_name)
    # (the shares are computed after splitting the artists, see below; the
    # former p_tracks = n()/sum(n_tracks) here was always 1 and never used)
  ) %>%
  # Split multiple artists into separate rows (up to 5 names per entry)
  separate_wider_delim(
    artist_name,
    ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  left_join(by_year, by = "year") %>%
  filter(!is.na(artist_name)) %>%
  group_by(artist_name, year) %>%
  summarise(
    min_played = sum(min_played),
    n_tracks = sum(n_tracks),
    n_songs = sum(n_songs),
    # The yearly totals are constant within a year, so take a single value.
    # Dividing by the whole column returned one row per input row whenever an
    # artist appeared in several entries of the same year (duplicated rows).
    p_tracks = n_tracks / first(n_tracks_year),
    p_min_played = min_played / first(min_played_year),
    p_songs = n_songs / first(n_songs_year)
  ) %>%
  arrange(year, desc(n_tracks))

# Display the 3 most played artists of each year (ties are kept)
by_artist_year %>%
  group_by(Year = year) %>%
  slice_max(n_tracks, n = 3) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `% of the annual time` = paste0(round(p_min_played, 4) * 100, "%"),
    `% of the annual tracks` = paste0(round(p_tracks, 4) * 100, "%"),
    `% of the annual songs` = paste0(round(p_songs, 4) * 100, "%")
  ) %>%
  kable(caption = "Top 3 artists by year")
Top 3 artists by year
Year Artist Minutes played N tracks N songs % of the annual time % of the annual tracks % of the annual songs
2014 Glee Cast 2.78 16 16 0.75% 2.17% 2.59%
2014 Queen 3.51 13 8 0.94% 1.77% 1.29%
2014 Katy Perry 3.47 9 9 0.93% 1.22% 1.46%
2014 Muse 3.08 9 5 0.83% 1.22% 0.81%
2015 Maroon 5 18.71 26 8 0.96% 0.71% 0.35%
2015 Adele 23.30 25 8 1.2% 0.68% 0.35%
2015 Berlin 1.39 25 23 0.07% 0.68% 1.01%
2016 Video Game Players 37.51 55 32 0.77% 2.02% 1.81%
2016 Rihanna 59.91 40 15 1.23% 1.47% 0.85%
2016 P!nk 122.44 39 21 2.52% 1.44% 1.19%
2017 Beyoncé 256.46 81 11 2.01% 1.66% 0.51%
2017 Katy Perry 201.12 71 12 1.58% 1.45% 0.56%
2017 Fifth Harmony 199.88 67 8 1.57% 1.37% 0.37%
2018 Nate Fifield 146.91 139 30 0.67% 1.53% 1%
2018 Britney Spears 273.43 120 7 1.25% 1.33% 0.23%
2018 Katy Perry 229.32 99 11 1.05% 1.09% 0.37%
2019 Taylor Swift 530.63 182 32 2.32% 2.35% 1.16%
2019 Katy Perry 239.93 72 16 1.05% 0.93% 0.58%
2019 Belinda Carlisle 281.75 70 3 1.23% 0.91% 0.11%
2020 Taylor Swift 1519.54 499 89 3.78% 3.6% 1.86%
2020 Lady Gaga 334.64 106 21 0.83% 0.76% 0.44%
2020 Katy Perry 307.91 100 11 0.77% 0.72% 0.23%
2021 Taylor Swift 2882.50 916 181 5.73% 4.79% 3.23%
2021 La Oreja de Van Gogh 1088.05 385 57 2.16% 2.01% 1.02%
2021 Lofi Fruits Music 594.88 355 146 1.18% 1.86% 2.61%
2022 Taylor Swift 3025.07 1140 175 6.23% 4.73% 2.88%
2022 La Oreja de Van Gogh 758.02 315 48 1.56% 1.31% 0.79%
2022 Britney Spears 561.11 234 16 1.16% 0.97% 0.26%
2023 Taylor Swift 5558.48 1974 213 7.98% 6.61% 3.71%
2023 Britney Spears 1197.98 446 30 1.72% 1.49% 0.52%
2023 Miley Cyrus 1116.70 415 35 1.6% 1.39% 0.61%
2024 Chappell Roan 1584.05 605 17 2.31% 1.9% 0.25%
2024 Taylor Swift 1469.42 605 112 2.15% 1.9% 1.65%
2024 Charli xcx 822.99 420 47 1.2% 1.32% 0.69%
Code
# Take the 40 artist/year rows with most plays (over all years together, ties
# kept) and add up their figures per artist
top_artist_year <- by_artist_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 40) %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played),
    n_songs_all = sum(n_songs)
  ) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all) %>%
  distinct()

# Calculate statistics by artist over the whole streaming history
by_artist_all <- stream %>%
  # Split multiple artists into separate rows (up to 5 names per entry)
  separate_wider_delim(
    artist_name,
    ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  group_by(artist_name) %>%
  summarise(
    min_played_all = sum(ms_played / 1000 / 60),
    n_tracks_all = n(),
    n_songs_all = n_distinct(track_name),
    # Share of all streams, mirroring p_tracks = n_tracks / n_tracks_year in
    # the yearly table. The previous n()/sum(n()) divided the group size by
    # itself and was therefore always 1.
    p_tracks_all = n() / nrow(stream)
  ) %>%
  # Empty artist slots created by the split (and streams without artist)
  filter(!is.na(artist_name)) %>%
  arrange(desc(n_tracks_all))

# The 40 artists with most plays overall (ties kept)
top_artist_all <- by_artist_all %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all)


# Display the first 10 of them as a table
kable(
  transmute(
    top_artist_all[1:10, ],
    Artist = artist_name,
    `Minutes played` = round(min_played_all, 2),
    `N tracks` = n_tracks_all,
    `N songs` = n_songs_all
  ),
  caption = "Top 10 artists (all years)"
)
Top 10 artists (all years)
Artist Minutes played N tracks N songs
Taylor Swift 15132.69 5376 327
Britney Spears 3653.19 1422 54
Lady Gaga 3033.80 1148 60
La Oreja de Van Gogh 2934.66 1139 84
Miley Cyrus 2867.66 1103 60
Rihanna 2829.78 955 46
Beyoncé 2423.74 907 82
Katy Perry 2320.41 896 39
Shakira 1929.30 871 54
Charli xcx 1720.10 830 58
Code
# Names of the artists that are in either of the two top lists
# (top_artist_year or top_artist_all)
top_artist_global <- unique(
  c(top_artist_year$artist_name, top_artist_all$artist_name)
)


# Yearly figures of those artists, with their overall figures joined in,
# most played artist/year rows first
by_top_artist <- by_artist_year %>%
  filter(artist_name %in% top_artist_global) %>%
  left_join(by_artist_all) %>%
  arrange(desc(n_tracks))

Global - excluding 1 artist

Code
# Heatmap of plays per artist (rows, ordered by overall plays) and year
# (columns), leaving out top_artist
plot_top_artist_year_filter <- by_top_artist %>%
  filter(artist_name != top_artist) %>%
  ggplot(
    aes(
      x = year,
      y = reorder(artist_name, -n_tracks_all),
      fill = n_tracks
    )
  ) +
  geom_tile() +
  theme_void() +
  facet_grid(
    rows = vars(reorder(artist_name, -n_tracks_all)),
    cols = vars(year),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  labs(
    title = paste0("Times listened to most played artists (excluding ", top_artist,")")
  ) +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_year_filter

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave(
    "plot_top_artist_year_filter.svg",
    plot = plot_top_artist_year_filter,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}

Global

Code
# Heatmap of plays per artist (rows, ordered by overall plays) and year
# (columns); top_artist is included here
plot_top_artist_year <- ggplot(
  by_top_artist,
  aes(
    x = year,
    y = reorder(artist_name, -n_tracks_all),
    fill = n_tracks
  )
) +
  geom_tile() +
  theme_void() +
  facet_grid(
    rows = vars(reorder(artist_name, -n_tracks_all)),
    cols = vars(year),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  labs(title = "Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_year

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave(
    "plot_top_artist_year.svg",
    plot = plot_top_artist_year,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}

Global + Top 10 by year

Code
# Names of the artists that were among the 10 most played of any year
# (ties kept)
top_artist_year_partial <- by_artist_year %>%
  group_by(year) %>%
  slice_max(n_tracks, n = 10) %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  select(artist_name) %>%
  distinct()

# Final artist list: the 40 most played artists overall, plus the artists in
# top_artist_year, plus the yearly top 10 artists (each name once)
top_artist_all <- by_artist_year %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, n_tracks_all) %>%
  bind_rows(top_artist_year, top_artist_year_partial) %>%
  select(artist_name) %>%
  distinct()
Code
# Yearly share of tracks (p_tracks) for the artists in top_artist_all,
# sorted by year and, within a year, from larger to smaller share
by_top_artist_matrix <- by_artist_year %>%
  right_join(top_artist_all) %>%
  select(artist_name, year, p_tracks) %>%
  arrange(year, desc(p_tracks))

# Wide version (one row per artist, one column per year).
# It is only used to read off the artist and year order for the plot.
matrix <- tidyr::pivot_wider(
  by_top_artist_matrix,
  id_cols = artist_name,
  names_from = year,
  values_from = p_tracks
)

# Ordering for the visualization: artists as they appear in the wide table,
# years as its column names (first column is artist_name)
row_order <- matrix$artist_name
col_order <- names(matrix)[-1]

# Heatmap of p_tracks per artist and year, leaving out top_artist
plot_top_artist_matrix <- by_top_artist_matrix %>%
  mutate(
    n_row = row_number(),
    year = factor(year, levels = col_order),
    artist_name = factor(artist_name, levels = row_order)
  ) %>%
  filter(!(artist_name %in% top_artist)) %>%
  ggplot(aes(x = year, y = artist_name, fill = p_tracks)) +
  geom_tile(position = "identity") +
  theme_void() +
  facet_grid(
    rows = vars(artist_name),
    cols = vars(year),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  labs(title = "Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_matrix

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave(
    "plot_top_artist_matrix.svg",
    plot = plot_top_artist_matrix,
    width = 8,
    height = 15,
    path = "output_files/"
  )
}

Top tracks

Most listened

Code
# Minutes played and number of plays for every track/artist pair,
# most played first
top_track <- stream %>%
  group_by(track_name, artist_name) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60), # ms -> minutes
    n_tracks = n()
  ) %>%
  arrange(desc(n_tracks)) %>%
  ungroup()

# Display the first 10 rows as a table
kable(
  transmute(
    top_track[1:10, ],
    Track = track_name,
    Artist = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks
  ),
  caption = "Top 10 tracks"
)
Top 10 tracks
Track Artist Minutes played N tracks
I Kissed A Girl Katy Perry 578.46 260
Party In The U.S.A. Miley Cyrus 647.46 260
Heaven Is A Place On Earth Belinda Carlisle 841.42 245
Toxic Britney Spears 584.73 244
S&M Rihanna 814.68 226
…Baby One More Time Britney Spears 522.48 194
Wannabe Spice Girls 373.32 183
TiK ToK Kesha 411.41 173
Red Wine Supernova Chappell Roan 430.19 164
If U Seek Amy Britney Spears 423.96 163
Code
# Minutes played and number of plays per track name and year
by_track_year <- stream %>%
  group_by(track_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n()
  )

# Take the 30 track/year rows with most plays (over all years together, ties
# kept) and add up their figures per track
top_track_year <- by_track_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 30) %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  ) %>%
  select(track_name, n_tracks_all, min_played_all) %>%
  distinct()

# The 30 tracks with most plays overall, followed by the rows of
# top_track_year (identical rows are kept only once)
top_track_all <- by_track_year %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  ) %>%
  slice_max(n_tracks_all, n = 30) %>%
  select(track_name, n_tracks_all, min_played_all) %>%
  bind_rows(top_track_year) %>%
  distinct()


# Heatmap of plays per track (rows, ordered by n_tracks_all) and year
# (columns), restricted to the tracks in top_track_all
plot_track_year <- by_track_year %>%
  right_join(top_track_all) %>%
  arrange(n_tracks) %>%
  ggplot(
    aes(
      x = year,
      y = reorder(track_name, -n_tracks_all),
      fill = n_tracks
    )
  ) +
  geom_tile() +
  theme_void() +
  facet_grid(
    rows = vars(reorder(track_name, -n_tracks_all)),
    cols = vars(year),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  labs(title = "Times listened to most played tracks") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_track_year

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave(
    "plot_track_year.svg",
    plot = plot_track_year,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}

Manually selected tracks

Code
# Tracks that were started by selecting them (Start == "Selected"):
# number of selections per track/artist and share over all their plays
by_track_select <- stream %>%
  # Only selected streams that have a track name
  filter(Start == "Selected", !is.na(track_name)) %>%
  group_by(track_name, artist_name) %>%
  summarise(n_selected = n()) %>%
  ungroup() %>%
  # Total plays (n_tracks) come from top_track
  left_join(top_track) %>%
  mutate(p_selected = n_selected / n_tracks) %>%
  # Keep the 20 most selected tracks (ties kept), most selected first
  slice_max(n_selected, n = 20) %>%
  arrange(desc(n_selected))

# Display table
kable(
  transmute(
    by_track_select,
    Track = track_name,
    Artist = artist_name,
    `Times Selected` = n_selected,
    `% Selected` = paste0(round(p_selected, 4) * 100, "%"),
    `N tracks` = n_tracks
  ),
  caption = "Top selected tracks"
)
Top selected tracks
Track Artist Times Selected % Selected N tracks
Tití Me Preguntó Bad Bunny 43 34.68% 124
Estrella Polar (with Juan Aguirre) Pereza 42 33.87% 124
No Sigue Modas Juan Magán 34 45.95% 74
Don’t Blame Me Taylor Swift 29 19.08% 152
Heaven Is A Place On Earth Belinda Carlisle 28 11.43% 245
Al Amanecer Los Fresones Rebeldes 27 22.69% 119
Lisa Young Miko 27 18.12% 149
Mi Reina Henry Mendez 27 21.09% 128
Nobody Like U 4*TOWN (From Disney and Pixar’s Turning Red) 27 45% 60
No Go Go Go! 89ers 25 34.25% 73
Red Wine Supernova Chappell Roan 25 15.24% 164
Right Round The Treblemakers 25 36.76% 68
Rosas Algo 25 21.37% 117
Sarà perché ti amo DJ Matrix 25 28.41% 88
Shape of You Ed Sheeran 25 29.76% 84
APRENDER A AMAR NATHY PELUSO 24 27.59% 87
Bad Karma (feat. Joan Jett) Miley Cyrus 24 28.57% 84
Boys Charli xcx 24 23.53% 102
Tick Tick Boom (feat. BygTwo3) Sage The Gemini 24 28.24% 85
Yo Solo Quiero Amor (From “Te Estoy Amando Locamente”) Rigoberta Bandini 24 31.17% 77

In loop

Times played in one day

Code
# Display the 10 track/day combinations with most plays
# (by_top_track_day is already sorted from most to least plays)
kable(
  transmute(
    by_top_track_day[1:10, ],
    Date = day_date,
    Track = track_name,
    `N tracks` = n_tracks
  ),
  caption = "Top 10 songs by times played in one day"
)
Top 10 songs by times played in one day
Date Track N tracks
2024-06-05 La Ladrona - Remasterizado 2009 23
2024-03-29 Little Boxes 23
2023-08-10 Yo Solo Quiero Amor (From “Te Estoy Amando Locamente”) 23
2023-08-10 Hits 2000’ - Mashup 21
2024-04-14 Chicken Little 17
2023-10-26 Rikiti 17
2023-03-09 Aunque Pene 16
2023-05-15 Goodbye Earl 16
2021-10-11 Yo Invito 16
2024-05-22 APRENDER A AMAR 15
Code
# For each track, count the days on which it was played more than 5 times;
# keep named tracks for which this happened on more than 2 days
by_top_track_loop <- by_top_track_day %>%
  filter(n_tracks > 5) %>%
  group_by(track_name) %>%
  summarise(n_days = n()) %>%
  ungroup() %>%
  arrange(desc(n_days)) %>%
  filter(n_days > 2, !is.na(track_name))

# Display table
kable(
  transmute(
    by_top_track_loop,
    Track = track_name,
    `Days in loop (times played>5)` = n_days
  ),
  caption = "Tracks played more than 5 times several days"
)
Tracks played more than 5 times several days
Track Days in loop (times played>5)
“Hetero” 4
APRENDER A AMAR 3
Chicken Little 3
Dios Está Aquí (Perreo Mix) 3
Like a Prayer - Live 3
Me quedaré solo 3
Niños furbito y niñas lo que sea 3
Sarà perché ti amo 3
Stupid With Love 3
Teenage Dirtbag 3
Welcome To New York (Taylor’s Version) 3

Guilty pleasures (incognito mode)

Per year

Code
# Bar chart of the yearly proportion of tracks played in incognito mode
plot_incognito <- stream %>%
  group_by(year = as.factor(year(date))) %>%
  summarise(`Proportion of tracks` = mean(incognito_mode)) %>%
  ggplot(aes(x = year, y = `Proportion of tracks`)) +
  geom_col(fill = pal[1]) +
  # Show the y-axis as percentages
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(title = "Proportion of tracks listened in incognito mode", y = "")

# Display plot
plot_incognito

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave("plot_incognito.svg", plot = plot_incognito, path = "output_files/")
}

Top incognito tracks

Playlists and Library tracks

Set up

Code
# Read and parse playlist data from JSON file
playlist <- fromJSON("input_files/Playlist1.json", flatten = TRUE)[[1]]

# Extract the tracks of each playlist, tagged with the playlist name, and
# stack them once at the end (instead of growing a data frame in a loop).
# seq_len() yields no iterations when there are no playlists, whereas
# 1:nrow(playlist) would iterate over c(1, 0).
playlist_tracks <- bind_rows(
  lapply(seq_len(nrow(playlist)), function(i) {
    playlist_tracks_i <- playlist$items[[i]]
    # Skip playlists without items: there is nothing to tag with a name
    if (NROW(playlist_tracks_i) == 0) {
      return(NULL)
    }
    playlist_tracks_i$playlist_name <- playlist$name[i]
    playlist_tracks_i
  })
)

# Read library data from JSON file.
# NOTE(review): this object shadows the name of base::library(); calls to
# library() still work because R skips non-function objects when looking up a
# function, but a different name would be clearer.
library <- fromJSON("input_files/YourLibrary.json", flatten = TRUE)[[1]]

User playlists

Code
# Parse the date a track was added (UTC) and add short aliases for the
# artist, track and album columns
playlist_tracks <- playlist_tracks %>%
  mutate(
    added_date = as_datetime(addedDate, tz = "UTC"),
    artist_name = track.artistName,
    track_name = track.trackName,
    album_name = track.albumName
  )

# Number of playlist entries per track/artist pair, most frequent first
playlist_top_tracks <- playlist_tracks %>%
  group_by(track_name, artist_name) %>%
  summarise(n_playlists = n()) %>%
  arrange(desc(n_playlists)) %>%
  ungroup()

# Display the first 10 rows as a table
kable(
  transmute(
    playlist_top_tracks[1:10, ],
    Track = track_name,
    Artist = artist_name,
    `N playlists` = n_playlists
  ),
  caption = "Tracks included in more playlists"
)
Tracks included in more playlists
Track Artist N playlists
Heaven Is A Place On Earth Belinda Carlisle 5
Rebel Rebel - 2016 Remaster David Bowie 5
Ateo C. Tangana 4
Barbie Girl Aqua 4
Call It Fate, Call It Karma The Strokes 4
Dulce y Bautizada Samantha Hudson 4
El Fin del Mundo La La Love You 4
I Follow Rivers - The Magician Remix Lykke Li 4
I Need A Dollar Aloe Blacc 4
Lisa Young Miko 4
Code
# Per artist: playlist entries, distinct track names and distinct playlists;
# artists present in more playlists first
playlist_top_artist <- playlist_tracks %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    n_playlists = n_distinct(playlist_name)
  ) %>%
  arrange(desc(n_playlists)) %>%
  ungroup()

# Display the first 10 rows as a table
kable(
  transmute(
    playlist_top_artist[1:10, ],
    Artist = artist_name,
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N playlists` = n_playlists
  ),
  caption = "Artists included in more playlists"
)
Artists included in more playlists
Artist N tracks N songs N playlists
Taylor Swift 108 70 14
Miley Cyrus 16 10 9
Beyoncé 9 7 8
David Bowie 7 3 7
Madonna 9 9 7
Queen 9 6 7
Avril Lavigne 8 4 6
Bad Bunny 8 4 6
C. Tangana 9 4 6
La La Love You 6 3 6

User library

Code
# Library (saved songs) with the same column names used for the stream data
library_tracks <- library %>%
  mutate(
    artist_name = artist,
    track_name = track,
    album_name = album
  )

# Distinct saved track names per artist, artists with more songs first
library_top_artist <- library_tracks %>%
  group_by(artist_name) %>%
  summarise(n_songs = n_distinct(track_name)) %>%
  arrange(desc(n_songs)) %>%
  ungroup()

# Display the first 10 rows as a table
kable(
  transmute(
    library_top_artist[1:10, ],
    Artist = artist_name,
    `N songs` = n_songs
  ),
  caption = "Artists with more songs in library"
)
Artists with more songs in library
Artist N songs
Taylor Swift 40
Charli xcx 12
Miley Cyrus 11
Mother Mother 10
Beyoncé 9
Chappell Roan 9
Måneskin 9
Rigoberta Bandini 8
Amaral 7
Bizarrap 7

Saved in playlist or library

Code
# Track names that appear in any playlist or in the library
saved_tracks <- unique(
  c(playlist_top_tracks$track_name, library_tracks$track_name)
)

# Artist names that appear in any playlist or in the library
saved_artist <- unique(
  c(playlist_top_artist$artist_name, library_top_artist$artist_name)
)

# Yearly share of streams whose track name is among the saved tracks
# (matching is by track name only)
plot_saved_track <- stream %>%
  mutate(
    `In my playlists` = ifelse(track_name %in% saved_tracks, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  # Show the y-axis as percentages
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(
    title = "Were the tracks (streaming history) in the current playlists?",
    y = ""
  )

# Display plot
plot_saved_track

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave(
    "plot_saved_track.svg",
    plot = plot_saved_track,
    path = "output_files/"
  )
}
Code
# Yearly share of streams whose artist name is among the saved artists
plot_saved_artist <- stream %>%
  mutate(
    `In my playlists` = ifelse(artist_name %in% saved_artist, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  # Show the y-axis as percentages
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  labs(
    title = "Were the artists (streaming history) in the current playlists?",
    y = ""
  )

# Display plot
plot_saved_artist

Code
# Save plot. The plot object is passed explicitly so the export does not
# depend on which plot was displayed last (ggsave() defaults to last_plot()).
if (save_svg) {
  ggsave(
    "plot_saved_artist.svg",
    plot = plot_saved_artist,
    path = "output_files/"
  )
}