library(tidyverse)
library(glue)
library(scales)
library(showtext)
library(ggtext)
library(shadowtext)
library(maps)
library(ggpattern)
library(ggrepel)
library(patchwork)
library(tidylog)
# Register the "Ubuntu" family from Google Fonts with its regular (400) and
# bold (700) weights.
font_add_google(
  name = "Ubuntu",
  family = "Ubuntu",
  regular.wt = 400,
  bold.wt = 700
)

# Turn on showtext rendering and set its resolution option to 300 dpi.
showtext_auto()
showtext_opts(dpi = 300)

About the Data
This week we are exploring TV show and movie viewing data from Netflix. Since 2023, Netflix has released regular Engagement Reports summarising the number of hours that users have spent watching each show and movie in the last 6 months.
As Netflix puts it: “This report, which captures ~99% of all viewing in the first half of 2025, shows that people watched a lot of Netflix — over 95B hours — spanning a wide range of genres and languages. It’s why we continue to invest in a variety of quality titles for various moods and tastes and work hard to make them great.”
The dataset this week combines viewing data from late 2023 through the first half of 2025.
1 Initializing
1.1 Load libraries
1.2 Set theme
# Cool-gray ramp, from darkest (0) to lightest (5).
cool_gray0 <- "#323955"
cool_gray1 <- "#5a6695"
cool_gray2 <- "#7e89bb"
cool_gray3 <- "#a4aee2"
cool_gray4 <- "#cbd5ff"
cool_gray5 <- "#e7efff"

# Cool-red ramp, from darkest (0) to lightest (5).
cool_red0 <- "#A31C44"
cool_red1 <- "#F01B5B"
cool_red2 <- "#F43E75"
cool_red3 <- "#E891AB"
cool_red4 <- "#FAC3D3"
cool_red5 <- "#FCE0E8"

# Session-wide default theme: theme_minimal() with Ubuntu text, no grid lines,
# a legend on top and generous outer margins.
theme_set(
  theme_minimal() +
    theme(
      # Base text: applies wherever a more specific element does not override.
      text = element_text(family = "Ubuntu", color = cool_gray1),

      # Plot-level elements.
      plot.background = element_blank(),
      plot.margin = margin(25, 25, 25, 25),
      plot.title.position = "plot",
      plot.title = element_text(family = "Ubuntu", size = 18, face = "bold"),
      plot.subtitle = element_text(
        size = 9,
        lineheight = 1.15,
        margin = margin(5, 0, 15, 0)
      ),
      plot.caption = element_text(
        size = 8,
        color = cool_gray3,
        margin = margin(20, 0, 0, 0),
        hjust = 0
      ),

      # Grid lines are removed entirely.
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),

      # Axis lines and ticks are currently disabled; kept for reference:
      # axis.line.x.bottom = element_line(color = '#474747', linewidth = .3),
      # axis.ticks.x = element_line(color = '#474747', linewidth = .3),
      # axis.line.y.left = element_line(color = '#474747', linewidth = .3),
      # axis.ticks.y = element_line(color = '#474747', linewidth = .3),
      # panel.grid = element_line(linewidth = .3, color = 'grey90'),

      # Negative length draws ticks towards the inside of the panel.
      axis.ticks.length = unit(-0.15, "cm"),

      # Axis titles use ggtext so they can contain markdown.
      axis.title.x = element_markdown(
        family = "Ubuntu",
        hjust = .5,
        size = 8,
        color = cool_gray1
      ),
      axis.title.y = element_markdown(
        family = "Ubuntu",
        hjust = .5,
        size = 8,
        color = cool_gray1
      ),
      axis.text = element_text(
        family = "Ubuntu",
        hjust = .5,
        size = 8,
        color = cool_gray1
      ),

      legend.position = "top"
    )
)

1.3 Load this week’s data
# Read the two TidyTuesday CSV files (movies and shows) straight from GitHub.
movies <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-07-29/movies.csv"
)
shows <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-07-29/shows.csv"
)

# Stack both tables into one, labelling each row with the table it came from.
netflix <- bind_rows(
  mutate(movies, type = "movie"),
  mutate(shows, type = "show")
)

2 Data analysis
Let's see the release date distribution.
# Histogram of release dates, with bars filled by title type (movie / show).
ggplot(netflix, aes(x = release_date, fill = type)) +
  geom_histogram()
Is it correlated with the number of views?
# Scatter plot of views against release date, coloured by title type.
ggplot(netflix, aes(x = release_date, y = views, color = type)) +
  geom_point()
3 Transform Data for Plotting
library(igraph)
library(tidytext)
library(ggraph)
library(widyr)
library(qdapDictionaries)

# How many of the most frequent title words to highlight, and how many of the
# strongest co-occurring pairs to keep for each of them.
n_top_words <- 15L
n_co_occurrences <- 5

# Word list used to keep only tokens found in an English dictionary.
english_words <- tibble(word = GradyAugmented)
data("stop_words")

# Tokens removed explicitly: generic catalogue terms and short function words
# from other languages.
blocklist_words <- c(
  "season",
  "limited",
  "series",
  "movie",
  "trailer",
  "de",
  "la",
  "el",
  "los",
  "las",
  "les",
  "di"
)

# One row per (title, word): titles from the 2025 Jan-Jun report, tokenised
# and filtered down to dictionary words.
data2plot <-
  netflix |>
  filter(report == "2025Jan-Jun") |>
  # Drop everything from " // " onwards before de-duplicating titles.
  mutate(title = str_remove(title, " // .*")) |>
  distinct(title) |>
  # `id` identifies the title; pairwise_count() uses it as the co-occurrence
  # unit below.
  mutate(id = as.character(row_number())) |>
  unnest_tokens(word, title, drop = FALSE) |>
  anti_join(stop_words, by = "word") |>
  filter(
    !str_detect(word, "[0-9]"),
    !str_detect(word, "[[:punct:]]")
  ) |>
  semi_join(english_words, by = "word") |>
  filter(!word %in% blocklist_words)

# Number of titles in which each pair of words appears together.
word_pairs <- data2plot |>
  pairwise_count(word, id, sort = TRUE, upper = FALSE)

# The most frequent words, ranked 1..n_top_words.
top_words <-
  data2plot |>
  count(word, sort = TRUE) |>
  head(n_top_words) |>
  mutate(rank = row_number())

# Keep only pairs that involve a top word, then the strongest pairs per top
# word. slice_max() keeps ties, so a word can end up with more than
# `n_co_occurrences` pairs.
# NOTE(review): a pair made of two top words is attributed to `item1` only, so
# it does not count towards the quota of `item2` - confirm this is intended.
filtered_pairs <- word_pairs |>
  filter(item1 %in% top_words$word | item2 %in% top_words$word) |>
  mutate(top_word = if_else(item1 %in% top_words$word, item1, item2)) |>
  group_by(top_word) |>
  slice_max(n, n = n_co_occurrences) |>
  ungroup() |>
  select(-top_word)

# --- Create Nodes
# Every distinct word that appears on either side of a kept pair, annotated
# with its frequency (`n`) and `rank` when it is a top word.
nodes <-
  filtered_pairs |>
  pivot_longer(c(item1, item2), names_to = "side", values_to = "word") |>
  distinct(word) |>
  left_join(top_words, by = "word") |>
  mutate(
    is_top = !is.na(rank),
    # Non-top words share the rank right after the last top word. The integer
    # literal keeps both if_else() branches the same type as `rank`.
    rank = if_else(is.na(rank), n_top_words + 1L, rank)
  )

# -- Create Edges
edges <-
  filtered_pairs

# Create an igraph object from the filtered pairs; the first column of `nodes`
# (`word`) becomes the vertex name.
my_graph_df <- graph_from_data_frame(edges, vertices = nodes)

4 Time to plot!
4.1 Attempt 1
I was trying to create an eye…
library(lubridate)

# Top titles by hours viewed for each type in the 2025 Jan-Jun report, ordered
# so that movies run from most to least viewed and shows from least to most
# viewed along the title axis.
data2plot <-
  netflix |>
  filter(!is.na(release_date)) |>
  filter(report == "2025Jan-Jun") |>
  group_by(type) |>
  slice_max(hours_viewed, n = 100) |>
  arrange(type, hours_viewed) |>
  # Within each type: negative positions for movies, positive for shows, so
  # the two types end up on opposite halves once titles are ordered.
  mutate(order = -1 * row_number()) |>
  mutate(order = if_else(type == "show", order * -1, order)) |>
  mutate(title = fct_reorder(title, order)) |>
  ungroup()

# One radial segment per title, with length proportional to hours viewed.
data2plot |>
  ggplot(aes(x = hours_viewed, y = title)) +
  geom_segment(
    aes(x = 0, xend = hours_viewed, color = type),
    show.legend = FALSE
  ) +
  coord_radial(theta = "y", inner.radius = .2, start = 90 * (pi / 180)) +
  # Name the values so the colours do not depend on the level order of `type`.
  scale_color_manual(values = c(movie = cool_gray1, show = cool_red1)) +
  theme_void() +
  # `margin` is not a ggplot2 theme element; the outer margin is `plot.margin`.
  theme(plot.margin = margin(0, 0, 0, 0))
4.2 Final
# The "fr" (Fruchterman-Reingold) layout starts from random positions; fixing
# the seed makes the rendered figure reproducible between runs.
set.seed(20250729)

# Word co-occurrence network: top words are drawn larger and coloured by their
# frequency, with the count printed underneath.
ggraph(my_graph_df, layout = "fr") +
  geom_edge_link(color = cool_gray5, show.legend = FALSE) +
  geom_node_point(aes(size = is_top, color = n)) +
  # Labels for the top words (bold, nudged above the node).
  geom_node_text(
    aes(label = name, filter = is_top),
    repel = FALSE,
    size = 2.5,
    color = cool_gray0,
    nudge_y = .2,
    fontface = "bold"
  ) +
  # Labels for the co-occurring words.
  geom_node_text(
    aes(label = name, filter = !is_top),
    repel = FALSE,
    size = 2,
    color = cool_gray1
  ) +
  # Frequency below each top word. Only top words have a count, so the other
  # nodes are filtered out instead of being dropped as missing labels.
  geom_node_text(
    aes(label = n, filter = is_top),
    repel = FALSE,
    size = 2,
    nudge_y = -.2
  ) +
  labs(
    title = "Netflix top 15 words in titles",
    subtitle = "and its 5 most common co-occurrences",
    color = "Word Frequency",
    caption = str_wrap(
      "NOTE This visualization offers a preliminary look at the data and may not capture the full complexity of the underlying reality. SOURCE #Tidytuesday 2025-07-29 GITHUB barreiro-r",
      width = 100
    )
  ) +
  theme_void() +
  scale_color_continuous(
    low = cool_gray4,
    high = cool_red2,
    na.value = cool_gray5,
    breaks = c(min(top_words$n), max(top_words$n))
  ) +
  # Node size: small for co-occurring words, large for top words.
  scale_size_manual(values = c(`FALSE` = 2, `TRUE` = 9)) +
  guides(
    size = "none",
    color = "none"
    # color = guide_colorbar(
    #   barwidth = 7,
    #   barheight = .3
    # )
  ) +
  theme(
    legend.position = "top",
    legend.direction = "horizontal",
    legend.title.position = "top",
    legend.title = element_text(size = 8, hjust = .5),
    # theme_void() is a complete theme and overrides the defaults registered
    # with theme_set(), so the relevant settings are repeated here.
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.ticks.length = unit(-0.15, "cm"),
    plot.background = element_blank(),
    plot.title.position = "plot",
    plot.title = element_text(family = "Ubuntu", size = 18, face = "bold"),
    plot.caption = element_text(
      size = 8,
      color = cool_gray3,
      margin = margin(20, 0, 0, 0),
      hjust = 0
    ),
    plot.subtitle = element_text(
      size = 9,
      lineheight = 1.15,
      margin = margin(5, 0, 15, 0)
    ),
    text = element_text(family = "Ubuntu", color = cool_gray1),
    plot.margin = margin(25, 25, 25, 25)
  )

 x views-1.png)