The Complete Sherlock Holmes – TidyTuesday by Rodrigo

Tip

If this code was useful to you, please consider giving it a star on GitHub.

About the Data

Note

Check the data in the TidyTuesday GitHub repository.

This week we’re exploring the complete line-by-line text of the Sherlock Holmes stories and novels, made available through the {sherlock} R package by Emil Hvitfeldt. The dataset includes the full collection of Holmes texts, organized by book and line number, and is ideal for stylometry, sentiment analysis, and literary exploration.

“The name is Sherlock Holmes and the address is 221B Baker Street.” Holmes is a consulting detective known for his keen observation, logical reasoning, and use of forensic science to solve complex cases. Created by Sir Arthur Conan Doyle, Holmes has become one of the most famous fictional detectives in literature.

1 Initializing

1.1 Load libraries

pacman::p_load(
  tidyverse,
  glue,
  scales,
  showtext,
  ggtext,
  shadowtext,
  maps,
  ggpattern,
  ggrepel,
  patchwork,
  tidylog,
  tidytext,
  textdata,
  syuzhet
)

# Register the Ubuntu typeface from Google Fonts (regular = 400, bold = 700),
# then switch on showtext rendering and set its resolution to 300 dpi.
font_add_google(
  name = "Ubuntu",
  family = "Ubuntu",
  regular.wt = 400,
  bold.wt = 700
)
showtext_auto(enable = TRUE)
showtext_opts(dpi = 300)

1.2 Set theme

# Colour palette ----

# Blue-grey ramp, from darkest (0) to lightest (5).
cool_gray0 <- "#323955"
cool_gray1 <- "#5a6695"
cool_gray2 <- "#7e89bb"
cool_gray3 <- "#a4aee2"
cool_gray4 <- "#cbd5ff"
cool_gray5 <- "#e7efff"

# Red/pink ramp, from darkest (0) to lightest (5).
cool_red0 <- "#A31C44"
cool_red1 <- "#F01B5B"
cool_red2 <- "#F43E75"
cool_red3 <- "#E891AB"
cool_red4 <- "#FAC3D3"
cool_red5 <- "#FCE0E8"

# Global ggplot2 theme ----

# Every plot created after this call starts from theme_minimal() plus the
# overrides below. The theme is assembled inside local() so the helper used
# to build it does not leak into the workspace.
theme_set(
  local({
    # Axis titles and axis text share one text spec; only the element
    # constructor differs (markdown-aware for titles, plain for tick labels).
    small_axis_label <- function(element_fun) {
      element_fun(
        family = "Ubuntu",
        hjust = .5,
        size = 8,
        color = cool_gray1
      )
    }

    theme_minimal() +
      theme(
        # Base text
        text = element_text(family = "Ubuntu", color = cool_gray1),

        # Panel and background: no grid lines, no background fill
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.background = element_blank(),
        plot.margin = margin(25, 25, 25, 25),

        # Axes (negative tick length, i.e. ticks point into the panel)
        # axis.line.x.bottom = element_line(color = 'cool_gray0', linewidth = .3),
        # axis.ticks.x= element_line(color = 'cool_gray0', linewidth = .3),
        # axis.line.y.left = element_line(color = 'cool_gray0', linewidth = .3),
        # axis.ticks.y= element_line(color = 'cool_gray0', linewidth = .3),
        # # panel.grid = element_line(linewidth = .3, color = 'grey90'),
        axis.ticks.length = unit(-0.15, "cm"),
        axis.title.x = small_axis_label(element_markdown),
        axis.title.y = small_axis_label(element_markdown),
        axis.text = small_axis_label(element_text),

        # Title, subtitle, caption
        # plot.title.position = "plot",
        plot.title = element_text(family = "Ubuntu", size = 14, face = "bold"),
        plot.subtitle = element_markdown(
          size = 9,
          lineheight = 1.15,
          margin = margin(5, 0, 15, 0)
        ),
        plot.caption = element_text(
          size = 8,
          color = cool_gray3,
          margin = margin(20, 0, 0, 0),
          hjust = 0
        ),

        # Legend
        legend.position = "top"
      )
  })
)

1.3 Load this week’s data

# Fetch the TidyTuesday release dated 2025-11-18; the code below reads the
# `holmes` table from the returned object.
tuesdata <- tidytuesdayR::tt_load(x = "2025-11-18")

2 Quick Exploratory Data Analysis

2.1 Book Length

# Horizontal bar chart of how many rows (lines of text) each book contributes,
# with the books ordered by that count.
ggplot(
  data = tuesdata$holmes |>
    count(book) |>
    mutate(book = fct_reorder(book, n)),
  mapping = aes(x = n, y = book)
) +
  geom_col()

3 Transform Data for Plotting

library(tidytext)

# sherlock_tokens <- tuesdata$holmes |>
#   unnest_tokens(word, text) |>
#   anti_join(stop_words, by = "word")

# nrc_lexicon <- get_sentiments("nrc")

# sherlock_nrc <- sherlock_tokens |>
#   inner_join(nrc_lexicon, by = "word") |>
#   filter(!sentiment %in% c("positive", "negative")) |>
#   group_by(book) |>
#   mutate(word_order = row_number()) |>
#   # mutate(chunk = word_order %/% 20) |>
#   mutate(chunk = ntile(word_order, 100)) |>
#   ungroup() |>
#   count(book, sentiment, chunk) |>
#   group_by(book, chunk) |>
#   slice_max(order_by = n, n = 1, with_ties = FALSE) |>
#   ungroup()

# data2plot <-
#   sherlock_nrc |>
#   select(book, sentiment, chunk)

# data2plot_wide <-
#   data2plot |>
#   mutate(chunk = str_c("chunk", chunk)) |>
#   pivot_wider(values_from = sentiment, names_from = 'chunk') |>
#   column_to_rownames(var = "book") |>
#   mutate(across(everything(), as.factor))

# gower_dist <- cluster::daisy(data2plot_wide, metric = "gower")
# hc_books <- hclust(gower_dist, method = "complete")

# order <- hc_books$labels[hc_books$order]

# data2plot <-
#   mutate(data2plot, book = factor(book, levels = order))

# Score every line of text with syuzhet's get_sentiment() (default method).
scores <- get_sentiment(tuesdata$holmes$text)

# Split each book into 100 ordered chunks of (near) equal size by line number
# and average the line scores within each chunk. `.groups = "drop"` returns an
# ungrouped result directly instead of relying on a trailing ungroup().
data2plot <- tuesdata$holmes |>
  mutate(sentiment = scores) |>
  group_by(book) |>
  mutate(chunk = ntile(line_num, 100)) |>
  group_by(book, chunk) |>
  summarise(sentiment = mean(sentiment, na.rm = TRUE), .groups = "drop")

# Mean chunk sentiment per book. Computed once and reused for both rankings
# so the "top" and "bottom" selections can never drift apart.
book_sentiment <- data2plot |>
  group_by(book) |>
  summarise(sentiment = mean(sentiment, na.rm = TRUE), .groups = "drop")

# Select the three top and bottom books by mean sentiment
top3 <- book_sentiment |>
  slice_max(order_by = sentiment, n = 3, with_ties = FALSE) |>
  pull(book)

bottom3 <- book_sentiment |>
  slice_min(order_by = sentiment, n = 3, with_ties = FALSE) |>
  pull(book)

# Most positive first, then most negative; this is also the facet order.
selected_books <- c(top3, bottom3)

# Book x chunk matrix of mean sentiment (books as row names), intended as the
# input for the optional clustering below. The values are kept numeric:
# coercing them to factors would make a distance computation on them
# meaningless.
data2plot_wide <- data2plot |>
  filter(book %in% selected_books) |>
  mutate(chunk = str_c("chunk", chunk)) |>
  pivot_wider(values_from = sentiment, names_from = "chunk") |>
  column_to_rownames(var = "book")

# hc_books <- hclust(dist(data2plot_wide), method = "complete")
# order <- hc_books$labels[hc_books$order]

# data2plot <- data2plot |>
#   filter(book %in% c(top3, bottom3)) |>
#   mutate(book = factor(book, levels = order))

# Keep only the six selected books and fix their display order.
data2plot <- data2plot |>
  filter(book %in% selected_books) |>
  mutate(book = factor(book, levels = selected_books))

4 Time to plot!

4.1 Raw chart

# Unstyled first look: one sentiment line per selected book, two facets per
# row, drawn with ggplot2's default grey theme instead of the global theme.
ggplot(data = data2plot, mapping = aes(x = chunk, y = sentiment)) +
  geom_line() +
  facet_wrap(facets = vars(book), ncol = 2) +
  theme_gray()

4.2 Final chart

# Final chart: sentiment trajectory (chunk 1..100) for the six selected books,
# coloured by sentiment, with each book's maximum and minimum marked by a dot.

# Symmetric colour limits (-limit, +limit) so that a sentiment of zero falls
# at the midpoint of the gradient. Computed once and reused below.
sentiment_limit <- max(abs(data2plot$sentiment), na.rm = TRUE)

# The legend labels sit at +/-0.7, clamped to the limits: ggplot2 does not
# draw breaks that fall outside a scale's limits, so without the clamp the
# labels would vanish whenever the data never reaches +/-0.7.
legend_breaks <- c(-1, 1) * min(0.7, sentiment_limit)

data2plot |>
  ggplot(aes(x = chunk, y = sentiment)) +
  # Zero reference line spanning the full panel width.
  annotate(
    geom = "segment",
    y = 0,
    yend = 0,
    x = -Inf,
    xend = Inf,
    color = cool_gray5,
    linewidth = 0.5
  ) +
  # geom_link2 interpolates the colour aesthetic along the line.
  ggforce::geom_link2(aes(color = sentiment)) +
  # Highest chunk of each book.
  geom_point(
    data = data2plot |>
      group_by(book) |>
      slice_max(sentiment, n = 1, with_ties = FALSE),
    aes(color = sentiment),
    size = .7
  ) +
  # Lowest chunk of each book.
  geom_point(
    data = data2plot |>
      group_by(book) |>
      slice_min(sentiment, n = 1, with_ties = FALSE),
    aes(color = sentiment),
    size = .7
  ) +
  # The book levels are ordered top3 then bottom3. Filling the 3 x 2 grid
  # column by column (dir = "v") puts the three most positive books in the
  # left column and the three most negative in the right one, as the subtitle
  # states. The default row-wise fill would interleave the two groups.
  facet_wrap(~book, ncol = 2, dir = "v") +
  theme(
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    strip.text.x = element_text(
      hjust = 0,
      color = cool_gray1,
      size = 7,
      family = "Ubuntu"
    ),
    legend.position = "bottom",
    legend.text = element_text(size = 7),
    legend.ticks = element_blank()
  ) +
  scale_color_gradientn(
    limits = c(-sentiment_limit, sentiment_limit),
    # colors = MetBrewer::met.brewer("Troy"),
    # Listed from most negative (dark red) to most positive (dark blue-grey).
    colors = c(
      cool_red0,
      cool_red1,
      cool_red2,
      cool_gray2,
      cool_gray1,
      cool_gray0
    ),
    breaks = legend_breaks,
    labels = c("Negative", "Positive")
  ) +
  guides(
    color = guide_colorbar(
      barwidth = 9,
      barheight = .3
    )
  ) +
  labs(
    color = NULL,
    title = "Sherlock Holmes Sentiment Analysis",
    # The subtitle is rendered as markdown, so line breaks must be <br> tags.
    subtitle = str_wrap(
      "Sentiment analysis of the three most positive (left) and negative (right) books of Sherlock Holmes. Dots represent the maximum and minimum sentiment per book.",
      width = 60
    ) |>
      str_replace_all("\n", "<br>"),
    caption = str_wrap(
      "NOTE This visualization offers a preliminary look at the data and may not capture the full complexity of the underlying reality. SOURCE #Tidytuesday 2025-11-18 GITHUB barreiro-r",
      width = 80
    )
  )