Pima Indian Diabetes Data: A Prospective Observational Cohort

density
Author

Rodrigo Barreiro

Published

November 11, 2025

Tip

If this code was useful to you, please consider giving it a star on GitHub.

About the Data

Note

Check the data in TidyTuesday GitHub repository.

This week we are exploring type 2 diabetes data from the Pima Indian community near Phoenix, Arizona. The study includes only women aged 21 and older, all of Pima heritage, with at least five years of follow-up. Each participant underwent regular oral glucose tolerance tests, and diabetes was diagnosed using WHO criteria.

People with type 2 diabetes mellitus (DM) become less sensitive to insulin. After a glucose load, both blood glucose and insulin levels rise, but glucose does not fall as quickly as it should—leading to sustained elevations. The incidence of type 2 DM is rising in many Western cultures, as increasingly unhealthy and calorie-rich diets become common.

1 Initializing

1.1 Load libraries

pacman::p_load(
  tidyverse,
  glue,
  scales,
  showtext,
  ggtext,
  shadowtext,
  maps,
  ggpattern,
  ggrepel,
  patchwork,
  tidylog
)

# Register the Ubuntu typeface from Google Fonts (regular and bold weights),
# turn on showtext rendering for all plots, and render text at 300 dpi.
font_add_google(
  name = "Ubuntu",
  family = "Ubuntu",
  regular.wt = 400,
  bold.wt = 700
)
showtext_auto(enable = TRUE)
showtext_opts(dpi = 300)

1.2 Set theme

# Palette ---------------------------------------------------------------------
# Two six-step ramps; within each ramp, 0 is the darkest and 5 the lightest.

# Blue-grey ramp: text, axes and neutral marks.
cool_gray0 <- "#323955"
cool_gray1 <- "#5a6695"
cool_gray2 <- "#7e89bb"
cool_gray3 <- "#a4aee2"
cool_gray4 <- "#cbd5ff"
cool_gray5 <- "#e7efff"

# Red ramp: accent colour.
cool_red0 <- "#A31C44"
cool_red1 <- "#F01B5B"
cool_red2 <- "#F43E75"
cool_red3 <- "#E891AB"
cool_red4 <- "#FAC3D3"
cool_red5 <- "#FCE0E8"

# Global theme ----------------------------------------------------------------
# A minimal theme without grid lines or background, Ubuntu as the typeface,
# and markdown-enabled subtitle, caption and axis titles (via ggtext).
theme_set(
  theme_minimal() +
    theme(
      # Base text
      text = element_text(family = "Ubuntu", color = cool_gray1),

      # Canvas
      plot.background = element_blank(),
      plot.margin = margin(t = 25, r = 25, b = 25, l = 25),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      legend.position = "top",

      # Title, subtitle and caption
      plot.title.position = "plot",
      plot.title = element_text(family = "Ubuntu", face = "bold", size = 14),
      plot.subtitle = element_markdown(
        size = 9,
        lineheight = 1.15,
        margin = margin(t = 5, r = 0, b = 15, l = 0)
      ),
      plot.caption = element_markdown(
        size = 8,
        hjust = 0,
        color = cool_gray3,
        margin = margin(t = 20, r = 0, b = 0, l = 0)
      ),

      # Axes; the negative tick length points the ticks into the panel
      axis.ticks.length = unit(-0.15, "cm"),
      axis.text = element_text(
        family = "Ubuntu",
        size = 8,
        hjust = .5,
        color = cool_gray1
      ),
      axis.title.x = element_markdown(
        family = "Ubuntu",
        size = 8,
        hjust = .5,
        color = cool_gray1
      ),
      axis.title.y = element_markdown(
        family = "Ubuntu",
        size = 8,
        hjust = .5,
        color = cool_gray1
      )
    )
)

1.3 Load this week’s data

# Download the TidyTuesday release for 2025-11-11; the data sets are accessed
# by name below (e.g. `tuesdata$diabetes`).
tuesdata <- tidytuesdayR::tt_load(x = "2025-11-11")

2 Quick Exploratory Data Analysis

2.1 Completeness

# Completeness check: for every variable, count missing versus observed values
# and draw them as stacked horizontal bars.
tuesdata$diabetes |>
  mutate(id = row_number()) |>
  pivot_longer(
    cols = -id,
    names_to = "name",
    values_to = "value",
    # Cast everything to character so columns of different types can be
    # stacked into a single `value` column.
    values_transform = as.character
  ) |>
  group_by(name) |>
  count(is_na = is.na(value)) |>
  ggplot(mapping = aes(x = n, y = name, fill = is_na)) +
  geom_col()

# Standardise each measurement (z-score within variable) and compare the two
# outcome groups: faint jittered points for individual observations, plus a
# large white-rimmed dot marking each group's mean.
tuesdata$diabetes |>
  mutate(id = row_number()) |>
  pivot_longer(cols = -c(id, diabetes_5y)) |>
  group_by(name) |>
  mutate(
    zscore = (value - mean(value, na.rm = TRUE)) / sd(value, na.rm = TRUE)
  ) |>
  ggplot(mapping = aes(x = name, y = zscore)) +
  ggbeeswarm::geom_quasirandom(
    mapping = aes(color = diabetes_5y),
    alpha = .1,
    size = .5,
    dodge.width = .7
  ) +
  stat_summary(
    mapping = aes(fill = diabetes_5y),
    fun = mean,
    geom = "point",
    # Same dodge width as the points so each mean sits over its own group.
    position = position_dodge(width = .7),
    shape = 21,
    size = 3,
    color = "white"
  )

# Correlation matrix of every column except the outcome, computed on pairwise
# complete observations and drawn as a heat map.
tuesdata$diabetes |>
  select(!diabetes_5y) |>
  cor(use = "pairwise.complete.obs") |>
  ggcorrplot::ggcorrplot()

3 Transform Data for Plotting

# Long format: one row per participant and measurement. Each measurement is
# standardised (z-score computed within the measurement) so variables recorded
# on different scales can share one axis.
data2plot <-
  tuesdata$diabetes |>
  mutate(id = row_number()) |>
  pivot_longer(-c(id, diabetes_5y)) |>
  group_by(name) |>
  mutate(
    zscore = (value - mean(value, na.rm = TRUE)) / sd(value, na.rm = TRUE)
  ) |>
  ungroup()

# Per measurement and outcome group: the mean z-score (used for positioning)
# and the mean in original units (used for labels).
# `.groups = "drop"` returns an ungrouped tibble; without it the result stays
# grouped by `name` and summarise() emits a regrouping message.
means_data2plot <-
  data2plot |>
  group_by(name, diabetes_5y) |>
  summarise(
    mean = mean(zscore, na.rm = TRUE),
    mean_original = mean(value, na.rm = TRUE),
    .groups = "drop"
  )

# Measurement names sorted by the absolute difference between the two group
# means (in z-score units), smallest difference first.
my_order <- means_data2plot |>
  select(name, diabetes_5y, mean) |>
  pivot_wider(names_from = diabetes_5y, values_from = mean) |>
  mutate(delta = abs(neg - pos)) |>
  arrange(delta) |>
  pull(name)

# Reverse the order so the measurement with the largest difference is the
# first factor level.
data2plot <- data2plot |>
  mutate(name = factor(name, levels = rev(my_order)))


# Lookup from raw column name to the markdown label shown on each facet strip.
# Names containing a hyphen are not syntactic, hence the backticks.
pretty_names <- c(
  pregnancy_num        = "**Number of Pregnancies**",
  `glucose_mg-dl`      = "**Glucose** (mg/dL)",
  `dbp_mm-hg`          = "**Diastolic Blood Pressure** (mmHg)",
  triceps_mm           = "**Triceps Skinfold** (mm)",
  `insulin_microiu-ml` = "**Insulin** (microU/mL)",
  bmi                  = "**Body Mass Index**",
  pedigree             = "**Pedigree**",
  age                  = "**Age** (years)"
)

# Inspect the facet labels in `my_order`, i.e. from the smallest to the
# largest difference between the two group means.
pretty_names[my_order]
                            dbp_mm-hg                              pedigree 
"**Diastolic Blood Pressure** (mmHg)"                        "**Pedigree**" 
                        pregnancy_num                                   age 
          "**Number of Pregnancies**"                     "**Age** (years)" 
                           triceps_mm                    insulin_microiu-ml 
          "**Triceps Skinfold** (mm)"             "**Insulin** (microU/mL)" 
                                  bmi                         glucose_mg-dl 
                "**Body Mass Index**"                 "**Glucose** (mg/dL)" 

4 Time to plot!

4.1 Raw chart

# First draft: overlapping, semi-transparent z-score densities for the two
# outcome groups, one panel per measurement.
data2plot |>
  ggplot(mapping = aes(x = zscore, fill = diabetes_5y)) +
  geom_density(color = NULL, alpha = .2) +
  facet_wrap(facets = vars(name), ncol = 2)

4.2 Final chart

# Final chart: one facet per measurement showing the z-score density of each
# outcome group. The two group means are marked at the top of every panel,
# joined by a line and labelled with the mean in original units.

# Colour of each outcome group ("neg" = no diabetes, "pos" = developed
# diabetes). The fill/colour scales and the inline legend in the subtitle both
# read from this vector, so the legend always matches the plotted colours.
group_colors <- c(neg = cool_gray1, pos = cool_red2)

data2plot |>
  ggplot(aes(x = zscore)) +
  # Blend the two densities with "multiply" so their overlap stays visible
  # without relying on transparency.
  geom_density(aes(fill = diabetes_5y), linewidth = 0, show.legend = FALSE) |>
    ggblend::blend("multiply") +
  # Connector between the two group means of each measurement.
  geom_line(
    data = means_data2plot,
    aes(group = name, x = mean),
    y = 1,
    color = cool_gray4
  ) +
  geom_point(
    data = means_data2plot,
    aes(color = diabetes_5y, x = mean),
    y = 1,
    show.legend = FALSE
  ) +
  # Mean in original units: "pos" label to the right of its point, "neg" label
  # to the left. Geoms do not inherit the theme font, so `family` is set on
  # both text layers.
  geom_text(
    data = means_data2plot |> filter(diabetes_5y == "pos"),
    aes(
      color = diabetes_5y,
      x = mean + 0.2,
      label = round(mean_original, digits = 1)
    ),
    y = 1,
    size = 2.5,
    hjust = 0,
    show.legend = FALSE,
    family = "Ubuntu"
  ) +
  geom_text(
    data = means_data2plot |> filter(diabetes_5y == "neg"),
    aes(
      color = diabetes_5y,
      x = mean - 0.2,
      label = round(mean_original, digits = 1)
    ),
    y = 1,
    size = 2.5,
    hjust = 1,
    show.legend = FALSE,
    family = "Ubuntu"
  ) +
  # Facet on a factor built in place so every layer (including the summary
  # data, where `name` is plain character) gets the same order and the
  # markdown labels.
  facet_wrap(
    ~ factor(
      name,
      levels = rev(my_order),
      labels = pretty_names[rev(my_order)]
    ),
    ncol = 2
  ) +
  # NOTE: scale limits discard values outside the range before the density is
  # estimated; switch to coord_cartesian() if a pure zoom is wanted instead.
  scale_y_continuous(limits = c(0, 1)) +
  scale_x_continuous(limits = c(-2, 3)) +
  scale_fill_manual(values = group_colors) +
  scale_color_manual(values = group_colors) +
  theme(
    axis.text = element_blank(),
    strip.text.x = element_markdown(
      hjust = 0,
      color = cool_gray1,
      size = 8
    )
  ) +
  labs(
    y = NULL,
    title = "Pima Indian Diabetes Data",
    # Wrap the description, convert line breaks to <br> for ggtext, then
    # append the colour-coded inline legend.
    subtitle = str_wrap(
      'This week we are exploring type 2 diabetes data from the Pima Indian community near Phoenix, Arizona. The study includes only women aged 21 and older, all of Pima heritage, with at least five years of follow-up. Each participant underwent regular oral glucose tolerance tests, and diabetes was diagnosed using WHO criteria.',
      width = 105
    ) |>
      str_replace_all("\\n", "<br>") |>
      str_c(
        "<br><br>**Mean value** (",
        '<span style="color:', group_colors[["neg"]], '">No Diabetes</span>, ',
        '<span style="color:', group_colors[["pos"]], '">Developed Diabetes</span>)'
      ),
    caption = str_c(
      "Glucose: Plasma glucose concentration at 2 hours after administration of an oral glucose tolerance test<br><br>",
      str_wrap(
        "NOTE This visualization offers a preliminary look at the data and may not capture the full complexity of the underlying reality. SOURCE #Tidytuesday 2025-11-11 GITHUB barreiro-r",
        width = 120
      ) |>
        str_replace_all("\\n", "<br>")
    ),
    x = NULL,
    fill = NULL
  )