R中使用facet_wrap进行bootstrap回归。

10

我一直在使用mtcars数据集进行练习。

我用线性模型创建了这个图表。

library(tidyverse)
library(tidymodels)

ggplot(data = mtcars, aes(x = wt, y = mpg)) + 
  geom_point() + geom_smooth(method = 'lm')

输入图像描述

然后我将数据框转换为长数据框,以便尝试使用facet_wrap。

mtcars_long_numeric <- mtcars %>%
  select(mpg, disp, hp, drat, wt, qsec) 

mtcars_long_numeric <- pivot_longer(mtcars_long_numeric, names_to = 'names', values_to = 'values', 2:6)

输入图像描述

现在我想学习关于geom_smooth的标准误差,并且看看是否可以使用自助法生成置信区间。

我在RStudio整洁模型文档中找到了这段代码,链接在此处

boots <- bootstraps(mtcars, times = 250, apparent = TRUE)
boots

fit_nls_on_bootstrap <- function(split) {
    lm(mpg ~ wt, analysis(split))
}

boot_models <-
  boots %>% 
  dplyr::mutate(model = map(splits, fit_nls_on_bootstrap),
         coef_info = map(model, tidy))

boot_coefs <- 
  boot_models %>% 
  unnest(coef_info)

percentile_intervals <- int_pctl(boot_models, coef_info)
percentile_intervals

ggplot(boot_coefs, aes(estimate)) +
  geom_histogram(bins = 30) +
  facet_wrap( ~ term, scales = "free") +
  labs(title="", subtitle = "mpg ~ wt - Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "95% Confidence Interval Parameter Estimates, Intercept + Estimate") +
  geom_vline(aes(xintercept = .lower), data = percentile_intervals, col = "blue") +
  geom_vline(aes(xintercept = .upper), data = percentile_intervals, col = "blue")

boot_aug <- 
  boot_models %>% 
  sample_n(50) %>% 
  mutate(augmented = map(model, augment)) %>% 
  unnest(augmented)

ggplot(boot_aug, aes(wt, mpg)) +
  geom_line(aes(y = .fitted, group = id), alpha = .3, col = "blue") +
  geom_point(alpha = 0.005) +
  # ylim(5,25) +
    labs(title="", subtitle = "mpg ~ wt \n Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "coefficient stability testing") 

mtcars %>%
按cyl分组 %>%
汇总(x = mpg 0.25和0.75分位数,y = mpg的四分位距) %>%
筛选cyl == 8 %>%
变异(z = x - y)

是否有方法使引导回归也成为facet_wrap? 我尝试将长数据框放入bootstrap函数中。

boots <- bootstraps(mtcars_long_numeric, times = 250, apparent = TRUE)
boots

fit_nls_on_bootstrap <- function(split) {
    group_by(names) %>%
    lm(mpg ~ values, analysis(split))
}

但是这样不起作用。

或者我尝试在这里添加一个group_by:

boot_models <-
  boots %>% 
  group_by(names) %>%
  dplyr::mutate(model = map(splits, fit_nls_on_bootstrap),
         coef_info = map(model, tidy))

这不起作用是因为 boots$names 不存在。我也无法在 boot_aug 中添加分组作为 facet_wrap,因为那里也不存在 names。

ggplot(boot_aug, aes(values, mpg)) +
  geom_line(aes(y = .fitted, group = id), alpha = .3, col = "blue") +
    facet_wrap(~names) +
  geom_point(alpha = 0.005) +
  # ylim(5,25) +
    labs(title="", subtitle = "mpg ~ wt \n Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "coefficient stability testing") 

此外,我还学到不能通过~id进行facet_wrap。最终生成的图形如下,非常难以阅读!我真的很想在像'wt'、'disp'、'qsec'这样的元素上使用facet_wrap,而不是在每个引导自身上使用。

enter image description here

这是我正在使用的比较高级代码的其中一种情况——感激任何建议。

这是我期望的输出图片。除了标准误差区域的阴影部分之外,我希望看到占用相同区域的引导回归模型,或多或少。

enter image description here


你能分享一个你希望输出结果看起来像什么的图片吗?谢谢! - Skaqqs
我添加了一个短段落和一张图片。希望这样可以澄清问题。我正在尝试 facet_wrap 线性模型,并且不想使用 ggplot 中 geom_smooth(se = TRUE) 部分中带阴影区域的标准误差,而是希望用自助法回归的结果来填充该区域。您可以在我的帖子中看到,第一个图形成为第二个图形的其中一个部分。我希望第三个图形可以成为我无法创建的第四个图形的其中一个部分! - hachiko
你的问题是否专门关于如何使用 facet_wrap(),还是类似 ggarrange() 的方法也可以?后者更简单易行,但集成度更低、更不紧凑。如果你愿意,我可以发布一个使用非 facet 方式的示例作为答案。 - Skaqqs
我很欣赏这个想法。我正努力研究tidyverse和tidymodel方法以便理解它们,因此我希望能使用这种结构作为解决方案,尽管可能在这里并非完全可行:/ - hachiko
3个回答

5
如果你想继续使用tidyverse,类似这样的东西可能会有用:
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(broom)

set.seed(42)

n_boot <- 1000

mtcars %>% 
  select(-c(cyl, vs:carb)) %>% 
  pivot_longer(-mpg) -> tbl_mtcars_long

tbl_mtcars_long %>% 
  nest(model_data = c(mpg, value)) %>% 
  # for mpg and value observations within each level of name (e.g., disp, hp, ...)
  mutate(plot_data = map(model_data, ~ {
    # calculate information about the observed mpg and value observations
    # within each level of name to be used in each bootstrap sample
    submodel_data <- .x
    n <- nrow(submodel_data)
    min_x <- min(submodel_data$value)
    max_x <- max(submodel_data$value)
    pred_x <- seq(min_x, max_x, length.out = 100)
    
    # do the bootstrapping by
    # 1) repeatedly sampling samples of size n with replacement n_boot times,
    # 2) for each bootstrap sample, fit a model, 
    # 3) and make a tibble of predictions
    # the _dfr means to stack each tibble of predictions on top of one another
    map_dfr(1:n_boot, ~ {
      submodel_data %>% 
        sample_n(n, TRUE) %>% 
        lm(mpg ~ value, .) %>% 
        # suppress augment() warnings about dropping columns
        { suppressWarnings(augment(., newdata = tibble(value = pred_x))) }
    }) %>% 
      # the bootstrapping is finished at this point
      # now work across bootstrap samples at each value
      group_by(value) %>% 
      # to estimate the lower and upper 95% quantiles of predicted mpgs
      summarize(l = quantile(.fitted, .025),
                u = quantile(.fitted, .975),
                .groups = "drop"
      ) %>% 
      arrange(value)
  })) %>% 
  select(-model_data) %>% 
  unnest(plot_data) -> tbl_plot_data

ggplot() + 
  # observed points, model, and se
  geom_point(aes(value, mpg), tbl_mtcars_long) + 
  geom_smooth(aes(value, mpg), tbl_mtcars_long, 
              method = "lm", formula = "y ~ x", alpha = 0.25, fill = "red") + 
  # overlay bootstrapped se for direct comparison
  geom_ribbon(aes(value, ymin = l, ymax = u), tbl_plot_data, 
              alpha = 0.25, fill = "blue") + 
  facet_wrap(~ name, scales = "free_x")

这是由 reprex package(v1.0.0)在2021年07月19日创建的。


1
是的,我对@dww的回答的简洁性印象深刻。通过仅绘制引导模型而不估计置信区间,代码可以大大缩短。 - the-mad-statter

5
也许是这样的:
library(data.table)
mt = as.data.table(mtcars_long_numeric)

# helper function to return lm coefficients as a list
lm_coeffs = function(x, y) {
  coeffs = as.list(coefficients(lm(y~x)))
  names(coeffs) = c('C', "m")
  coeffs
}

# generate bootstrap samples of slope ('m') and intercept ('C')
nboot = 100
mtboot = lapply (seq_len(nboot), function(i) 
  mt[sample(.N,.N,TRUE), lm_coeffs(values, mpg), by=names])
mtboot = rbindlist(mtboot)

# and plot:    
ggplot(mt, aes(values, mpg)) +
  geom_abline(aes(intercept=C, slope=m), data = mtboot, size=0.3, alpha=0.3, color='forestgreen') +
  stat_smooth(method = "lm", colour = "red", geom = "ribbon", fill = NA, size=0.5, linetype='dashed') +
  geom_point() +
  facet_wrap(~names, scales = 'free_x')

enter image description here

对于那些更喜欢使用dplyr(不包括我)的人,这里是相同逻辑转换为该格式的方式:

lm_coeffs = function(x, y) {
  coeffs = coefficients(lm(y~x))
  tibble(C = coeffs[1], m=coeffs[2])
}

mtboot = lapply (seq_len(nboot), function(i) 
  mtcars_long_numeric %>%
    group_by(names) %>%
    slice_sample(prop=1, replace=TRUE) %>% 
    summarise(tibble(lm_coeffs2(values, mpg))))
mtboot = do.call(rbind, mtboot)

哇,这部分是在做什么啊? mt[sample(.N,.N,TRUE), lm_coeffs(values, mpg), by=names][,i:=I]) 这里的“.N”是什么?我很难看出它在做什么。 - hachiko
.N是组中行数的数量。它是data.tablen()在dplyr中的等效物。这行代码使用替换方式对这些行进行了.N长度的样本抽样 - 这就是自助法的定义方式。 - dww

2

我想我找到了解决这个问题的答案。但我仍希望得到帮助,让代码更加简洁 - 你可以看到我复制了很多内容。

mtcars_mpg_wt <- mtcars %>%
  select(mpg, wt)

boots <- bootstraps(mtcars_mpg_wt, times = 250, apparent = TRUE)
boots

# glimpse(boots) 
# dim(mtcars)

fit_nls_on_bootstrap <- function(split) {
    lm(mpg ~ wt, analysis(split))
}

library(purrr)

boot_models <-
  boots %>% 
  dplyr::mutate(model = map(splits, fit_nls_on_bootstrap),
         coef_info = map(model, tidy))

boot_coefs <- 
  boot_models %>% 
  unnest(coef_info)

percentile_intervals <- int_pctl(boot_models, coef_info)
percentile_intervals

ggplot(boot_coefs, aes(estimate)) +
  geom_histogram(bins = 30) +
  facet_wrap( ~ term, scales = "free") +
  labs(title="", subtitle = "mpg ~ wt - Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "95% Confidence Interval Parameter Estimates, Intercept + Estimate") +
  geom_vline(aes(xintercept = .lower), data = percentile_intervals, col = "blue") +
  geom_vline(aes(xintercept = .upper), data = percentile_intervals, col = "blue")




boot_aug <- 
  boot_models %>% 
  sample_n(50) %>% 
  mutate(augmented = map(model, augment)) %>% 
  unnest(augmented)

# boot_aug <- 
#   boot_models %>% 
#   sample_n(200) %>% 
#   mutate(augmented = map(model, augment)) %>% 
#   unnest(augmented)

# boot_aug
glimpse(boot_aug)


ggplot(boot_aug, aes(wt, mpg)) +
  geom_line(aes(y = .fitted, group = id), alpha = .3, col = "blue") +
  geom_point(alpha = 0.005) +
  # ylim(5,25) +
    labs(title="", subtitle = "mpg ~ wt \n Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "coefficient stability testing") 


boot_aug_1 <- boot_aug %>%
  mutate(factor = "wt")






mtcars_mpg_disp <- mtcars %>%
  select(mpg, disp)

boots <- bootstraps(mtcars_mpg_disp, times = 250, apparent = TRUE)
boots

# glimpse(boots) 
# dim(mtcars)

fit_nls_on_bootstrap <- function(split) {
    lm(mpg ~ disp, analysis(split))
}

library(purrr)

boot_models <-
  boots %>% 
  dplyr::mutate(model = map(splits, fit_nls_on_bootstrap),
         coef_info = map(model, tidy))

boot_coefs <- 
  boot_models %>% 
  unnest(coef_info)

percentile_intervals <- int_pctl(boot_models, coef_info)
percentile_intervals

ggplot(boot_coefs, aes(estimate)) +
  geom_histogram(bins = 30) +
  facet_wrap( ~ term, scales = "free") +
  labs(title="", subtitle = "mpg ~ wt - Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "95% Confidence Interval Parameter Estimates, Intercept + Estimate") +
  geom_vline(aes(xintercept = .lower), data = percentile_intervals, col = "blue") +
  geom_vline(aes(xintercept = .upper), data = percentile_intervals, col = "blue")




boot_aug <- 
  boot_models %>% 
  sample_n(50) %>% 
  mutate(augmented = map(model, augment)) %>% 
  unnest(augmented)

# boot_aug <- 
#   boot_models %>% 
#   sample_n(200) %>% 
#   mutate(augmented = map(model, augment)) %>% 
#   unnest(augmented)

# boot_aug
glimpse(boot_aug)


ggplot(boot_aug, aes(disp, mpg)) +
  geom_line(aes(y = .fitted, group = id), alpha = .3, col = "blue") +
  geom_point(alpha = 0.005) +
  # ylim(5,25) +
    labs(title="", subtitle = "mpg ~ disp \n Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "coefficient stability testing") 



boot_aug_2 <- boot_aug %>%
  mutate(factor = "disp")




mtcars_mpg_drat <- mtcars %>%
  select(mpg, drat)

boots <- bootstraps(mtcars_mpg_drat, times = 250, apparent = TRUE)
boots

# glimpse(boots) 
# dim(mtcars)

fit_nls_on_bootstrap <- function(split) {
    lm(mpg ~ drat, analysis(split))
}

library(purrr)

boot_models <-
  boots %>% 
  dplyr::mutate(model = map(splits, fit_nls_on_bootstrap),
         coef_info = map(model, tidy))

boot_coefs <- 
  boot_models %>% 
  unnest(coef_info)

percentile_intervals <- int_pctl(boot_models, coef_info)
percentile_intervals

ggplot(boot_coefs, aes(estimate)) +
  geom_histogram(bins = 30) +
  facet_wrap( ~ term, scales = "free") +
  labs(title="", subtitle = "mpg ~ wt - Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "95% Confidence Interval Parameter Estimates, Intercept + Estimate") +
  geom_vline(aes(xintercept = .lower), data = percentile_intervals, col = "blue") +
  geom_vline(aes(xintercept = .upper), data = percentile_intervals, col = "blue")




boot_aug <- 
  boot_models %>% 
  sample_n(50) %>% 
  mutate(augmented = map(model, augment)) %>% 
  unnest(augmented)

# boot_aug <- 
#   boot_models %>% 
#   sample_n(200) %>% 
#   mutate(augmented = map(model, augment)) %>% 
#   unnest(augmented)

# boot_aug
glimpse(boot_aug)


ggplot(boot_aug, aes(drat, mpg)) +
  geom_line(aes(y = .fitted, group = id), alpha = .3, col = "blue") +
  geom_point(alpha = 0.005) +
  # ylim(5,25) +
    labs(title="", subtitle = "mpg ~ wt \n Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "coefficient stability testing") 


boot_aug_3 <- boot_aug %>%
  mutate(factor = "drat")








mtcars_mpg_hp <- mtcars %>%
  select(mpg, hp)

boots <- bootstraps(mtcars_mpg_hp, times = 250, apparent = TRUE)
boots

# glimpse(boots) 
# dim(mtcars)

fit_nls_on_bootstrap <- function(split) {
    lm(mpg ~ hp, analysis(split))
}

library(purrr)

boot_models <-
  boots %>% 
  dplyr::mutate(model = map(splits, fit_nls_on_bootstrap),
         coef_info = map(model, tidy))

boot_coefs <- 
  boot_models %>% 
  unnest(coef_info)

percentile_intervals <- int_pctl(boot_models, coef_info)
percentile_intervals

ggplot(boot_coefs, aes(estimate)) +
  geom_histogram(bins = 30) +
  facet_wrap( ~ term, scales = "free") +
  labs(title="", subtitle = "mpg ~ wt - Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "95% Confidence Interval Parameter Estimates, Intercept + Estimate") +
  geom_vline(aes(xintercept = .lower), data = percentile_intervals, col = "blue") +
  geom_vline(aes(xintercept = .upper), data = percentile_intervals, col = "blue")




boot_aug <- 
  boot_models %>% 
  sample_n(50) %>% 
  mutate(augmented = map(model, augment)) %>% 
  unnest(augmented)

# boot_aug <- 
#   boot_models %>% 
#   sample_n(200) %>% 
#   mutate(augmented = map(model, augment)) %>% 
#   unnest(augmented)

# boot_aug
glimpse(boot_aug)


ggplot(boot_aug, aes(hp, mpg)) +
  geom_line(aes(y = .fitted, group = id), alpha = .3, col = "blue") +
  geom_point(alpha = 0.005) +
  # ylim(5,25) +
    labs(title="", subtitle = "mpg ~ wt \n Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "coefficient stability testing") 


boot_aug_4 <- boot_aug %>%
  mutate(factor = "hp")











mtcars_mpg_qsec <- mtcars %>%
  select(mpg, qsec)

boots <- bootstraps(mtcars_mpg_qsec, times = 250, apparent = TRUE)
boots

# glimpse(boots) 
# dim(mtcars)

fit_nls_on_bootstrap <- function(split) {
    lm(mpg ~ qsec, analysis(split))
}

library(purrr)

boot_models <-
  boots %>% 
  dplyr::mutate(model = map(splits, fit_nls_on_bootstrap),
         coef_info = map(model, tidy))

boot_coefs <- 
  boot_models %>% 
  unnest(coef_info)

percentile_intervals <- int_pctl(boot_models, coef_info)
percentile_intervals

ggplot(boot_coefs, aes(estimate)) +
  geom_histogram(bins = 30) +
  facet_wrap( ~ term, scales = "free") +
  labs(title="", subtitle = "mpg ~ wt - Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "95% Confidence Interval Parameter Estimates, Intercept + Estimate") +
  geom_vline(aes(xintercept = .lower), data = percentile_intervals, col = "blue") +
  geom_vline(aes(xintercept = .upper), data = percentile_intervals, col = "blue")




boot_aug <- 
  boot_models %>% 
  sample_n(50) %>% 
  mutate(augmented = map(model, augment)) %>% 
  unnest(augmented)

# boot_aug <- 
#   boot_models %>% 
#   sample_n(200) %>% 
#   mutate(augmented = map(model, augment)) %>% 
#   unnest(augmented)

# boot_aug
glimpse(boot_aug)


ggplot(boot_aug, aes(qsec, mpg)) +
  geom_line(aes(y = .fitted, group = id), alpha = .3, col = "blue") +
  geom_point(alpha = 0.005) +
  # ylim(5,25) +
    labs(title="", subtitle = "mpg ~ wt \n Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "coefficient stability testing") 


boot_aug_5 <- boot_aug %>%
  mutate(factor = "qsec")


boot_aug_total <- bind_rows(boot_aug_1, boot_aug_2, boot_aug_3, boot_aug_4, boot_aug_5)

boot_aug_total <- boot_aug_total %>%
  select(disp, drat, hp, qsec, wt, mpg, .fitted, id, factor)

boot_aug_total_2 <- pivot_longer(boot_aug_total, names_to = 'names', values_to = 'values', 1:5)

boot_aug_total_2 <- boot_aug_total_2 %>%
  drop_na()
  


ggplot(boot_aug_total_2, aes(values, mpg)) +
  geom_line(aes(y = .fitted, group = id), alpha = .3, col = "blue") +
  geom_point(alpha = 0.005) +
  # ylim(5,25) +
    labs(title="", subtitle = " \n Linear Regression Bootstrap Resampling") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(caption = "coefficient stability testing") +
  facet_wrap(~factor, scales = 'free')

这里输入图片描述

对比

这里输入图片描述


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接