计算时间间隔内的行数

Question

计算时间间隔内的行数

3

假设有以下数据框：

df <- tibble(ID = c(12, 12, 12, 13, 13, 13),
         times = c(as.POSIXct("2021-01-02 10:00:00"),
                   as.POSIXct("2021-01-02 11:00:00"),
                   as.POSIXct("2021-01-02 13:00:00"),
                   as.POSIXct("2021-01-02 13:00:00"),
                   as.POSIXct("2021-01-02 14:00:00"),
                   as.POSIXct("2021-01-02 15:00:00")))
        ID times              
  <dbl> <dttm>             
1    12 2021-01-02 10:00:00
2    12 2021-01-02 11:00:00
3    12 2021-01-02 13:00:00
4    13 2021-01-02 13:00:00
5    13 2021-01-02 14:00:00
6    13 2021-01-02 15:00:00

我想要的是一列数据，它以每个ID的时间戳为起始值，并计算连续出现下一个2小时内的观测次数。所以这就是我的目标:

     ID times               n_obs_within_2h
  <dbl> <dttm>                        <dbl>
1    12 2021-01-02 10:00:00               2
2    12 2021-01-02 11:00:00               2
3    12 2021-01-02 13:00:00               1
4    13 2021-01-02 13:00:00               3
5    13 2021-01-02 14:00:00               2
6    13 2021-01-02 15:00:00               1

我知道可以使用purrr::map来迭代每一行轻松完成此操作。然而，我的原始数据集非常大，这样做效率相对较低。你能想到除了逐行迭代之外的另一种方法来计算n_obs_within_2h吗？

编辑：我的当前尝试：

df %>% group_by(ID) %>% 
  mutate(n_obs_with_2h = purrr::pmap_dbl(.l = list(ID, times), 
                                         .f = function(i, t, data) {
                                           n <- data %>%
                                             filter(ID == i) %>%
                                             filter(between(as.double.difftime(times-t, units = "hours"),
                                                            0, 2)) %>%
                                             nrow()
                                           return(n)
                                         }, data = .))

- Ai4l2s

你能展示一下哪里比较慢吗？问题可能不在迭代中。 - harre

@harre 我已经添加了我的当前尝试。 - Ai4l2s

2个回答

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- Carl · Answer 1

也许可以采用向量化方法，使用滑动窗口来计算接下来2小时内的连续观测次数？

library(tidyverse)
library(lubridate)
library(slider)

df <- tibble(
  ID = c(12, 12, 12, 13, 13, 13),
  times = c(
    as.POSIXct("2021-01-02 10:00:00"),
    as.POSIXct("2021-01-02 11:00:00"),
    as.POSIXct("2021-01-02 13:00:00"),
    as.POSIXct("2021-01-02 13:00:00"),
    as.POSIXct("2021-01-02 14:00:00"),
    as.POSIXct("2021-01-02 15:00:00")
  )
)

df |>
  group_by(ID) |>
  mutate(
    diff = difftime(times, min(times), units = "hours"),
    within_2 = if_else(diff <= 2, 1, 0),
    n_obs_within_2h = slide_dbl(within_2, sum, .after = Inf)
  ) |> 
  ungroup()

#> # A tibble: 6 × 5
#>      ID times               diff    within_2 n_obs_within_2h
#>   <dbl> <dttm>              <drtn>     <dbl>           <dbl>
#> 1    12 2021-01-02 10:00:00 0 hours        1               2
#> 2    12 2021-01-02 11:00:00 1 hours        1               1
#> 3    12 2021-01-02 13:00:00 3 hours        0               0
#> 4    13 2021-01-02 13:00:00 0 hours        1               3
#> 5    13 2021-01-02 14:00:00 1 hours        1               2
#> 6    13 2021-01-02 15:00:00 2 hours        1               1

^{由 reprex package (v2.0.1) 于 2022-06-30 创建}

- harre · Answer 2

在这种情况下，使用映射中的另一种方法可能更有可能成为提高性能的关键。我们可以利用分组结构本身来代替对整个数据进行过滤，如下所示：

df |>
  group_by(ID) %>%
  mutate(n_obs_with_2h = purrr::map_dbl(times, ~ sum(difftime(times[ID == ID], ., units = "hours") <= 2 & difftime(times[ID == ID], ., units = "hours") >= 0))) %>%
  ungroup() 

# A tibble: 6 × 3
#     ID times               n_obs_with_2h
#  <dbl> <dttm>                      <dbl>
#     12 2021-01-02 10:00:00             2
#     12 2021-01-02 11:00:00             2
#     12 2021-01-02 13:00:00             1
#     13 2021-01-02 13:00:00             3
#     13 2021-01-02 14:00:00             2
#     13 2021-01-02 15:00:00             1

参考基准：（即使提供的数据太少，无法保证其可靠性。话虽如此，我预计在更大的数据集上速度会更快）

fun_original <- function(df) {
  
  df %>% group_by(ID) %>% 
    mutate(n_obs_with_2h = purrr::pmap_dbl(.l = list(ID, times), 
                                           .f = function(i, t, data) {
                                             n <- data %>%
                                               filter(ID == i) %>%
                                               filter(between(as.double.difftime(times-t, units = "hours"),
                                                              0, 2)) %>%
                                               nrow()
                                             return(n)
                                           }, data = .)) %>% ungroup()
  
}

fun_new <- function(df) {
  
  df |>
    group_by(ID) |>
    mutate(n_obs_with_2h = purrr::map_dbl(times, ~ sum(difftime(times[ID == ID], ., units = "hours") <= 2 & difftime(times[ID == ID], ., units = "hours") >= 0))) |>
    ungroup() 
  
}

bench::mark(fun_original(df), fun_new(df))

# A tibble: 2 × 13
#  expression            min   median `itr/sec` mem_alloc #`gc/sec` n_itr  n_gc total_time result  
#  <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>  
#  fun_original(df)  15.53ms  16.38ms      59.9   45.77KB     15.6    23     6      384ms <tibble>
#  fun_new(df)        1.74ms   1.95ms     486.     6.02KB     10.8   224     5      461ms <tibble>