使用累积函数在R中识别连续重复项。

4

让我分享一个例子,来说明我想做的事情。因为标题可能不太清晰。

data <- tibble(week=1:10,name=c(rep("Joe",10)),value=c(.9,.89,.99,.98,.87,.89,.93,.92,.98,.9),
               wanted = c("Yes","Skip","No","No","Yes","Skip","Yes","Skip","No","Yes"))

data <- data %>% mutate(my_attempt = case_when( week-lag(week)==1 & 
                                          value < .95 &
                                          lag(value) < .95 &
                                          lag(value,2) >= .95 &
                                          !is.na(lag(value,2))~ "Skip",
                                        week-lag(week)==1 & 
                                          value < .95 &
                                          lag(value) < .95 &
                                          is.na(lag(value,2))~ "Skip",
                                        value < .95 ~"Yes",
                                        TRUE ~ "No"))    

 #   week name  value wanted my_attempt
 #  <int> <chr> <dbl> <chr>  <chr>     
 #     1 Joe    0.9  Yes    Yes       
 #     2 Joe    0.89 Skip   Skip      
 #     3 Joe    0.99 No     No        
 #     4 Joe    0.98 No     No        
 #     5 Joe    0.87 Yes    Yes       
 #     6 Joe    0.89 Skip   Skip      
 #     7 Joe    0.93 Yes    Yes       
 #     8 Joe    0.92 Skip   Yes       
 #     9 Joe    0.98 No     No        
 #    10 Joe    0.9  Yes    Yes    

我试图让my_attempt列输出wanted列的结果。我想要识别值小于某个阈值时的行,但不能有两个连续的“yes”值。我尝试过,但是当出现4个或更多连续的低值时就失效了。在我的实际数据中,一些周可能会缺失,但可以视为“否”。例如,如果第6周缺失,则第7周仍然可以是“Yes”(我认为这种情况下的第一行已经解决了这个问题)。是否有一种方法可以在R中实现这一点?它不必与dplyr一致,但如果能够在tidyverse中实现的话就更好了。

3个回答

2

我认为你可以在这里使用purrr:accumulate()

library(purrr)
library(dplyr)

data%>%mutate(my_attempt = ifelse(week-lag(week, default = 0)==1 & 
                                          value < .95,
                                  'Yes', 'No')%>%
        accumulate(~ifelse(.x==.y & .y=='Yes', 'Skip', .y)))

# A tibble: 10 x 5
    week name  value wanted my_attempt
   <int> <chr> <dbl> <chr>  <chr>     
 1     1 Joe    0.9  Yes    Yes       
 2     2 Joe    0.89 Skip   Skip      
 3     3 Joe    0.99 No     No        
 4     4 Joe    0.98 No     No        
 5     5 Joe    0.87 Yes    Yes       
 6     6 Joe    0.89 Skip   Skip      
 7     7 Joe    0.93 Yes    Yes       
 8     8 Joe    0.92 Skip   Skip      
 9     9 Joe    0.98 No     No        
10    10 Joe    0.9  Yes    Yes 

1
这里优雅地使用了 accumulate - Anoushiravan R
2
从我们的朋友@AnilGoyal学到的 - GuedesBF
是的,我们的共同朋友很棒。 - Anoushiravan R
1
它完美地运行了!谢谢@AnoushiravanR。@GuedesBF,如果在示例中value小于0.95,则我希望my_attempt返回“Yes”。但是,如果上一周已经有了“Yes”,则我希望它返回“Skip”。 - syntax_error
1
我已经添加了一个基于 R 的解决方案和一个与之不太不同的 tidyverse 解决方案。 - Anoushiravan R
显示剩余2条评论

2

我会使用类似于 slider 的滚动计算库来完成这项任务,其中缺失的数据可以很好地进行索引。以下是修改后的数据示例:

library(tidyverse)
data <- tibble(week=c(1:5, 7:10),name=c(rep("Joe",9)),value=c(.9,.89,.99,.98,.87,.93,.92,.98,.9),
               wanted = c("Yes","Skip","No","No","Yes","Yes","Skip","No","Yes"))

data
#> # A tibble: 9 x 4
#>    week name  value wanted
#>   <int> <chr> <dbl> <chr> 
#> 1     1 Joe    0.9  Yes   
#> 2     2 Joe    0.89 Skip  
#> 3     3 Joe    0.99 No    
#> 4     4 Joe    0.98 No    
#> 5     5 Joe    0.87 Yes   
#> 6     7 Joe    0.93 Yes   
#> 7     8 Joe    0.92 Skip  
#> 8     9 Joe    0.98 No    
#> 9    10 Joe    0.9  Yes
library(slider)

data %>% group_by(name) %>%
  mutate(wanted2 = case_when(value < 0.95 & slide_index_lgl(.x = value, 
                                                            .i = week, 
                                                            .f = ~  any(.x < 0.95), 
                                                            .before = 1, 
                                                            .after = -1) ~ 'skip',
                             value < 0.95 ~ 'yes',
                             TRUE ~ 'no'))
#> # A tibble: 9 x 5
#> # Groups:   name [1]
#>    week name  value wanted wanted2
#>   <int> <chr> <dbl> <chr>  <chr>  
#> 1     1 Joe    0.9  Yes    yes    
#> 2     2 Joe    0.89 Skip   skip   
#> 3     3 Joe    0.99 No     no     
#> 4     4 Joe    0.98 No     no     
#> 5     5 Joe    0.87 Yes    yes    
#> 6     7 Joe    0.93 Yes    yes    
#> 7     8 Joe    0.92 Skip   skip   
#> 8     9 Joe    0.98 No     no     
#> 9    10 Joe    0.9  Yes    yes

即使不使用slider,也可以只使用dplyr完成。

library(dplyr)
data %>% group_by(name) %>%
  mutate(wanted2 = case_when(value < 0.95 & lag(value, default = 1) < 0.95 & week - 1 == lag(week, default = 0) ~ 'Skip',
                             value < 0.95 ~ 'Yes',
                             TRUE ~ 'No'))

#> # A tibble: 9 x 5
#> # Groups:   name [1]
#>    week name  value wanted wanted2
#>   <int> <chr> <dbl> <chr>  <chr>  
#> 1     1 Joe    0.9  Yes    Yes    
#> 2     2 Joe    0.89 Skip   Skip   
#> 3     3 Joe    0.99 No     No     
#> 4     4 Joe    0.98 No     No     
#> 5     5 Joe    0.87 Yes    Yes    
#> 6     7 Joe    0.93 Yes    Yes    
#> 7     8 Joe    0.92 Skip   Skip   
#> 8     9 Joe    0.98 No     No     
#> 9    10 Joe    0.9  Yes    Yes

reprex 包(版本2.0.0)于2021年07月25日创建


1
这里是一个简单的dplyr解决方案:
library(dplyr)

data %>%
  mutate(grp = cummax(week - lag(week, default = 0))) %>%
  group_by(name, grp) %>%
  mutate(my_attempt = ifelse(value < 0.95 & lag(value, default = 1) < 0.95, "Skip", 
                             ifelse(value < 0.95 & lag(value, default = 1) >= 0.95, 
                                    "Yes", "No")))

# A tibble: 9 x 6
# Groups:   name, grp [2]
   week name  value wanted   grp my_attempt
  <int> <chr> <dbl> <chr>  <dbl> <chr>     
1     1 Joe    0.9  Yes        1 Yes       
2     2 Joe    0.89 Skip       1 Skip      
3     3 Joe    0.99 No         1 No        
4     4 Joe    0.98 No         1 No        
5     5 Joe    0.87 Yes        1 Yes       
6     7 Joe    0.93 Yes        2 Yes       
7     8 Joe    0.92 Skip       2 Skip      
8     9 Joe    0.98 No         2 No        
9    10 Joe    0.9  Yes        2 Yes 

以下是如何在具有缺失周值的数据集上使用base::Reduce完成此操作。首先,我根据周值之间的差异创建了一个分组变量grp,然后基于该分组变量对数据集进行了拆分。之后,在每个块上应用了我们的函数,并使用rbind绑定了结果:

do.call(rbind, lapply(split(data, cummax(abs(data$week - c(0, data$week[-nrow(data)]))), data$name), 
                      \(x){
                        x$my_attept <- Reduce(function(a, b) {
                          if(x$value[b] < 0.95 & a != "Yes") {
                            "Yes"
                          } else if(x$value[b] < 0.95 & a == "Yes") {
                            "Skip"
                          } else {
                            "No"
                          }
                        }, 2:nrow(x), init = ifelse(x$value[1] < 0.95, "Yes", "No"), accumulate = TRUE)
                        x
                      }))

# A tibble: 9 x 5
   week name  value wanted my_attept
* <int> <chr> <dbl> <chr>  <chr>    
1     1 Joe    0.9  Yes    Yes      
2     2 Joe    0.89 Skip   Skip     
3     3 Joe    0.99 No     No       
4     4 Joe    0.98 No     No       
5     5 Joe    0.87 Yes    Yes      
6     7 Joe    0.93 Yes    Yes      
7     8 Joe    0.92 Skip   Skip     
8     9 Joe    0.98 No     No       
9    10 Joe    0.9  Yes    Yes 

如果您的数据中存在缺失的周数,就像这里修改后的数据集一样,您可以使用以下解决方案。我们首先根据它们的连续值对周进行分组,然后在每个组上应用我们的解决方案:
data %>%
  mutate(grp = cummax(week - lag(week, default = 0))) %>%
  group_by(name, grp) %>%
  mutate(my_attept = accumulate(value[-1], .init = ifelse(value[1] < 0.95, "Yes", "No"),
                                ~ if(.y < 0.95 & .x != "Yes") {
                                  "Yes"
                                } else if(.y < 0.95 & .x == "Yes") {
                                  "Skip"
                                } else {
                                  "No"
                                }))

# A tibble: 9 x 6
# Groups:   grp [2]
   week name  value wanted   grp my_attept
  <int> <chr> <dbl> <chr>  <dbl> <chr>    
1     1 Joe    0.9  Yes        1 Yes      
2     2 Joe    0.89 Skip       1 Skip     
3     3 Joe    0.99 No         1 No       
4     4 Joe    0.98 No         1 No       
5     5 Joe    0.87 Yes        1 Yes      
6     7 Joe    0.93 Yes        2 Yes      
7     8 Joe    0.92 Skip       2 Skip     
8     9 Joe    0.98 No         2 No       
9    10 Joe    0.9  Yes        2 Yes 

数据

structure(list(week = c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L), 
    name = c("Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", 
    "Joe", "Joe"), value = c(0.9, 0.89, 0.99, 0.98, 0.87, 0.93, 
    0.92, 0.98, 0.9), wanted = c("Yes", "Skip", "No", "No", "Yes", 
    "Yes", "Skip", "No", "Yes")), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"))

1
请在group_by中包含name,亲爱的朋友。 - AnilGoyal

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接