如何按组用第一个非缺失值填充缺失值?

3
我有以下数据结构:
  library(dplyr)

  test_data <- data.frame(some_dimension = c(rep("first",6),rep("second",6)),
                          first_col = c(rep(NA,3),rep(1,3),rep(NA,3),rep(0,3)),
                          second_col = c(rep(NA,3),rep(0,3),rep(NA,3),rep(1,3)),
                          third_col = c(rep(NA,3),rep(1,3),rep(NA,3),rep(1,3)))

      some_dimension first_col second_col third_col
1           first        NA         NA        NA
2           first        NA         NA        NA
3           first        NA         NA        NA
4           first         1          0         1
5           first         1          0         1
6           first         1          0         1
7          second        NA         NA        NA
8          second        NA         NA        NA
9          second        NA         NA        NA
10         second         0          1         1
11         second         0          1         1
12         second         0          1         1

我想要获取以下数据结构:
  expexted_data <- data.frame(some_dimension = c(rep("first",6),rep("second",6)),
                          first_col = c(rep(0,3),rep(1,3),rep(1,3),rep(0,3)),
                          second_col = c(rep(1,3),rep(0,3),rep(0,3),rep(1,3)),
                          third_col = c(rep(0,3),rep(1,3),rep(0,3),rep(1,3)))


     some_dimension first_col second_col third_col
1           first         0          1         0
2           first         0          1         0
3           first         0          1         0
4           first         1          0         1
5           first         1          0         1
6           first         1          0         1
7          second         1          0         0
8          second         1          0         0
9          second         1          0         0
10         second         0          1         1
11         second         0          1         1
12         second         0          1         1

我希望用"1 减去所在组第一个非缺失值"的结果来填补该组的缺失值(各列的取值只有 0 和 1,也就是把 0 换成 1、把 1 换成 0)。

我尝试过以下方法,基本上是查找所有非缺失值并取最小索引。但我在正确应用该函数方面有些困难:

# Asker's attempt (does not work as posted): meant to fill the missing
# entries of `x` with the opposite (0 <-> 1) of its first non-missing value.
my_fun <- function(x){
   # Positions of all non-missing elements of x.
   all_non_missings <- which(!is.na(x))
   # Smallest such position, i.e. the index of the first non-missing value.
   # If x has no non-missing value, which() is empty and min() gives Inf
   # with a warning.
   first_non_missing <- min(all_non_missings)
   # NOTE(review): `.data` is neither an argument nor a local of this
   # function; the test was presumably meant to read x[first_non_missing].
   # This is likely related to the coercion error reported below - not verified.
   if(.data[first_non_missing] == 1){
    # NOTE(review): `is.na(x) <- value` marks the positions listed in `value`
    # as NA; it does not write 0 into the missing slots. Index 0 selects
    # nothing, so x is left unchanged here.
    is.na(x) <- rep(0, length.out = length(x))
  } else {
    # NOTE(review): same issue - this sets x[1] to NA rather than filling
    # the missing values with 1.
    is.na(x) <- rep(1, length.out = length(x))
  }
  # NOTE(review): the function's value is that of the if/else expression
  # above; the updated `x` is never returned.
}

test_data %>% group_by(some_dimension) %>% mutate_if(is.numeric, funs(new = my_fun(.)))

我经常遇到一些错误,例如:

Error in mutate_impl(.data, dots): Evaluation error: (list) object cannot be coerced to type 'double'. Traceback:

3个回答

2
尝试使用“zoo”包中的na.locf函数:
library(zoo)
test_data %>%
   group_by(some_dimension) %>% 
   mutate_if(is.numeric,funs(ifelse(is.na(.),1-na.locf(.,fromLast=TRUE),.)))
#   some_dimension first_col second_col third_col
#1           first         0          1         0
#2           first         0          1         0
#3           first         0          1         0
#4           first         1          0         1
#5           first         1          0         1
#6           first         1          0         1
#7          second         1          0         0
#8          second         1          0         0
#9          second         1          0         0
#10         second         0          1         1
#11         second         0          1         1
#12         second         0          1         1

或者更短:

test_data %>% 
  group_by(some_dimension) %>%
  mutate_if(is.numeric,funs(coalesce(.,1-na.locf(.,fromLast=TRUE))))

1
这是您发布的例子的解决方案:

test_data <- data.frame(some_dimension = c(rep("first",6),rep("second",6)),
                        first_col = c(rep(NA,3),rep(1,3),rep(NA,3),rep(0,3)),
                        second_col = c(rep(NA,3),rep(0,3),rep(NA,3),rep(1,3)),
                        third_col = c(rep(NA,3),rep(1,3),rep(NA,3),rep(1,3)))

library(dplyr)

# Within each group, replace every NA with 1 minus the group's first
# non-missing value (the columns hold only 0/1, so this flips the value).
# Indexing with [1] instead of unique() keeps the replacement a single value
# even if a group holds both 0 and 1; unique() would then return two values
# that ifelse() silently recycles position by position.
# A column with no non-missing value in a group stays NA.
test_data %>%
  group_by(some_dimension) %>%
  mutate_all(~ ifelse(is.na(.), 1 - .[!is.na(.)][1], .)) %>%
  ungroup()

# # A tibble: 12 x 4
#   some_dimension first_col second_col third_col
#   <fct>              <dbl>      <dbl>     <dbl>
# 1 first                  0          1         0
# 2 first                  0          1         0
# 3 first                  0          1         0
# 4 first                  1          0         1
# 5 first                  1          0         1
# 6 first                  1          0         1
# 7 second                 1          0         0
# 8 second                 1          0         0
# 9 second                 1          0         0
#10 second                 0          1         1
#11 second                 0          1         1
#12 second                 0          1         1

1

data.table

library(data.table)

# Per group, fill the NAs of every non-grouping column with 1 minus that
# column's first non-missing value (the columns hold only 0/1). Using the
# first non-missing value, rather than the truncated group mean, keeps the
# result correct when a group mixes 0 and 1. A column that is entirely NA
# within a group is left as NA.
# setDT() converts test_data to a data.table by reference.
setDT(test_data)[, lapply(.SD, function(x) {
  x[is.na(x)] <- 1 - x[!is.na(x)][1L]
  x
}), by = some_dimension][]

#    some_dimension first_col second_col third_col
# 1:          first         0          1         0
# 2:          first         0          1         0
# 3:          first         0          1         0
# 4:          first         1          0         1
# 5:          first         1          0         1
# 6:          first         1          0         1
# 7:         second         1          0         0
# 8:         second         1          0         0
# 9:         second         1          0         0
#10:         second         0          1         1
#11:         second         0          1         1
#12:         second         0          1         1

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接