这是对
dplyr
的非常拙劣的使用,但可能符合其精神。
> df %>% mutate(m = do.call(pmin, select(df, ends_with("_num"))))
id sth1 tg1_num sth2 tg2_num others m
1 1 dave 2 ca 35 new 2
2 2 tom 5 tn -3 old -3
3 3 jane -3 al 0 new -3
4 4 leroy 0 az 25 old 0
5 5 jerry 4 mi 55 old 4
从那里,您可以添加一个 filter(m >= 0)
来获得您想要的答案。如果存在类似于 rowMeans
的 rowMins
,那么这将显着简化此过程。
> rowMins <- function(df) { do.call(pmin, df) }
> df %>% mutate(m = rowMins(select(df, ends_with("_num"))))
id sth1 tg1_num sth2 tg2_num others m
1 1 dave 2 ca 35 new 2
2 2 tom 5 tn -3 old -3
3 3 jane -3 al 0 new -3
4 4 leroy 0 az 25 old 0
5 5 jerry 4 mi 55 old 4
我不知道这是否高效。嵌套select
看起来真的很丑。
编辑3:借鉴了其他解决方案/评论的想法(感谢@Vlo),我可以大大加快我的速度(不幸的是,类似的优化会使@Vlo的解决方案速度更快(编辑4:哎呀,读错图表了,我是最快的,好的,就不再讨论了))
df %>% select(ends_with("_num")) %>% rowMins %>% {df[. >= 0,]}
编辑:出于好奇,我对一些解决方案进行了微基准测试(编辑2:添加了更多的解决方案)
microbenchmark(rowmins(df), rowmins2(df), reducer(df), sapplyer(df), grepapply(df), tchotchke(df), withrowsums(df), reducer2(df))
Unit: microseconds
expr min lq mean median uq max
rowmins(df) 1373.452 1431.9700 1732.188 1576.043 1729.410 5147.847
rowmins2(df) 836.885 875.9900 1015.364 913.285 1038.729 2510.339
reducer(df) 990.096 1058.6645 1217.264 1201.159 1297.997 3103.809
sapplyer(df) 14119.236 14939.8755 16820.701 15952.057 16612.709 66023.721
grepapply(df) 12907.657 13686.2325 14517.140 14485.520 15146.294 17291.779
tchotchke(df) 2770.818 2939.6425 3114.233 3036.926 3172.325 4098.161
withrowsums(df) 1526.227 1627.8185 1819.220 1722.430 1876.360 3025.095
reducer2(df) 900.524 943.1265 1087.025 1003.820 1109.188 3869.993
以下是我使用的定义:
rowmins <- function(df) {
df %>%
mutate(m = rowMins(select(df, ends_with("_num")))) %>%
filter(m >= 0) %>%
select(-m)
}
rowmins2 <- function(df) {
df %>% select(ends_with("_num")) %>% rowMins %>% {df[. >= 0,]}
}
reducer <- function(df) {
df %>%
select(matches("_num$")) %>%
lapply(">=", 0) %>%
Reduce(f = "&", .) %>%
which %>%
slice(.data = df)
}
reducer2 <- function(df) {
df %>%
select(matches("_num$")) %>%
lapply(">=", 0) %>%
Reduce(f = "&", .) %>%
{df[.,]}
}
sapplyer <- function(df) {
nums <- sapply(df, is.numeric)
df[apply(df[, nums], MARGIN=1, function(x) all(x >= 0)), ]
}
grepapply <- function(df) {
cond <- df[, grepl("_num$", colnames(df))] >= 0
df[apply(cond, 1, function(x) {prod(x) == 1}), ]
}
tchotchke <- function(df) {
pattern <- "_num$"
ind <- grep(pattern, colnames(df))
target_columns <- colnames(df)[ind]
desired_rows <- sapply(target_columns, function(x) which(df[,x]<0), simplify=TRUE)
as.vector(unique(unlist(desired_rows)))
}
withrowsums <- function(df) {
df %>% mutate(m=rowSums(select(df, ends_with("_num"))>0)) %>% filter(m==2) %>% select(-m)
}
df <- data.frame(id=1:10000, sth1=sample(LETTERS, 10000, replace=T), tg1_num=runif(10000,-1,1), tg2_num=runif(10000,-1, 1))
df %>% select(matches("_num$"))
? - Vlo