# Build a synthetic data set of N rows: a unique integer ID plus four
# categorical columns. Repeating a level inside a sampling pool makes that
# level proportionally more likely to be drawn.
N <- 10000
d <- data.frame(
  # seq_len() yields an empty vector for N = 0, whereas seq(1, N) would count
  # down (1, 0) and make data.frame() fail on mismatched column lengths.
  ID = seq_len(N),
  v1 = sample(c("M", "F", "M", "L"), N, replace = TRUE),
  v2 = sample(c("D", "M", "D", "D"), N, replace = TRUE),
  v3 = sample(c("F", "G", "F", "E"), N, replace = TRUE),
  v4 = sample(c("A", "B", "A", "B"), N, replace = TRUE)
)
使用data.table(最快)
# data.table approach: convert the data frame once, then count the rows of
# every (v1, v2, v3, v4) combination with the special symbol .N.
dt <- data.table::as.data.table(d)
dt[, .N, by = .(v1, v2, v3, v4)]
使用dplyr
# dplyr approach: count the rows of every (v1, v2, v3, v4) combination.
# count() with bare column names replaces the deprecated standard-evaluation
# variant count_() and groups by the same columns.
dplyr::count(d, v1, v2, v3, v4)
使用plyr
# Two plyr routes to the same tally: count() tallies the column combinations
# directly, while ddply() splits d by the same columns and applies nrow to
# each piece.
plyr::count(
  d,
  vars = c("v1", "v2", "v3", "v4")
)
plyr::ddply(
  d,
  .variables = c("v1", "v2", "v3", "v4"),
  .fun = nrow
)
使用聚合函数(最慢)
# Base R approach: for every (v1, v2, v3, v4) combination, the length of the
# ID values in that group is its row count.
aggregate(ID ~ v1 + v2 + v3 + v4, data = d, FUN = length)
基准测试
# Time each counting approach, evaluating every expression 1000 times.
# The expressions are passed inline because microbenchmark uses their
# deparsed text as the row labels of its report.
# NOTE(review): dplyr::count_() appears to be deprecated in current dplyr
# releases; confirm before re-running this benchmark.
microbenchmark::microbenchmark(dt[, .N, by = c('v1','v2','v3','v4')],
plyr::count(d, vars = c('v1','v2','v3','v4')),
plyr::ddply(d, .variables = c('v1','v2','v3','v4'), nrow),
dplyr::count_(d, vars = c('v1','v2','v3','v4')),
aggregate(ID ~ ., d, FUN = length),
times = 1000)
Unit: microseconds
expr min lq mean median uq max neval cld
dt[, .N, by = c("v1", "v2", "v3", "v4")] 887.807 1107.543 1263.777 1174.258 1289.724 4263.156 1000 a
plyr::count(d, vars = c("v1", "v2", "v3", "v4")) 3912.791 4270.387 5379.080 4498.053 5791.743 157146.103 1000 c
plyr::ddply(d, .variables = c("v1", "v2", "v3", "v4"), nrow) 7737.874 8553.370 10630.849 9018.266 11126.517 187301.696 1000 d
dplyr::count_(d, vars = c("v1", "v2", "v3", "v4")) 2126.913 2432.957 2763.499 2568.251 2789.386 12549.669 1000 b
aggregate(ID ~ ., d, FUN = length) 7395.440 8121.828 10546.659 8776.371 10858.263 210139.759 1000 d
使用 data.table 似乎是最好的选择,因为它速度最快,而且计数时无需借助其他函数或库。请注意,在大型数据集上,aggregate 函数的性能要慢得多。
最后说明:欢迎随时补充新的方法。
根据 args(plyr::count),我猜 plyr::count(d, c('v1','v2','v3','v4')) 应该是正确的写法。可能还有一个值得考虑的 dplyr::count。 - Frank
length(ID)
似乎不会给出正确的结果,因为 ID 中有一些重复值。通常最好验证一下各种方法的结果是否一致。哦,算了,我想 ID 具体包含什么值其实并不重要。顺便说一下,data.table 之所以快,原因在 ?GForce 中有说明。 - Frank
tapply(d$ID, d[, -1], length)
和 table(d[, -1]) 也很快。 - Frank