这是一个非常好的基准测试机会!以下是使用不同样本大小(N = 900、2700、10800)运行plyr
方法(由@agstudy建议)与data.table
方法(由Arun建议)进行比较。
总结:
data.table
方法的性能是plyr
方法的7.5倍。
plyrMethod <- quote({
dfw<-dcast(df,
formula = Par1+Par2~Type,
value.var="Val",
fun.aggregate=mean)
dat <- ddply(df,.(Par1,Par2),function(x){
data.frame(ParD=paste(paste(x$ParD),collapse='_'),
Num.pre =length(x$Type[x$Type =='pre']),
Num.post = length(x$Type[x$Type =='post']))
})
merge(dfw,dat)
})
dtMethod <- quote(
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
Num.pre=length(Val[Type == "pre"]),
Num.post=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
)
reduceMethod <- quote(
Reduce(merge, list(
dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=mean),
setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=length), c("Par1", "Par2", "Num.post",
"Num.pre")),
aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
))
)
castddplyMethod <- quote(
reshape::cast(Par1 + Par2 + ParD ~ Type,
data = ddply(df, .(Par1, Par2), transform,
ParD = paste(ParD, collapse = "_")),
fun = c(mean, length)
)
)
library(data.table)
library(plyr)
library(reshape2)
library(rbenchmark)
LLL <- apply(expand.grid(LETTERS, LETTERS, LETTERS, stringsAsFactors=FALSE), 1, paste0, collapse="")
lll <- apply(expand.grid(letters, letters, letters, stringsAsFactors=FALSE), 1, paste0, collapse="")
size <- 17568
size <- 10800
size <- 900
set.seed(1)
df<-data.frame(Par1=rep(LLL[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
, Par2=rep(lll[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
, ParD=sample(unlist(lapply(c("f", "b"), paste0, lll)), size, FALSE)
, Type=rep(c("pre","post"), size/2)
, Val =sample(seq(10,100,10), size, TRUE)
)
dt <- data.table(df, key=c("Par1", "Par2"))
DF1 <- eval(plyrMethod)
DF2 <- eval(dtMethod)
colOrder <- sort(names(DF1))
DF1 <- DF1[, colOrder]
DF2 <- as.data.frame(DF2)[, colOrder]
DF2$ParD <- factor(DF2$ParD, levels=levels(DF1$ParD))
identical((DF1), (DF2))
结果
benchmark(plyr=eval(plyrMethod), dt=eval(dtMethod), reduce=eval(reduceMethod), castddply=eval(castddplyMethod),
replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
order="relative")
relative test elapsed user.self sys.self replications
1.000 reduce 0.392 0.375 0.018 5
1.003 dt 0.393 0.377 0.016 5
7.064 plyr 2.769 2.721 0.047 5
8.003 castddply 3.137 3.030 0.106 5
relative test elapsed user.self sys.self replications
1.000 dt 1.371 1.327 0.090 5
2.205 reduce 3.023 2.927 0.102 5
7.291 plyr 9.996 9.644 0.377 5
relative test elapsed user.self sys.self replications
1.000 dt 8.678 7.168 1.507 5
2.769 reduce 24.029 23.231 0.786 5
6.946 plyr 60.277 52.298 7.947 5
13.796 castddply 119.719 113.333 10.816 5
relative test elapsed user.self sys.self replications
1.000 dt 27.421 13.042 14.470 5
4.030 reduce 110.498 75.853 34.922 5
5.414 plyr 148.452 105.776 43.156 5
更新:已添加baseMethod1的结果
relative test elapsed user.self sys.self replications
1.000 dt 0.044 0.043 0.001 5
7.773 plyr 0.342 0.339 0.003 5
65.614 base1 2.887 2.866 0.028 5
Where
baseMethod1 <- quote({
step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
piece1 <- tapply(x$Val, x$Type, mean)
piece2 <- tapply(x$Type, x$Type, length)
names(piece2) <- paste0("Num.", names(piece2))
out <- x[1, 1:2]
out[, 3:6] <- c(piece1, piece2)
names(out)[3:6] <- names(c(piece1, piece2))
out$ParD <- paste(unique(x$ParD), collapse="_")
out
})
data.frame(do.call(rbind, step3), row.names=NULL)
})
更新2:将DT作为度量的一部分进行关键字索引
根据@MatthewDowle的评论,为了公平起见,将索引步骤添加到基准测试中。
然而,假设使用data.table,它将替代data.frame,并且索引将仅发生一次,而不仅仅是针对此过程。
dtMethod.withkey <- quote({
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
Num.pre=length(Val[Type == "pre"]),
Num.post=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
})
relative test elapsed user.self sys.self replications
1.000 dt 9.155 7.055 2.137 5
1.043 dt.withkey 9.553 7.245 2.353 5
3.567 reduce 32.659 31.196 1.586 5
6.703 plyr 61.364 54.080 7.600 5
更新 3:对于 @Arun 原始回答的编辑进行基准测试
dtMethod.MD1 <- quote(
dt[, list(pre=mean(Val[.pre <- Type=="pre"]),
post=mean(Val[.post <- Type=="post"]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
)
dtMethod.MD2 <- quote(
dt[, { .pre <- Type=="pre"
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]
)
dtMethod.MD3 <- quote(
dt[, { .pre <- Type=="pre"
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = list(ParD)) }
, by=list(Par1, Par2)]
)
benchmark(dt.M1=eval(dtMethod.MD1), dt.M2=eval(dtMethod.MD2), dt.M3=eval(dtMethod.MD3), dt=eval(dtMethod),
replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
order="relative")
Comparing the different data.table methods amongst themselves
relative test elapsed user.self sys.self replications
1.000 dt.M3 0.198 0.197 0.001 5 <~~~ "list()" Method
1.242 dt.M1 0.246 0.243 0.004 5
1.253 dt.M2 0.248 0.242 0.007 5
1.884 dt 0.373 0.367 0.007 5
relative test elapsed user.self sys.self replications
1.000 dt.M3 33.492 24.487 9.122 5 <~~~ "list()" Method
1.086 dt.M1 36.388 11.442 25.086 5
1.086 dt.M2 36.388 10.845 25.660 5
1.126 dt 37.701 13.256 24.535 5
Comparing MD3 ("list" method) with MD1 (best of DT non-list methods)
Using a clean session (ie, removing string cache)
_Note: Ran the following twice, fresh session each time, with practically identical results
Then re-ran in the *same* session, with reps=5. Results very different._
benchmark(dt.M1=eval(dtMethod.MD1), dt.M3=eval(dtMethod.MD3), replications=1, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), order="relative")
relative test elapsed user.self sys.self replications
1.000 dt.M1 8.885 4.260 4.617 1
1.633 dt.M3 14.506 12.821 1.677 1
relative test elapsed user.self sys.self replications
1.000 dt.M1 33.443 10.200 23.226 5
1.048 dt.M3 35.060 26.127 8.915 5
New benchmarks against previous methods
_Note: Not using the "list method" as results are not the same as other methods_
relative test elapsed user.self sys.self replications
1.000 dt.M1 0.254 0.247 0.008 5
1.705 reduce 0.433 0.425 0.010 5
11.280 plyr 2.865 2.842 0.031 5
relative test elapsed user.self sys.self replications
1.000 dt.M1 24.826 10.427 14.458 5
4.348 reduce 107.935 70.107 38.314 5
5.942 plyr 147.508 106.958 41.083 5