你可以尝试以下任一方法:
使用 base R 的 aggregate 函数:
# Count the ClaimDay entries for each value of day using the formula interface.
aggregate(ClaimDay ~ day, data = mydata, FUN = length)
使用 tapply 函数:
# Count the ClaimDay entries per day with tapply(), then reshape the result
# into a data frame. The tapply() result is converted with as.table() first
# because `responseName` is only honoured by the table method of
# as.data.frame(); without it the argument is silently ignored. The day labels
# end up in the first column and the counts in the `ClaimDay` column.
as.data.frame(
  as.table(tapply(mydata$ClaimDay, mydata$day, length)),
  responseName = "ClaimDay"
)
使用 by 函数:
# Apply length() to the ClaimDay values of each day group.
by(
  data = mydata$ClaimDay,
  INDICES = mydata$day,
  FUN = length,
  simplify = TRUE
)
使用 dplyr
library(dplyr)
# Tally the number of rows for each value of day.
count(mydata, day)
使用 data.table
library(data.table)
# Count the rows per day. Wrapping j in .() gives the result column the name
# ClaimDay (a bare parenthesised expression produces an auto-named column
# instead); .N is data.table's built-in per-group row count.
data.table(mydata)[, .(ClaimDay = .N), by = day]
使用 plyr:
library(plyr)
# Split mydata by day and summarise each piece to a single count column.
ddply(
  .data = mydata,
  .variables = ~day,
  .fun = summarise,
  ClaimDay = length(day)
)
使用 sqldf:
library(sqldf)
# Run a SQL GROUP BY against the mydata data frame to count entries per day.
sqldf(
  x = "select count(ClaimDay) as ClaimDay, day from mydata group by day"
)
各方法的基准测试结果如下:
library('microbenchmark')
# Time each counting approach over 500 repetitions on the same mydata.
microbenchmark(
  agg = aggregate(ClaimDay ~ day, FUN = length, data = mydata),
  # count() is an exported dplyr function, so `::` is sufficient here.
  # NOTE(review): the explicit namespace is presumably needed because plyr is
  # attached after dplyr and also provides count() -- confirm.
  dplyr = mydata %>% dplyr::count(day),
  data.table = data.table(mydata)[, (ClaimDay = length(ClaimDay)), by = day],
  plyr = ddply(mydata, ~day, summarise, ClaimDay = length(day)),
  tapply = as.data.frame(
    tapply(mydata$ClaimDay, mydata$day, length),
    responseName = "ClaimDay"
  ),
  sqldf = sqldf(
    "select count(ClaimDay) as ClaimDay, day from mydata group by day"
  ),
  by = by(mydata$ClaimDay, mydata$day, length, simplify = TRUE),
  times = 500
)
Unit: microseconds
expr min lq mean median uq max neval cld
agg 1280.399 1408.2675 1655.8207 1458.9445 1845.331 7732.426 500 c
dplyr 1019.102 1177.3345 1350.3923 1220.0995 1356.736 3835.208 500 b
data.table 1690.092 1883.8190 2208.6055 1957.1630 2234.283 5493.653 500 d
plyr 2334.995 2482.7495 2847.0871 2554.5960 2944.404 6620.096 500 e
tapply 226.658 273.0580 342.0902 304.0635 353.244 2748.965 500 a
sqldf 8395.718 9057.0870 10458.0976 9440.2650 11389.515 61480.071 500 f
by 353.243 415.0395 492.2115 449.2520 509.765 4331.287 500 a