这是一个使用
data.table
1.8.7 中的
fread
的示例。
这些示例来自于
fread
的帮助页面,在我的 Windows XP Core 2 Duo E8400 上进行了时间测试。
library(data.table)
n=1e6
DT = data.table( a=sample(1:1000,n,replace=TRUE),
b=sample(1:1000,n,replace=TRUE),
c=rnorm(n),
d=sample(c("foo","bar","baz","qux","quux"),n,replace=TRUE),
e=rnorm(n),
f=sample(1:1000,n,replace=TRUE) )
DT[2,b:=NA_integer_]
DT[4,c:=NA_real_]
DT[3,d:=NA_character_]
DT[5,d:=""]
DT[2,e:=+Inf]
DT[3,e:=-Inf]
标准 read.table
write.table(DT,"test.csv",sep=",",row.names=FALSE,quote=FALSE)
cat("File size (MB):",round(file.info("test.csv")$size/1024^2),"\n")
system.time(DF1 <- read.csv("test.csv",stringsAsFactors=FALSE))
system.time(DF1 <- read.csv("test.csv",stringsAsFactors=FALSE))
优化的 read.table
system.time(DF2 <- read.table("test.csv",header=TRUE,sep=",",quote="",
stringsAsFactors=FALSE,comment.char="",nrows=n,
colClasses=c("integer","integer","numeric",
"character","numeric","integer")))
fread
require(data.table)
system.time(DT <- fread("test.csv"))
## user system elapsed
## 3.12 0.01 3.22
sqldf
require(sqldf)
system.time(SQLDF <- read.csv.sql("test.csv",dbname=NULL))
f <- file("test.csv")
system.time(SQLf <- sqldf("select * from f", dbname = tempfile(), file.format = list(header = T, row.names = F)))
ff / ffdf
require(ff)
system.time(FFDF <- read.csv.ffdf(file="test.csv",nrows=n))
## user system elapsed
## 10.85 0.10 10.99
总之:
feather
的读取速度更快,但它使用了更多的存储空间。 - Z boson