目前 @thelatemail 给出的方法速度更快:
# Key both tables on the join column.
setkey(dt1, code)
setkey(dt2, code)
# Default every row's flag to FALSE ...
dt1[, in_dt2 := FALSE]
# ... then an update join sets it to TRUE for rows whose code also occurs in dt2.
dt1[dt2, on = .(code), in_dt2 := TRUE]
我认为您需要的是连接(join)操作,而设置键(`setkey`)可以加快速度:
# Key both tables on the join column.
setkey(dt1, code)
setkey(dt2, code)
# Inner join: nomatch = 0L drops the dt1 rows that have no matching code in dt2.
existing <- dt2[dt1, on = .(code), nomatch = 0L]
# A dt1 row is flagged when its row id appears in the join result.
dt1[, in_dt2 := dt1row %in% existing[["dt1row"]]]
另一个基于 base R 的方案是使用 `base::match`:
# Base-R variant: return a copy of DT10 with a logical in_dt2 column that is
# TRUE where the row's code occurs in dt2$code.
m0 <- function() {
  # match() yields 0L for codes that are absent from dt2$code.
  pos_in_dt2 <- match(DT10$code, dt2$code, nomatch=0L)
  DT10$in_dt2 <- pos_in_dt2 > 0L
  DT10
}
# Join variant: inner-join DT11 against dt2 and flag the DT11 row ids that
# came back from the join. DT11 is keyed and updated by reference.
m1 <- function() {
  setkey(DT11, code)
  # mult="first" keeps at most one dt2 match per DT11 row; nomatch=0L drops misses.
  matched <- dt2[DT11, on = .(code), nomatch = 0L, mult = "first"]
  matched_rows <- matched$dt1row
  DT11[, in_dt2 := dt1row %in% matched_rows]
}
# match() variant inside data.table: add the in_dt2 flag to DT12 by reference.
m2 <- function() {
  lookup_codes <- dt2$code
  # match() yields 0L for codes that are absent from the lookup vector.
  DT12[, in_dt2 := match(code, lookup_codes, nomatch = 0L) > 0L]
}
# Update-join variant: default the flag to FALSE, then set it to TRUE for the
# DT13 rows whose code matches a row of dt2. DT13 is modified by reference.
m_thelatemail <- function() {
  setkey(DT13, code)
  DT13[, in_dt2 := FALSE]
  DT13[dt2, on = .(code), in_dt2 := TRUE]
}
# Time the four approaches. check=FALSE turns off bench::mark's comparison of
# the expressions' results.
bench::mark(m0(), m1(), m2(), m_thelatemail(), check=FALSE)
# Verify each in-place variant against m0(): read its flags in dt1row order
# (setkey() re-sorts DT11 and DT13 by code) and compare with m0()'s column.
identical(DT11[order(dt1row), in_dt2], m0()$in_dt2)
identical(DT12[order(dt1row), in_dt2], m0()$in_dt2)
identical(DT13[order(dt1row), in_dt2], m0()$in_dt2)
计时结果:
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 m0() 914ms 914ms 1.09 38.3MB 1.09 1 1 914ms <df[,3] [1,000,000 x 3]> <df[,3] [10 x 3]> <bch:tm> <tibble [1 x 3]>
2 m1() 252ms 273ms 3.66 36.8MB 1.83 2 1 547ms <df[,3] [1,000,000 x 3]> <df[,3] [33 x 3]> <bch:tm> <tibble [2 x 3]>
3 m2() 198ms 252ms 4.14 23.1MB 2.76 3 2 724ms <df[,3] [1,000,000 x 3]> <df[,3] [10 x 3]> <bch:tm> <tibble [3 x 3]>
4 m_thelatemail() 148ms 158ms 6.38 15.4MB 0 4 0 627ms <df[,3] [1,000,000 x 3]> <df[,3] [28 x 3]> <bch:tm> <tibble [4 x 3]>
`m0()` 的输出:
dt1row code in_dt2
1: 1 nydga FALSE
2: 2 bwknr FALSE
3: 3 sauxj FALSE
4: 4 vnjgi FALSE
5: 5 ouein FALSE
999996: 999996 wiucs FALSE
999997: 999997 yqjrp FALSE
999998: 999998 elort FALSE
999999: 999999 asjyh FALSE
1000000: 1000000 lmbjw FALSE
数据:
library(data.table)
set.seed(0L)
nr <- 1e6

# Build n random codes, each the concatenation of 5 distinct lowercase letters.
# vapply() pins the result type to character (sapply()'s return type depends on
# its input), and seq_len() stays correct when n is 0, unlike 1:n.
random_codes <- function(n) {
  vapply(
    seq_len(n),
    function(i) paste(sample(letters, 5), collapse = ""),
    character(1)
  )
}

# dt1 is the table to be flagged; dt2 has half as many rows and is the lookup
# table. dt1's codes are drawn before dt2's, so the random stream is consumed
# in the same order as before.
dt1 <- data.table(dt1row = seq_len(nr), code = random_codes(nr))
dt2 <- data.table(dt2row = seq_len(nr / 2), code = random_codes(nr / 2))

# Each benchmarked function gets its own copy, because the data.table variants
# modify their table by reference.
DT10 <- copy(dt1)
DT11 <- copy(dt1)
DT12 <- copy(dt1)
DT13 <- copy(dt1)
setkey(dt2, code)
一个建议:在使用 `sample` 随机生成数据时,可以先调用 `set.seed` 设置随机种子,以保证结果可复现。
# Default the flag to FALSE, then set it to TRUE for rows whose code matches dt2.
dt1[, in_dt2 := FALSE]
dt1[dt2, on = .(code), in_dt2 := TRUE]
- thelatemail
`setkey(dt1, code)` 在我的机器上大约需要 6 秒钟。所以 `m2()` 可能是个非常好的选择(或者采用一个略有不同、基本只是语法糖的替代写法:`dt1[, in_dt2 := code %in% dt2$code]`)。 - Cole