高效地在查找表中找到向量的所有匹配项，包括重复的。

Question

高效地在查找表中找到向量的所有匹配项，包括重复的。

12

我想要找到一个向量 x 在另一个查找向量 table 中所有匹配项的索引。

table = rep(1:5, each=3)
x = c(2, 5, 2, 6)

标准的基础R方法并不能完全满足我的需求。例如，使用which(table %in% x)，我们只能得到匹配的索引一次，即使x中出现了两次2。

which(table %in% x)
# [1]  4  5  6 13 14 15

另一方面，match 返回每个与 x 匹配的值，但只返回查找表中的第一个索引。

match(x, table)
# [1]  4 13  4 NA

我想要的是一个返回“所有x和所有y”的索引的函数。也就是说，它应该返回以下期望的结果：

mymatch(x, table)
# c(4, 5, 6, 13, 14, 15, 4, 5, 6)

我们当然可以用R中的循环来实现这个。

mymatch = function(x, table) {
  matches = sapply(x, \(xx) which(table %in% xx)) 
  unlist(matches)
}

mymatch(x, table)
# [1]  4  5  6 13 14 15  4  5  6

但是在大数据上这个速度太慢了（我需要在大数据上多次执行这个操作）

table = rep(1:1e5, each=10)
x = sample(1:100, 1000, replace = TRUE)
system.time(mymatch(x, table))
#  user  system elapsed 
# 3.279   2.881   6.157

这个速度非常慢，如果我们与which %in%进行比较的话。

system.time(which(table %in% x))
#  user  system elapsed 
# 0.003   0.004   0.008

希望在R中有一种快速的方法来做这个？否则，也许Rcpp是一个可行的选择。

- dww

当我在更大的样本数据上运行mymatch时，我得到一个10x1000的矩阵，这是期望的输出结构吗？ - undefined

@JonSpring - 不，这并不是故意的。你发现得很好。期望的输出格式是一个简单的向量。 - undefined

匹配的值是否总是整数？ - undefined

此外，如果您多次运行它，您是否会使用不同的向量重复使用同一张表格？如果是的话，几乎可以肯定地将其处理成一个中间对象，以消除大部分重复工作。 - undefined

@anjama 是的，整数，而且表在迭代之间保持不变。 - undefined

显示剩余3条评论

6个回答

9

也许data.table是一个选择？如果你有相对较大的表/向量，你可能会看到速度上的提升，尤其是如果你采用类似Jon Spring的“join”方法。

library(tidyverse)
library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:lubridate':
#> 
#>     hour, isoweek, mday, minute, month, quarter, second, wday, week,
#>     yday, year
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
#> The following object is masked from 'package:purrr':
#> 
#>     transpose
library(microbenchmark)

onyambu_faster <- function(x, table){
  a <- rle(table)
  n <- length(a$lengths)
  idx <- match(x, a$values, 0)
  sequence(a$lengths[idx], cumsum(c(1,a$lengths[-n]))[idx])
}

jon_spring_join_match = function(x, table) {
  t <- data.frame(table, index = 1:length(table))
  xt <- data.frame(x, index = 1:length(x))
  t |>
    left_join(xt, join_by(table == x), relationship = 'many-to-many') |>
    arrange(index.y) %>%
    filter(!is.na(index.y)) %>%
    pull(index.x)
}

jared_mamrot_dt <- function(x, table){
  table_dt <- data.table(table, index = 1:length(table))
  x_dt <- data.table(x, index = 1:length(x))
  return(na.omit(table_dt[x_dt, on = .(table == x)][,index]))
}

table = rep(1:1e5, each=10)
x = sample(1:100, 1000, replace = TRUE)

all.equal(onyambu_faster(x, table), jared_mamrot_dt(x, table))
#> [1] TRUE
all.equal(jon_spring_join_match(x, table), jared_mamrot_dt(x, table))
#> [1] TRUE

res <- microbenchmark(onyambu_faster(x, table),
                      jon_spring_join_match(x, table),
                      jared_mamrot_dt(x, table),
                      times = 10)
res
#> Unit: milliseconds
#>                             expr       min       lq      mean   median
#>         onyambu_faster(x, table) 38.196317 45.08884  65.22651 52.40748
#>  jon_spring_join_match(x, table) 48.697968 74.54407 105.79551 83.11473
#>        jared_mamrot_dt(x, table)  9.441176 11.34315  12.99648 11.76324
#>         uq       max neval cld
#>   64.88688 129.38505    10 a  
#>  131.50681 221.16477    10  b 
#>   14.05289  21.84779    10   c
autoplot(res)

^{2023-10-26创建，使用reprex v2.0.2}

- jared_mamrot

现在我很好奇，DuckDB、Arrow或者Collapse可能会更快一些... - undefined

9

应该使用连接操作更快。这个速度比之前快100倍以上。

library(dplyr)
join_match = function(x, table) {
  t <- data.frame(table, index = 1:length(table))
  xt <- data.frame(x, index = 1:length(x))
  t |>
    left_join(xt, join_by(table == x), relationship = 'many-to-many') |>
    arrange(index.y) %>%
    filter(!is.na(index.y)) %>%
    pull(index.x)
}

同样的输出，速度快100-200倍，比@Onyambu提出的基本R建议快大约3倍（注意：该方法已经更新为类似速度，而使用data.table解决方案甚至更快。使用duckdb、arrow或collapse进行连接可能会更快。但我的观察是，通过将其视为连接，您可以获得显著的速度提升+可读性）：

set.seed(42)
table = rep(1:1e5, each=10)
x = sample(1:100, 1000, replace = TRUE)
bench::mark(
  mymatch(x, table) |> as.vector(),
  join_match(x, table),
  unlist(split(seq(table), table)[as.character(x)],use.names = FALSE),
  check = TRUE
)

# A tibble: 3 × 13
  expression                                                                min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result
  <bch:expr>                                                           <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>
1 as.vector(mymatch(x, table))                                            13.8s    13.8s    0.0727    14.9GB     2.83     1    39      13.8s <int> 
2 join_match(x, table)                                                   48.7ms   62.2ms   13.8       88.8MB     3.95     7     2    506.3ms <int> 
3 unlist(split(seq(table), table)[as.character(x)], use.names = FALSE)  183.6ms  184.5ms    5.31      29.8MB     0        3     0    564.9ms <int>

- Jon Spring

mymatch正在生成一个矩阵。如果可以将其压缩成一个向量，输出结果将与我的函数匹配。 - undefined

6

根据问题中的数据，根据我的机器上的中位数时间，这个速度是原来的两倍快。

table = rep(1:5, each=3)
x = c(2, 5, 2, 6)

mymatch = function(x, table) {
  matches = sapply(x, \(xx) which(table %in% xx)) 
  unlist(matches)
}

outer_match <- function(x, table) {
  z1 <- outer(table, x, "==") 
  z2 <- z1 * row(z1)
  z2[z2 != 0]
}

outer_match(x, table)
## [1]  4  5  6 13 14 15  4  5  6

library(microbenchmark)
microbenchmark(
 mymatch(x, table),
 outer_match(x, table)
)
## Unit: microseconds
##                   expr  min    lq    mean median   uq    max neval cld
##      mymatch(x, table) 77.0 79.15 166.696  82.75 84.3 8384.9   100   a
##  outer_match(x, table) 35.1 36.75 115.783  41.95 43.1 7410.1   100   a

- G. Grothendieck

3

你可以简单地运行outer + row（短代码但可能不太高效，因为outer），例如，

> row(d <- outer(table, x, `==`))[d]
[1]  4  5  6 13 14 15  4  5  6

- ThomasIsCoding

0

如果要匹配的值是整数，那么你可以将它们用作包含所需索引值的列表的索引值（只要最大整数不会导致列表超出你的内存容量）。

# Process the table vector every time
anjama_list <- function(x, table) {
  l = vector("list", max(table))
  
  i = 0
  for (val in table) {
    i = i + 1
    l[[val]] = c(l[[val]], i)
  }
  
  return(unlist(l[x]))
}

现在，这个解决方案并不像其他在这里提到的那么快，但是由于您正在重复使用表格进行多次查找，我们可以预先计算列表的创建并在迭代中重复使用它。

# If the table vector is being reused, only need to process it once
l = vector("list", max(table))

i = 0
for (val in table) {
  i = i + 1
  l[[val]] = c(l[[val]], i)
}

# Now that the list is created, we can do as many lookups as we want without that cost
anjama_list_cache <- function(x, l) {
  return(unlist(l[x]))
}

原来列表查找和取消列表的部分是非常便宜的。

使用jared_mamrot的代码创建的图表。

所以，这取决于重复使用相同的表向量多少次来弥补初始设置。就内存使用而言，我认为查找和取消列表也应该非常高效（查找基本上是没有的，取消列表与您的x向量的大小有关），但我还没有尝试过对它们进行分析。

- anjama

@Onyambu 这些输入产生了与您的函数相同的结果。l[2] 返回 NULL，但使用 unlist() 时会被忽略。 - undefined

你一定是做了错误的比较。你给出的代码比原来的慢了大约10倍。每次增加列表的操作对R来说是相当耗费资源的。或许考虑在C++中完成相同的操作，然后在R中调用该函数会更高效。但这种方法并不高效。 - undefined

@Onyambu 是的，每次创建列表的速度较慢，如图中的第二行所示。但是OP已经指出查找向量是被重复使用的，这意味着列表只需要在多次运行中创建一次。图中的第一行显示了列表创建后的性能，大约快了100倍。因此，只要列表被重复使用足够多次，创建它的初始成本就会被抵消。 - undefined

如果是这样的话，那你仍然在进行错误的比较。请注意，如果表格只创建一次，那么我提供的函数中的 rle 和 cumsum 应该在比较之外完成，而不是在比较之内，就像你创建表格一样在比较之外完成一样。 - undefined

1

我只是指出你只比较了一半。也就是说，如果将相同的参数应用于其他部分，你的解决方案将会比较慢。你的解决方案没问题，只是比较给人一种错误的快速印象。 - undefined

显示剩余4条评论

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- Onyambu · Accepted Answer

另一种方法是使用split函数：

unlist(split(seq(table), table)[as.character(x)],use.names = FALSE)
[1]  4  5  6 13 14 15  4  5  6

编辑：

请注意，如果表格已经排序，那么你可以使用rle + sequence方法：

faster <- function(x, table){
  a <- rle(table)
  n <- length(a$lengths)
  idx <- match(x, a$values, 0)
  sequence(a$lengths[idx], cumsum(c(1,a$lengths[-n]))[idx])
}

set.seed(42)
table = rep(1:1e5, each=10)
x = sample(1:100, 1000, replace = TRUE)
bench::mark(
   faster(x, table),
   #mymatch(x, table) |> as.vector(),
   join_match(x, table),
   #unlist(split(seq(table), table)[as.character(x)],use.names = FALSE),
   check = TRUE
 )


# A tibble: 2 × 13
  expression     min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory    
  <bch:expr> <bch:t> <bch:>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>    
1 faster(x,…  54.4ms  252ms      3.97    54.9MB     1.99     2     1      503ms <int>  <Rprofmem>
2 join_matc… 127.7ms  254ms      3.93    88.8MB     5.90     2     3      508ms <int>  <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>

只要表格排序，该函数就能正常工作。不一定要从1到n。

table = c(rep(1:5, each=3), 7,7,7,7,10,10)
x = c(10, 2, 5,7, 2, 6)

microbenchmark::microbenchmark(
   faster(x, table),
   #mymatch(x, table) |> as.vector(),
   join_match(x, table),
   #unlist(split(seq(table), table)[as.character(x)],use.names = FALSE),
   check = 'equal'
 )
Unit: microseconds
                 expr      min       lq       mean   median       uq       max neval
     faster(x, table)   23.001   32.751   56.95703   56.400   66.201   222.901   100
 join_match(x, table) 4216.201 4925.302 6616.51401 5572.951 7842.200 21153.402   100