在R中计算两个数据表之间每行匹配元素的数量

Question

在R中计算两个数据表之间每行匹配元素的数量

9

在R中，我有两个数据框，需要逐行比较并统计匹配的元素个数。最终希望得到一个结果表：其行数等于两个表笛卡尔积的大小，每行包含两个表中对应行的ID以及匹配元素的数量。这两个表都很大，行数不同，但列数相同。以下是我目前的代码，但在需要多次运行时速度较慢。

library(data.table)

# Toy data: two 4 x 6 integer tables with overlapping value ranges
# (1..24 and 11..34).
table_1 <- data.table(matrix(1:24, nrow = 4))
table_2 <- data.table(matrix(11:34, nrow = 4))

# Name the value columns s1..s6 and a1..a6.
setnames(table_1, paste0("s", 1:6))
setnames(table_2, paste0("a", 1:6))

# Add a running row identifier to each table ...
table_1[, ID := seq_len(.N)]
table_2[, ID_ap := seq_len(.N)]

# ... and move it to the front.
setcolorder(table_1, c("ID", paste0("s", 1:6)))
setcolorder(table_2, c("ID_ap", paste0("a", 1:6)))

# Cross join of two data.tables: every row of X paired with every row of Y.
# A constant helper column k is added to both sides, the tables are joined on
# it (allow.cartesian permits the row blow-up), and k is dropped again.
CJ.table <- function(X, Y) {
  keyed_x <- X[, c(k = 1, .SD)]
  setkey(keyed_x, k)
  keyed_y <- Y[, c(k = 1, .SD)]
  crossed <- keyed_x[keyed_y, allow.cartesian = TRUE]
  crossed[, k := NULL]
}

# One row per (table_1 row, table_2 row) pair.
join <- CJ.table(table_1, table_2)

# Result skeleton: just the two row identifiers.
R <- join[, c("ID_ap", "ID"), with = FALSE]

# Ac = number of (s_i, a_j) column pairs holding the same value, i.e. the sum
# of all 6 x 6 elementwise comparisons between the s- and a-columns.
s_cols <- paste0("s", 1:6)
a_cols <- paste0("a", 1:6)
R$Ac <- Reduce(
  `+`,
  lapply(s_cols, function(s) {
    Reduce(`+`, lapply(a_cols, function(a) join[[s]] == join[[a]]))
  })
)

输出结果如下：

   R
   ID_ap ID Ac
 1:     1  1  0
 2:     1  2  0
 3:     1  3  4
 4:     1  4  0
 5:     2  1  0
 6:     2  2  0
 7:     2  3  0
 8:     2  4  4
 9:     3  1  3
10:     3  2  0
11:     3  3  0
12:     3  4  0
13:     4  1  0
14:     4  2  3
15:     4  3  0
16:     4  4  0

- insu_liko

你的“数据框”的维度是多少？其中包含什么样的值？ - alexis_laz

两个矩阵分别约有10k行和100行，里面填充的都是较小的非零正整数。 - insu_liko

在一行中，值始终是不同的吗？ - Frank

1

是的，在一行内数值总是不同的，而且在矩阵中行也总是不同的。但是在两个矩阵之间可能存在相等的行。@Frank - insu_liko

1

关于您的CJ.table，您可能会对这个问题感兴趣：https://dev59.com/woPba4cB1Zd3GeqPsnh5 - Frank

5个回答

3

假设行数乘以两个表中唯一值的数量不是很大：

# Flatten each table, column after column, into one plain unnamed vector.
x1 <- unlist(table_1, recursive = FALSE, use.names = FALSE)
x2 <- unlist(table_2, recursive = FALSE, use.names = FALSE)

取两个表中所有唯一值的并集：

# Shared lookup table: every distinct value found in either table.
lvs <- union(x1, x2)

并为每个表中的每一行中的每个唯一值列出其出现次数：

# Per-row occurrence counts: entry [r, v] is how often lvs[v] appears in row r.
# Each cell (r, v) is encoded as the linear index r + (v - 1) * n_rows, so a
# single tabulate() call counts every cell at once. The row index is recycled
# along the flattened (column-after-column) values.
count_per_row <- function(values, n_rows) {
  level_pos <- match(values, lvs)
  cell <- seq_len(n_rows) + (level_pos - 1L) * n_rows
  matrix(tabulate(cell, n_rows * length(lvs)), n_rows, length(lvs))
}
tab1 <- count_per_row(x1, nrow(table_1))
tab2 <- count_per_row(x2, nrow(table_2))

最后：

# Pairwise match counts: entry [i, j] compares row i of table_1 with row j of
# table_2. Use 'tcrossprod(tab1 > 0L, tab2 > 0L)' to not count duplicate
# matches.
tcrossprod(tab1, tab2)
#     [,1] [,2] [,3] [,4]
#[1,]    0    0    3    0
#[2,]    0    0    0    3
#[3,]    4    0    0    0
#[4,]    0    4    0    0

# To change format (among different ways): one (row, col, count) triple per
# pair of rows.
ans <- tcrossprod(tab1, tab2)
cbind(as.vector(row(ans)), as.vector(col(ans)), as.vector(ans))

如果 tab1 和 tab2 很大，它们可以被构建成稀疏矩阵，一种方法是：

library(Matrix)
# Sparse per-row count tables via xtabs(): rows are table rows, columns are
# the shared value levels in 'lvs'.
# The factor is built from the values themselves. Wrapping them in match()
# first would produce positions that are then looked up among the *values* in
# 'lvs', which is only correct when lvs happens to equal seq_along(lvs);
# otherwise entries silently become NA or land in the wrong column.
stab1 <- xtabs(rep_len(1L, length(x1)) ~
                 rep_len(seq_len(nrow(table_1)), length(x1)) +
                 factor(x1, lvs),
               sparse = TRUE)
stab2 <- xtabs(rep_len(1L, length(x2)) ~
                 rep_len(seq_len(nrow(table_2)), length(x2)) +
                 factor(x2, lvs),
               sparse = TRUE)
# Entry [i, j]: number of values shared by row i of table_1 and row j of
# table_2.
tcrossprod(stab1, stab2)
#4 x 4 sparse Matrix of class "dgCMatrix"
#  1 2 3 4
#1 . . 3 .
#2 . . . 3
#3 4 . . .
#4 . 4 . .

编辑

如果每一行都有(1)小的正整数值和(2)不同的值，可以避免使用match/unique/union来创建查找表和制表。

# Values are small positive integers, so each value can serve directly as its
# own column index; no lookup table is needed.
x1 <- unlist(table_1, recursive = FALSE, use.names = FALSE)
x2 <- unlist(table_2, recursive = FALSE, use.names = FALSE)
nlvs <- max(x1, x2)

# Sparse row-by-value matrix with a 1 at [r, v] when value v occurs in row r.
row_indicator <- function(values, n_rows) {
  sparseMatrix(i = rep_len(seq_len(n_rows), length(values)),
               j = values,
               x = 1L,
               dims = c(n_rows, nlvs))
}
stab1 <- row_indicator(x1, nrow(table_1))
stab2 <- row_indicator(x2, nrow(table_2))
tcrossprod(stab1, stab2)
#4 x 4 sparse Matrix of class "dgCMatrix"
#
#[1,] . . 3 .
#[2,] . . . 3
#[3,] 4 . . .
#[4,] . 4 . .

# Triplet view: only the non-zero (i, j, count) entries.
summary(tcrossprod(stab1, stab2))
#4 x 4 sparse Matrix of class "dgCMatrix", with 4 entries
#  i j x
#1 3 1 4
#2 4 2 4
#3 1 3 3
#4 2 4 3

- alexis_laz

2

这篇文章中没有明确说明性能要求。但是，我已经创建了一个更大的可重现示例（如下），并且问题中的代码已经非常快了。

以下是如何在基本 R 中完成它：

# Base-R variant: work on plain data.frames instead of data.tables.
t1 <- as.data.frame(table_1)
t2 <- as.data.frame(table_2)

# Everything inside system.time() is what the timings quoted for this answer
# measure.
system.time({
  ## compute all combinations of indices
  ## (t1[1] and t2[1] are the one-column ID frames; they share no column
  ## name, so merge() returns their cross product)
  indices <- merge(t1[1], t2[1])

  ## create a mega df including all rows, cbinded together
  ## (the -1 drops the ID column of each table, keeping only the values)
  rows <- cbind(t1[indices[ ,"ID"], -1], t2[indices[ , "ID_ap"], -1])

  ## logical masks marking which columns of `rows` came from which table
  t1_cols <- names(rows) %in% names(t1)
  t2_cols <- names(rows) %in% names(t2)

  ## compute the counts; this step takes most of the time
  ## ~ 14 of the 18 seconds in this example
  ## (per row: how many t1 values also appear among the t2 values)
  counts <- apply(rows, 1, function(r) sum(r[t1_cols] %in% r[t2_cols]))
})
# Final result: one row per (ID, ID_ap) pair with its match count.
out <- data.frame(indices, Ac=counts)

例如，对于下面的大规模可重现问题（dim(out) == c(1e6, 3)），以上代码在不到20秒的时间内运行。

   user  system elapsed
 17.879   0.348  18.245

编辑：大规模可重现的问题：

library(data.table)
# Benchmark-sized inputs: a 1e4 x 6 and a 1e2 x 6 table. The 24 seed values
# are recycled to fill each matrix.
NROW <- 1e4
NROW2 <- 1e2
table_1 <- data.table(matrix(1:24, nrow = NROW, ncol = 6))
table_2 <- data.table(matrix(11:34, nrow = NROW2, ncol = 6))

# Name the value columns s1..s6 and a1..a6.
setnames(table_1, paste0("s", 1:6))
setnames(table_2, paste0("a", 1:6))

# Add a running row identifier to each table ...
table_1[, ID := seq_len(.N)]
table_2[, ID_ap := seq_len(.N)]

# ... and move it to the front.
setcolorder(table_1, c("ID", paste0("s", 1:6)))
setcolorder(table_2, c("ID_ap", paste0("a", 1:6)))

这位楼主的解决方案比这个答案跑得快得多

# Cross join of two data.tables: every row of X paired with every row of Y.
# A constant helper column k is added to both sides, the tables are joined on
# it (allow.cartesian permits the row blow-up), and k is dropped again.
CJ.table <- function(X, Y) {
  keyed_x <- X[, c(k = 1, .SD)]
  setkey(keyed_x, k)
  keyed_y <- Y[, c(k = 1, .SD)]
  crossed <- keyed_x[keyed_y, allow.cartesian = TRUE]
  crossed[, k := NULL]
}

# Build the full cross join and the ID-only result skeleton. Both steps sit
# outside system.time(), so the timing below does not include them.
join<-CJ.table(table_1,table_2)

R<-subset(join, select=c("ID_ap","ID"))

# Timed part: Ac is the sum of all 6 x 6 elementwise comparisons between the
# s-columns and the a-columns, i.e. the number of matching (s_i, a_j) pairs
# per joined row.
system.time({
   R$Ac<- (join$s1 == join$a1) + (join$s1 ==join$a2) + (join$s1 ==join$a3) + (join$s1 ==join$a4) + (join$s1 ==join$a5) + (join$s1 ==join$a6)+
  (join$s2 == join$a1) + (join$s2 ==join$a2) + (join$s2 ==join$a3) + (join$s2 ==join$a4) + (join$s2 ==join$a5) + (join$s2 ==join$a6)+
  (join$s3 == join$a1) + (join$s3 ==join$a2) + (join$s3 ==join$a3) + (join$s3 ==join$a4) + (join$s3 ==join$a5) + (join$s3 ==join$a6)+
  (join$s4 == join$a1) + (join$s4 ==join$a2) + (join$s4 ==join$a3) + (join$s4 ==join$a4) + (join$s4 ==join$a5) + (join$s4 ==join$a6)+
  (join$s5 == join$a1) + (join$s5 ==join$a2) + (join$s5 ==join$a3) + (join$s5 ==join$a4) + (join$s5 ==join$a5) + (join$s5 ==join$a6)+
  (join$s6 == join$a1) + (join$s6 ==join$a2) + (join$s6 ==join$a3) + (join$s6 ==join$a4) + (join$s6 ==join$a5) + (join$s6 ==join$a6)
})
# Recorded timing for the block above:
#    user  system elapsed
# 0.295   0.044   0.339

但是弗兰克的答案提供的解决方案仍然更快。

# Give both tables the same ID column name so they can be stacked.
setnames(table_2, old = "ID_ap", new = "ID")

# Long format: one (ID, value) row per cell. The column name carried by
# `variable` is irrelevant here and is dropped. `.id` records the source
# table: 1 = table_1, 2 = table_2.
tabs <- rbindlist(
  lapply(list(table_1, table_2), function(tbl) {
    melt(tbl, id.vars = "ID")[, variable := NULL]
  }),
  use.names = TRUE,
  idcol = TRUE
)

# Timed part, in two chained steps:
#   1. per value: if the value occurs in more than one source table
#      (uniqueN(.id) > 1L), emit every (table_1 ID, table_2 ID) pair sharing it;
#   2. per (ID1, ID2) pair: count the emitted rows (.N), i.e. the number of
#      shared values.
# The construction of `tabs` is not included in the timing.
system.time({out3 <- tabs[,
        if (uniqueN(.id) > 1L) CJ(ID1 = ID[.id == 1L], ID2 = ID[.id == 2L])
        , by=value][,
           .N
        , by=.(ID1, ID2)]})
# Recorded timing for the block above:
#   user  system elapsed
#  0.109   0.013   0.122

- jaimedash

1

感谢您发布了一个基准测试。我希望有人会这样做。我很想知道其他人是否表现良好。很难知道在 system.time() 调用中包含/排除什么。我认为我的 tabs 创建应该放在外面（就像您在这里所做的那样），因为数据应该从一开始就以这种方式存储。但是 OP 的 join 和 R 的创建应该在调用内部，因为它们只是这个计算过程的一部分（但它们在这里是在调用外部）。不过这相当主观。 - Frank

谢谢。我对你的回答印象深刻，现在想学习一些data.table。我也同意主观性。 - jaimedash

2

如果我们将OP的join和R放在system.time中，用户时间会增加到0.590。我也尝试了@eddi的colSums行，它几乎和我的一样慢（user = 10.581）。 - jaimedash

2

可以考虑这样做：

# For each joined row, count the values that repeat among the twelve s/a
# columns. This equals the match count only when values are distinct within a
# row of each table, because then a repeat can only be an s/a match.
# `with = FALSE` is spelled out because `F` is an ordinary, reassignable
# variable.
colSums(apply(join[, !c("ID", "ID_ap"), with = FALSE], 1, duplicated))
#[1] 0 0 4 0 0 0 0 4 3 0 0 0 0 3 0 0

或者，从头开始：

# Key both tables on their ID columns so rows can be looked up by ID.
setkey(table_1, ID)
setkey(table_2, ID_ap)

# Every (table_1 ID, table_2 ID) combination.
ids <- CJ(ID1 = table_1$ID, ID2 = table_2$ID_ap)

# Per pair: fetch both rows without their ID columns, concatenate them and
# count repeated values. This equals the match count only when values are
# distinct within a row of each table.
# `with = FALSE` is spelled out because `F` is an ordinary, reassignable
# variable.
ids[, sum(duplicated(c(table_1[.(ID1), !"ID", with = FALSE],
                       table_2[.(ID2), !"ID_ap", with = FALSE])))
    , by = .(ID1, ID2)]
#    ID1 ID2 V1
# 1:   1   1  0
# 2:   1   2  0
# 3:   1   3  3
# 4:   1   4  0
# 5:   2   1  0
# 6:   2   2  0
# 7:   2   3  0
# 8:   2   4  3
# 9:   3   1  4
#10:   3   2  0
#11:   3   3  0
#12:   3   4  0
#13:   4   1  0
#14:   4   2  4
#15:   4   3  0
#16:   4   4  0

- eddi

1

下面是一种可能的做法：

# Runnable script form of the console session (prompts removed, printed output
# kept as comments).
t1 <- data.frame(matrix(1:24, nrow = 4))
t2 <- data.frame(matrix(11:34, nrow = 4))

# One row per (row of t1, row of t2) pair; r1 varies fastest.
# seq_len() is used instead of 1:nrow() so an empty table yields no pairs.
ret <- expand.grid(r1 = seq_len(nrow(t1)), r2 = seq_len(nrow(t2)))

# For each pair: how many values of row r1 of t1 also occur in row r2 of t2.
# unlist() turns the one-row data.frames into plain vectors before matching.
ret$matches <- vapply(
  seq_len(nrow(ret)),
  function(k) sum(unlist(t1[ret$r1[k], ]) %in% unlist(t2[ret$r2[k], ])),
  integer(1)
)
ret
#    r1 r2 matches
# 1   1  1       0
# 2   2  1       0
# 3   3  1       4
# 4   4  1       0
# 5   1  2       0
# 6   2  2       0
# 7   3  2       0
# 8   4  2       4
# 9   1  3       3
# 10  2  3       0
# 11  3  3       0
# 12  4  3       0
# 13  1  4       0
# 14  2  4       3
# 15  3  4       0
# 16  4  4       0

- mrip

谢谢@mrip，对于小矩阵来说运行得很好，但是当使用像10k行这样的大矩阵时，它比原始代码花费更长的时间，我会深入研究一下。 - insu_liko

1

如果你有一个大数据集并且需要速度，我建议 (1) 不要使用 data.frame 或 data.table，而是使用矩阵；(2) 使用 Rcpp 编写更高效的匹配器。 - mrip

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- Frank · Accepted Answer

将数据放入长格式，因为列的顺序不重要：

# Give both tables the same ID column name so they can be stacked.
setnames(table_2, old = "ID_ap", new = "ID")

# Long format: one (ID, value) row per cell. The column name carried by
# `variable` is irrelevant here and is dropped. `.id` records the source
# table: 1 = table_1, 2 = table_2.
tabs <- rbindlist(
  lapply(list(table_1, table_2), function(tbl) {
    melt(tbl, id.vars = "ID")[, variable := NULL]
  }),
  use.names = TRUE,
  idcol = TRUE
)

（1）针对每个数值，确定相关的配对；

（2）对于配对，计算数值：

# Step 1: for every value that occurs in both source tables, list all
# (table_1 ID, table_2 ID) pairs sharing it. Values seen in only one table
# contribute no rows.
shared_pairs <- tabs[,
  if (uniqueN(.id) > 1L) CJ(ID1 = ID[.id == 1L], ID2 = ID[.id == 2L]),
  by = value
]

# Step 2: the number of shared values per pair is simply its row count.
shared_pairs[, .N, by = .(ID1, ID2)]


   ID1 ID2 N
1:   3   1 4
2:   4   2 4
3:   1   3 3
4:   2   4 3

我认为，所有其他的(ID1, ID2)组合都是零，不需要明确列出。

如果值在每个表中都是唯一的，就像OP的例子一样，那么我们可以简化：

# When a value occurs at most once per table, a group of exactly two rows
# holds one row from each table: the first from table_1, the second from
# table_2 (the order in which they were stacked into `tabs`).
one_to_one <- tabs[, if (.N == 2L) .(ID1 = ID[1L], ID2 = ID[2L]), by = value]

# Number of shared values per (ID1, ID2) pair.
one_to_one[, .N, by = .(ID1, ID2)]