从不同的数据框中抽取样本

4
 data1=data.frame("Group1" = sample(1:2,100,r=T),
                  "Group2" = sample(c('a','b'),100,r=T),
                  "V1" = sample(1:3, 100, r=T),
                  "V2" = sample(0:1, 100, r=T),
                  "V3" = sample(1:5, 100, r=T),
                  "V4" = sample(1:2, 100, r=T))


data2=data.frame("Group1"=c(1,1,2,2),
                  "Group2"=c('a','b','a','b'),
                  "Size"=c(9,7,6,10),
                  "V1"=c(NA),
                  "V2"=c(NA),
                  "V3"=c(NA),
                  "V4"=c(NA))

我有一个名为"data1"的数据,还有一个名为"data2"的数据,其中包含"Group1"、"Group2"和"Size"。我希望按照"Group1"和"Group2"进行分组,并从"data1"中随机抽取大小为"Size"的样本填充"data2"中的V1-V4。


希望的输出结果应该类似于以下内容,但NA值应基于"data1"填充:

library(dplyr);library(tidyr)
data3= data2 %>% 
  uncount(Size)
2个回答

6
library(data.table)
setDT(data1)
setDT(data2)

# sample indices from each group
i <- 
  data2[data1, on = .(Group1, Group2)
      ][, .(i_samp = sample(.I, Size)), by = .(Group1, Group2, Size)
      ][, i_samp]

# subset to sampled indices
merge(data1[i], data2[, .(Group1, Group2, Size)])

#     Group1 Group2 V1 V2 V3 V4 Size
#  1:      1      a  3  1  2  2    9
#  2:      1      a  3  1  5  1    9
#  3:      1      a  2  1  4  2    9
#  4:      1      a  3  1  1  1    9
#  5:      1      a  3  1  4  1    9
#  6:      1      a  1  0  3  1    9
#  7:      1      a  3  1  1  1    9
#  8:      1      a  1  1  1  2    9
#  9:      1      a  2  0  2  1    9
# 10:      1      b  2  0  5  2    7
# 11:      1      b  3  0  5  2    7
# 12:      1      b  3  1  4  2    7
# 13:      1      b  1  1  1  1    7
# 14:      1      b  1  1  4  1    7
# 15:      1      b  1  0  1  1    7
# 16:      1      b  1  0  3  1    7
# 17:      2      a  2  0  5  1    6
# 18:      2      a  1  0  5  1    6
# 19:      2      a  3  1  1  2    6
# 20:      2      a  1  0  2  1    6
# 21:      2      a  3  1  1  2    6
# 22:      2      a  1  1  3  2    6
# 23:      2      b  3  0  2  1   10
# 24:      2      b  2  1  5  1   10
# 25:      2      b  3  0  1  1   10
# 26:      2      b  3  1  2  1   10
# 27:      2      b  2  0  5  1   10
# 28:      2      b  2  0  2  1   10
# 29:      2      b  2  0  2  2   10
# 30:      2      b  1  0  1  1   10
# 31:      2      b  3  0  5  1   10
# 32:      2      b  3  0  5  1   10
#     Group1 Group2 V1 V2 V3 V4 Size

使用的输入数据:

data1=data.frame("Group1" = sample(1:2,100,r=T),
                  "Group2" = sample(c('a','b'),100,r=T),
                  "V1" = sample(1:3, 100, r=T),
                  "V2" = sample(0:1, 100, r=T),
                  "V3" = sample(1:5, 100, r=T),
                  "V4" = sample(1:2, 100, r=T))


data2=data.frame("Group1"=c(1,1,2,2),
                  "Group2"=c('a','b','a','b'),
                  "Size"=c(9,7,6,10),
                  "V1"=c(NA),
                  "V2"=c(NA),
                  "V3"=c(NA),
                  "V4"=c(NA))

下面是一个更具参数化的版本,您可以明确设置要填充的列以及连接两个表的键

fill_key <- c('Group1', 'Group2')
columns_to_fill <- paste0('V', 1:4)

# sample indices from each group
i <- 
  data2[data1, on = (fill_key)
      ][, .(i_samp = sample(.I, Size)), by = c(fill_key, 'Size')
      ][, i_samp]


# subset to sampled indices
merge(data1[i, c(fill_key, columns_to_fill), with = FALSE], 
      data2[, c(fill_key, 'Size'), with = FALSE])

1
据我所知,这里的输出与您所描述的相同。 - IceCreamToucan
谢谢,我更新了问题以帮助澄清。 - bvowe
1
根据新数据进行了更新。 - IceCreamToucan
1
V1实际上与您的列名称无关,这只是data.table为样本索引列提供的默认名称,因为我没有给该列命名。为了避免混淆使用V1,我进行了重写。我还添加了一种设置填充列的版本。 - IceCreamToucan
1
如果在运行代码之前不将数据转换为data.table,就会发生这种情况。将数据转换为data.table的方法是运行setDT(yourdata) - IceCreamToucan
显示剩余6条评论

1
一个 dplyr 的选项可能是:
data1 %>%
 left_join(data2 %>%
            select(-starts_with("V"))) %>%
 group_by(Group1, Group2) %>%
 sample_n(Size) 

   Group1 Group2    V1    V2    V3    V4  Size
    <dbl> <fct>  <int> <int> <int> <int> <dbl>
 1      1 a          1     1     1     2     9
 2      1 a          3     0     3     2     9
 3      1 a          2     0     3     2     9
 4      1 a          1     1     2     1     9
 5      1 a          2     1     2     2     9
 6      1 a          2     0     5     2     9
 7      1 a          1     0     1     2     9
 8      1 a          3     0     5     2     9
 9      1 a          1     0     5     1     9
10      1 b          2     0     1     1     7
11      1 b          2     1     3     1     7
12      1 b          3     1     4     2     7
13      1 b          1     1     1     1     7
14      1 b          2     1     2     2     7
15      1 b          1     1     1     2     7
16      1 b          1     1     2     1     7
17      2 a          3     1     5     1     6
18      2 a          1     0     5     1     6
19      2 a          1     0     1     1     6
20      2 a          2     0     5     1     6
21      2 a          3     0     1     1     6
22      2 a          2     1     4     1     6
23      2 b          3     0     2     1    10
24      2 b          1     1     5     1    10
25      2 b          3     1     1     1    10
26      2 b          3     1     4     1    10
27      2 b          1     0     4     2    10
28      2 b          3     1     1     2    10
29      2 b          2     1     4     1    10
30      2 b          1     0     1     1    10
31      2 b          2     1     4     2    10
32      2 b          2     1     5     2    10

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接