基于不同变量的R交互式连接

3
我有两个如下所示的数据框:

 df<-data.frame(
  id=c("1-1","2-2","3-3","4-4","5-5","6-6"),
  identifer=c(1,2,3,4,5,6),
  key=c("A","B","C","D","E","F"),
  product=c("productA","productB","productC","productD","productE","productF"),
  ingredient=c("ingredientA","ingredientB","ingredientC","ingredientD","ingredientE","ingredientF"),
  DF=c("Tablet","Powder","Suspension","System","Capsule","Capsule"))

  df_2<-data.frame(
  identifer=c(1,2,2,3,4,6),
  key=c("A","B","B","C","D","F"),
  product=c("productA","productB","productB","productCC","productDD","productFF"),
  ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD","ingredeintFF"),
  DF=c("Tablet","Powder","Powder","Suspension","injection","tablet"),
  Route=c("ORAL","INHALATION","INHALATION","topical","injecatable","oral")
)

我希望首先将这两个数据集按以下变量进行连接,并创建一个名为“match”的新列来描述连接的结果:
1) identifier,key, product, ingredient,DF
match="identifier,key, product, ingredient,DF"

然后,我想在这些变量上连接剩余的行:

2)identifier, key, product, DF
match="identifier,key, product,DF"

然后在这些变量上执行从步骤2开始的剩余行,依此类推。

3) identifier, key, Ingredient, DF
4) identifier, key, DF 
5) identifer, key, product, ingredient
7) identifer, key, product
8) identifer, key, ingredient 
9) identifier, key 

我希望返回没有匹配的行。虽然我知道如何逐步完成这个过程,但我想知道是否有更简便的方法?

以下是期望的输出结果:

df_out<-data.frame(
  identifer=c(1,2,3,4,5,6),
  key=c("A","B","C","D","E","F"),
  product_1=c("productA","productB","productC","productD","productE","productF"),
  ingredient_1=c("ingredientA","ingredientB","ingredientC","ingredientD","ingredientE","ingredientF"),
  DF_1=c("Tablet","Powder","Suspension","System","Capsule","Capsule"),
  product_2=c("productA","productB","productCC","productDD",NA,"productFF"),
  ingredient_2=c("ingredientA","ingredientB","ingredientC","ingredientDD",NA,"ingredeintFF"),
  DF_2=c("Tablet","Powder","Suspension","injection",NA,"tablet"),
  Route_2=c("ORAL","INHALATION",'topical',"injecatable",NA,"oral"),
  Match=c("identifer+key+product+ingredient+DF","identifier+key+product+ingredient+DF","identifier+key+ingredient+DF","identifer+key","None","identifer+key+product+ingredient"))

1
嗨,Mel,你能提供一下你期望的输出吗? - Ian Campbell
嗨@IanCampbell,已添加。 - Mel
2个回答

3

这里提供了一种使用data.table的选项:

library(data.table)
setDT(df)
setDT(df_2)

keyord <- list(
    c("product", "ingredient", "DF"),
    c("product", "DF"),
    c("ingredient", "DF"),
    "DF",
    c("product", "ingredient"),
    "product",
    "ingredient",
    c()
)

cols <- c("product", "ingredient", "DF", "Route")
df[, Match := NA_character_]

for (v in keyord) {
    k <- c("identifier", "key", v)
    df[df_2, on=k, c(paste0(cols, "_2"), "check") := c(mget(paste0("i.", cols)), .(TRUE))]
    df[is.na(Match) & check, Match := toString(k)]
}
setnames(df, cols, paste0(cols, "_1"), skip_absent=TRUE)

输出:

    id identifier key product_1 ingredient_1       DF_1                                    Match product_2 ingredient_2       DF_2     Route_2 check
1: 1-1          1   A  productA  ingredientA     Tablet identifier, key, product, ingredient, DF  productA  ingredientA     Tablet        ORAL  TRUE
2: 2-2          2   B  productB  ingredientB     Powder identifier, key, product, ingredient, DF  productB  ingredientB     Powder  INHALATION  TRUE
3: 3-3          3   C  productC  ingredientC Suspension          identifier, key, ingredient, DF productCC  ingredientC Suspension     topical  TRUE
4: 4-4          4   D  productD  ingredientD     System                          identifier, key productDD ingredientDD  injection injecatable  TRUE
5: 5-5          5   E  productE  ingredientE    Capsule                                     <NA>      <NA>         <NA>       <NA>        <NA>    NA
6: 6-6          6   F  productF  ingredientF    Capsule     identifier, key, product, ingredient  productF  ingredientF     tablet        oral  TRUE

修正OP中某些错别字后的数据:

df <- data.frame(
    id=c("1-1","2-2","3-3","4-4","5-5","6-6"),
    identifier=c(1,2,3,4,5,6),
    key=c("A","B","C","D","E","F"),
    product=c("productA","productB","productC","productD","productE","productF"),
    ingredient=c("ingredientA","ingredientB","ingredientC","ingredientD","ingredientE","ingredientF"),
    DF=c("Tablet","Powder","Suspension","System","Capsule","Capsule"))

df_2 <- data.frame(
    identifier=c(1,2,2,3,4,6),
    key=c("A","B","B","C","D","F"),
    product=c("productA","productB","productB","productCC","productDD","productF"),
    ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD","ingredientF"),
    DF=c("Tablet","Powder","Powder","Suspension","injection","tablet"),
    Route=c("ORAL","INHALATION","INHALATION","topical","injecatable","oral")
)

编辑多个匹配项:

df_2 <- data.frame( identifier=c(1,2,2,3,4,4,6), key=c("A","B","B","C","D","D","F"), product=c("productA","productB","productB","productCC","productDD","productDd","productF"), ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD",NA,"ingredientF"), DF=c("Tablet","Powder","Powder","Suspension","injection",NA,"tablet"), Route=c("ORAL","INHALATION","INHALATION","topical","injecatable",NA,"oral") )
setDT(df_2)
df[, c("Match", "check") := .(NA_character_, FALSE)]

ocols <- unique(unlist(keyord))
rbindlist(lapply(keyord, function(v) {
    k <- c("identifier", "key", v)
    a <- df_2[df[(!check)], on=k, nomatch=0L, c(.(id=id),
        setNames(mget(paste0("i.", ocols)), paste0(ocols, "_1")), 
        setNames(mget(paste0("x.", c(ocols, "Route"))), paste0(c(ocols, "Route"), "_2"))) 
    ]
    df[id %chin% a$id, check := TRUE]
    a
}), use.names=TRUE)

输出:

    id product_1 ingredient_1       DF_1 product_2 ingredient_2       DF_2     Route_2
1: 1-1  productA  ingredientA     Tablet  productA  ingredientA     Tablet        ORAL
2: 2-2  productB  ingredientB     Powder  productB  ingredientB     Powder  INHALATION
3: 3-3  productC  ingredientC Suspension productCC  ingredientC Suspension     topical
4: 6-6  productF  ingredientF    Capsule  productF  ingredientF     tablet        oral
5: 4-4  productD  ingredientD     System productDD ingredientDD  injection injecatable
6: 4-4  productD  ingredientD     System productDd         <NA>       <NA>        <NA>

你能分享一个这样情况的示例和你期望的输出吗? - chinsoon12
我无法编辑问题,但df_2可以像这样:df_2 <- data.frame( identifier=c(1,2,2,3,4,4,6), key=c("A","B","B","C","D","D","F"), product=c("产品A","产品B","产品B","产品CC","产品DD","产品Dd","产品F"), ingredient=c("成分A","成分BB","成分B","成分C","成分DD",NA,"成分F"), DF=c("片剂","粉末","粉末","悬液","注射剂",NA,"片剂"), Route=c("口服","吸入","吸入","局部用药","注射",NA,"口服") ) - Mel
这种情况的期望输出是什么?先前的键分组中也有多个匹配项。 - chinsoon12
如果最高匹配是标识符+关键字,例如标识符4-4,那么可能会有多行匹配到该行的标识符+关键字的最高匹配。因此,我想能够获取所有匹配到最高匹配的行,而不仅仅是一行。这样说清楚了吗? - Mel
@Mel,我添加了一个版本来解决这个问题。 - chinsoon12
显示剩余2条评论

0

这里有一个解决方案,可能会感觉有点过度设计,但可以实现预期的结果:

library(dplyr)
library(purrr)
library(stringr)

get_match=function(data, cols, keys){
  rtn = ifelse(rowSums(is.na(data[paste0(cols, "_1")]))==rowSums(is.na(data[paste0(cols, "_2")])), paste(keys, collapse="+"), "None")

  rtn2 = cols %>% 
    map(~{
      case_when(as.character(data[[paste0(.x, "_1")]])==as.character(data[[paste0(.x, "_2")]])~.x)
    }) %>% 
    reduce(paste, sep="+") %>% str_replace_all("\\+?NA\\+?", "")

  paste(rtn, rtn2, sep="+") %>% str_replace_all("\\+$", "")
}

df_out = left_join(df, df_2, by=c("identifer", "key"), suffix=c("_1", "_2")) %>% 
    mutate(Match = get_match(., cols=c("product", "ingredient", "DF"), keys=c("identifer", "key")), 
           match_strength = str_count(Match, "\\+")) %>% 
    group_by(id) %>% 
    filter(match_strength==max(match_strength, na.rm=TRUE))

dplyr::left_join 移除 by 键,所以我找到的唯一添加它们的方法是检查所有的 _1_2 是否缺失。虽然我可以使用 keep=TRUE 选项,并在此之后删除/重命名它们...


嗨Mel。你更新的数据集似乎有误,因为标识列中不应该有任何重复项。但是,如果“key”不是标识符,则可以从“by =”和“keys =”中将其删除。 - Dan Chaltiel
第二个数据集中通常会有重复项-这就是我需要创建匹配层次结构的原因。我的目标是为df每个id只有一个匹配项。我还添加了一个id列。 - Mel
@Mel,我缺少一段逻辑:是什么规则让您得出结论df_out[2,"ingredient_2"]是“ingredientB”,而不是“ingredientBB”? - Dan Chaltiel
我在df_out[2,ingredient_2]上有两个匹配项,其中一个行匹配于identifer+key+product+DF,这意味着成分不同(成分BB),另一个行匹配于identifer+key+product+ ingredient+DF。由于更多的字段匹配,我想保留后者作为我的最终数据集中的匹配项。我想基于我提供的数字进行连接。如果我得到第一个连接,那么我将删除该行,以此类推。 - Mel
@Mel 好的,那么你需要测量匹配的强度,然后只保留最强的一个。我已经编辑了我的答案以达到这个目的。我通过计算Match中的+来测量强度,但可能有更有效的方法。 - Dan Chaltiel
显示剩余3条评论

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接