使用pivot longer来避免硬编码,以一次性旋转多列。

3
我想使用 pivot_long() 函数将下面的数据集的多列进行长格式处理,避免硬编码。我看到了一些类似的问题,但仍然无法做到。
宽型数据:
> head(data)
      ID IND_TEST_SCORE ARG_G1_ABC NARR_G1_ABC ARG_G1_EF NARR_G1_EF ARG_G2_ABC NARR_G2_ABC
1 PART_1            100      68.53       71.32      4.94       3.42      64.90       64.25
2 PART_2             36      65.90          NA      6.55         NA      63.80       59.00
3 PART_3             32      69.78          NA      2.44         NA      71.73          NA
4 PART_4             96      68.29       67.83      3.00       3.17      67.67       67.88
5 PART_5             11        NaN         NaN       NaN        NaN         NA       67.08
6 PART_6             12      69.50       71.60      3.25       2.50         NA          NA
  ARG_G2_EF NARR_G2_EF
1      7.10       5.08
2      7.40       7.00
3      1.09         NA
4      3.67       1.76
5        NA       3.00
6        NA         NA

期望的输出:

ID     IND_TEST_SCORE       ABC    EF   GROUP  TYPE
1 PART_1            100                   G1   ARG
1 PART_1            100                   G1   NARR
1 PART_1            100                   G2   ARG
1 PART_1            100                   G2   NARR
2 PART_2             36                   G1   ARG
2 PART_2             36                   G1   NARR
2 PART_2             36                   G2   ARG
2 PART_2             36                   G2   NARR
so on...

问题:如何:

  1. 创建名为“GROUP”的新列,其中包含值“G1”和“G2”
  2. 创建名为“TYPE”的新列,其中包含值“ARG”和“NARR”
  3. 创建2个新列,一个用于“ABC”值,另一个用于“EF”值

而不进行硬编码?我想使用模式...提前感谢!

我到目前为止的尝试:

# create a single "my_names" columns and work on it:

dataLong <- data %>%  
  pivot_longer(cols = c(-ID, -IND_TEST_SCORE), 
               names_to = "my_names",
               values_to = "my_values") %>% 
  mutate(GROUP = case_when(my_names == "ARG_G1_ABC" ~ "G1",
                              my_names == "ARG_G1_ABC" ~ "G2",
                              my_names == "ARG_G1_EF" ~  "G1",
                              my_names == "ARG_G2_EF" ~  "G2",
                              my_names == "NARR_G1_ABC" ~ "G1",
                              my_names == "NARR_G1_ABC" ~ "G2",
                              my_names == "NARR_G1_EF" ~  "G1",
                              my_names == "NARR_G2_EF" ~  "G2")) %>% 
  mutate(TYPE = case_when(my_names == "ARG_G1_ABC" ~ "ARG",
                          my_names == "ARG_G2_ABC" ~ "ARG",
                          my_names == "ARG_G1_EF" ~  "ARG",
                          my_names == "ARG_G2_EF" ~  "ARG",
                          my_names == "NARR_G1_ABC" ~ "NARR",
                          my_names == "NARR_G2_ABC" ~ "NARR",
                          my_names == "NARR_G1_EF" ~  "NARR",
                          my_names == "NARR_G2_EF" ~  "NARR"))

数据集:

> dput(data)
structure(list(ID = structure(c("PART_1", "PART_2", "PART_3", 
"PART_4", "PART_5", "PART_6", "PART_7", "PART_8", "PART_9", "PART_10", 
"PART_11", "PART_12", "PART_13", "PART_14", "PART_15", "PART_16", 
"PART_17", "PART_18", "PART_19", "PART_20", "PART_21", "PART_22", 
"PART_23", "PART_24", "PART_25", "PART_26", "PART_27", "PART_28", 
"PART_29", "PART_30", "PART_31", "PART_32", "PART_33", "PART_34", 
"PART_35", "PART_36", "PART_37", "PART_38", "PART_39", "PART_40", 
"PART_41", "PART_42", "PART_43", "PART_44", "PART_45", "PART_46", 
"PART_47", "PART_48", "PART_49", "PART_50", "PART_51", "PART_52", 
"PART_53", "PART_54", "PART_55", "PART_56", "PART_57", "PART_58", 
"PART_59", "PART_60", "PART_61", "PART_62", "PART_63", "PART_64", 
"PART_65", "PART_66", "PART_67", "PART_68", "PART_69", "PART_70", 
"PART_71"), class = c("glue", "character")), IND_TEST_SCORE = c(100, 
36, 32, 96, 11, 12, 32, 72, 100, 64, 2, 19, 99, 86, 60, 108, 
95, 35, 60, 9, 78, 61, 61, 67, 105, 99, 51, 21, 65, 30, 0.9, 
77, 54, 14, 103, 48, 0.7, 2, 39, 94, 80, 8, 30, 103, 113, 91, 
59, 56, 86, 99, 72, 34, 32, 6, 44, 99, 65, 98, 110, 102, 87, 
50, 89, 36, 93, 8, 11, 78, 48, 77, 4), ARG_G1_ABC = c(68.53, 
65.9, 69.78, 68.29, NaN, 69.5, 67.05, 73.74, 73.59, 72.57, 64.33, 
67.79, 72.94, 63.75, 71.56, 75.5, 68.16, NA, 65.64, 68.36, 69.75, 
72.73, 67.67, 66.19, 62.94, 72.48, 72.19, 62.44, 72.5, 71.06, 
70.4, 69.14, NA, 67.59, 69.1, 74.05, NA, 68.6, 68.27, 59.12, 
NA, NA, 63.7, 67.18, NA, 68.38, 63.44, 72.56, 66.06, 66.53, 73.19, 
NA, NA, NA, 73.44, 67.45, 72.91, 65.81, 73.96, 75, 75.89, 72, 
NA, 68.2, 67.29, 69.91, NaN, 69.67, 68.39, 69.2, 67.55), NARR_G1_ABC = c(71.32, 
NA, NA, 67.83, NaN, 71.6, 64.2, 71.68, 73.29, 70.53, 73.35, 59.31, 
71.08, 74.06, 68.7, 74, 69.08, NA, 68.52, 63.47, 68.33, NA, 65.64, 
62.11, 63.9, 70.41, 60.36, 65.88, 68.81, 69.62, 70.68, 67.5, 
NA, 68.45, 67.16, 74.39, 60.6, 65.89, 71.94, 68.75, NA, NA, 67, 
66.85, NA, NA, 62.56, 73.33, 69.81, 67.68, 73.06, 65.8, 63.85, 
NA, 67.64, 71.6, 68.47, 69.39, 71.16, 72.33, NA, 66.68, NA, 66.22, 
67, 61.27, NaN, 72.33, 68.29, 71.33, 65.57), ARG_G1_EF = c(4.94, 
6.55, 2.44, 3, NaN, 3.25, 4.71, 2.84, 1.07, 2, 5.33, 5.43, 1.72, 
10.55, 3, 1.17, 5.8, NA, 10.55, 4.21, 2.94, 3.55, 6.33, 8.25, 
5.88, 2, 3.44, 9.22, 1.69, 4.18, 2.5, 4.71, NA, 4.41, 5.9, 2.21, 
NA, 6.67, 3.33, 7, NA, NA, 8, 4.76, NA, 4.44, 2.68, 3.16, 4.94, 
5.42, 2.81, NA, NA, NA, 1.78, 6.09, 2.52, 6.56, 1.96, 1.12, 0.67, 
3.78, NA, 3.5, 3.65, 5.27, NaN, 4.33, 6.78, 3.6, 4.35), NARR_G1_EF = c(3.42, 
NA, NA, 3.17, NaN, 2.5, 3.29, 1.64, 1.07, 6, 1.41, 9.25, 3.25, 
2.69, 3.8, 1.32, 3.04, NA, 2.38, 5.18, 2.38, NA, 6.18, 6.11, 
6.4, 1.85, 7.45, 3.69, 1.89, 3.25, 1.6, 4.8, NA, 2.8, 4.32, 2.3, 
6.6, 7.42, 2.83, 4.75, NA, NA, 5, 4.75, NA, NA, 8, 1.71, 2.67, 
2.05, 1.47, 4.8, 7.96, NA, 4.43, 3.8, 4.47, 4.91, 1.68, 2.78, 
NA, 6.58, NA, 6.67, 6, 5.18, NaN, 1.67, 4.86, 2.08, 4.38), ARG_G2_ABC = c(64.9, 
63.8, 71.73, 67.67, NA, NA, 52.5, 72.35, 65.28, 57.22, NA, NaN, 
69, 66.67, NaN, 66.58, 69, 60.55, 56.29, 67.45, 68.4, 64.25, 
NaN, 50.86, 67.83, 65.96, 57, 53.07, 66.89, NaN, NA, 59, 61.5, 
NA, 65.9, 64.07, NA, NA, 57.91, 67.89, 68.75, 68.5, NaN, 63.24, 
66.19, 60.59, 59.24, 54.33, 64.39, 65.83, 65.71, 63, 63.78, 63.62, 
64, 65.08, NA, 67.61, 67.57, 72.71, 65.46, 61.71, NA, 57.62, 
NA, NA, NA, 64, 61.33, 62.64, NA), NARR_G2_ABC = c(64.25, 59, 
NA, 67.88, 67.08, NA, 60.75, 64.42, 71.17, 58.42, NA, 49.8, 63.36, 
65.2, NaN, 70.2, 62.85, NaN, 61.6, 53.92, 62.63, NA, NaN, 50.46, 
65.14, 60.58, 63.29, NA, 64.33, NaN, NA, 68.57, NA, NA, 66.3, 
NA, 57.29, NA, 53.5, 63.48, NA, 57.07, NaN, 61.82, NA, 68.61, 
57.1, 62.84, 63, 61.91, 58.38, NaN, 61.56, NA, NaN, 65.55, 63.8, 
65, 63.14, 67.31, 67.75, 57.62, 63.31, 54.83, 66.43, NA, NA, 
64.67, 57.92, 59, NA), ARG_G2_EF = c(7.1, 7.4, 1.09, 3.67, NA, 
NA, 12.75, 1.24, 3.28, 9.78, NA, NaN, 1.71, 1.93, NaN, 6.21, 
2.76, 7.91, 8.65, 3.55, 3.4, 5, NaN, 16.05, 3.39, 4.52, 13, 11.6, 
5.05, NaN, NA, 9.5, 9.67, NA, 7.03, 3.87, NA, NA, 8, 3.33, 2.19, 
3, NaN, 8.53, 3.37, 5.47, 7.35, 13.48, 5.33, 3.83, 3.65, 5.82, 
4, 6.17, 6, 6.42, NA, 3.83, 2.71, 2.19, 4.58, 5.18, NA, 9.75, 
NA, NA, NA, 5, 6.44, 5.36, NA), NARR_G2_EF = c(5.08, 7, NA, 1.76, 
3, NA, 8.88, 4.26, 2.92, 7.08, NA, 10.6, 5.5, 4.16, NaN, 2.87, 
4.7, NaN, 7, 9.5, 4.68, NA, NaN, 12.75, 4.77, 9.15, 5, NA, 5.44, 
NaN, NA, 4.57, NA, NA, 1.7, NA, 11.29, NA, 13.33, 5.95, NA, 10.79, 
NaN, 5.18, NA, 5.22, 7.1, 3.53, 5.75, 6.77, 6.31, NaN, 7.88, 
NA, NaN, 3, 4.88, 4.69, 6.19, 10.31, 3.62, 9.75, 5.46, 6.83, 
4.43, NA, NA, 3.67, 8.67, 8.53, NA)), row.names = c(NA, -71L), class = "data.frame")
1个回答

3

我们可以使用 pivot_longer 函数,通过设置matches参数来选择以字符串末尾_ABC_EF结尾的列名,并且通过指定names_sep_来分割列名,同时还需在names_to中指定对应的列名。其中,.value将返回列的值,而TYPE或GROUP则从列名中获取第一和第二个子字符串。

library(tidyr)
pivot_longer(data, cols = matches('_(ABC|EF)$'), 
   names_to = c("TYPE", "GROUP", ".value"), 
     names_sep = "_", values_drop_na = TRUE)

-输出

# A tibble: 217 × 6
   ID     IND_TEST_SCORE TYPE  GROUP   ABC    EF
   <glue>          <dbl> <chr> <chr> <dbl> <dbl>
 1 PART_1            100 ARG   G1     68.5  4.94
 2 PART_1            100 NARR  G1     71.3  3.42
 3 PART_1            100 ARG   G2     64.9  7.1 
 4 PART_1            100 NARR  G2     64.2  5.08
 5 PART_2             36 ARG   G1     65.9  6.55
 6 PART_2             36 ARG   G2     63.8  7.4 
 7 PART_2             36 NARR  G2     59    7   
 8 PART_3             32 ARG   G1     69.8  2.44
 9 PART_3             32 ARG   G2     71.7  1.09
10 PART_4             96 ARG   G1     68.3  3   
# … with 207 more rows

哇,太棒了!我简直不敢相信我之前花了这么长时间去硬编码它...非常感谢你详细的解释。实际上,我有一个很大的数据框,我试图在示例中使它更简单,但我刚刚注意到我应该添加一些东西。我的列名在ABC或EF之前都有G1_100_G2_100_(即NARR_G2_100_ABC)。我想我可以使用两个组的正则表达式并捕获//1 //2来匹配列名,对吗?@akrun - Larissa Cury
我在思考如何将 string <- "NARR_G2_100_ABC" 转换为 str_replace(string, '^([^_]+)_([^_]+)_([^_]+)_([^_]+)', '\\1 \\2')。我知道 names_pattern() 可以像 str_extract 一样工作,但我找不到使用它的方法... - Larissa Cury
嗨,我真的希望你能看到这个消息。现在我需要将相同的数据拆分为 "NARR_G2_100_ABC" -> NARR_ABC NARR_EFARG_ABC ARG_EF(包含 ABC、EF 参考 NARR 或 ARG 的值)和 GROUP(包含 G1 或 G2)。我正在尝试找到正确的正则表达式,但到目前为止我失败了... - Larissa Cury

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接