使用 powerjoin 包的另一种方法:
# Attach the packages before anything else: tribble(), mutate() and
# row_number() used below are only available once dplyr is attached.
library(powerjoin)
library(dplyr)

# Example contact list. row_id stores each row's original position so that
# individual rows can still be identified after the joins further down.
contact <- tribble(
  ~name, ~phone, ~email,
  "John", 123, "john_abc@gmail.com",
  "John", 456, "john_abc@gmail.com",
  "John", 456, "john_xyz@gmail.com",
  "John", 789, "john_pqr@gmail.com"
) |>
  mutate(row_id = row_number())
# Find contacts that share a phone number.
# Rows whose phone already occurred earlier in `contact` are right-joined back
# onto the full table by name and phone; the remaining columns that clash
# (email, row_id) are pasted together as "x;y".
phone_check <- power_right_join(
  contact,
  contact |> filter(duplicated(phone)),
  by = c("name", "phone"),
  conflict = ~ paste(.x, .y, sep = ";")
) |>
  group_by(phone) |>
  # keep only the first merged row of every phone group
  slice_head(n = 1) |>
  # split the pasted row_id back into one row per original id
  tidyr::separate_rows(row_id) |>
  ungroup() |>
  select(name, email, row_id)
# Find contacts that share an email address.
# Rows whose email already occurred earlier in `contact` are right-joined back
# onto the full table by name and email; the remaining columns that clash
# (phone, row_id) are pasted together as "x;y".
email_check <- power_right_join(
  contact,
  contact |> filter(duplicated(email)),
  by = c("name", "email"),
  conflict = ~ paste(.x, .y, sep = ";")
) |>
  group_by(email) |>
  # keep only the first merged row of every email group
  slice_head(n = 1) |>
  # split the pasted row_id back into one row per original id
  tidyr::separate_rows(row_id) |>
  ungroup() |>
  select(name, phone, row_id)
# Combine both checks into the final result.
# The inner join keeps the rows flagged by both checks (same name and row_id),
# which pairs the merged phones from email_check with the merged emails from
# phone_check.
email_check |>
  inner_join(phone_check, by = c("name", "row_id")) |>
  bind_rows(
    # Append the contacts that neither check flagged. phone and row_id are
    # cast to character so they can be bound to the pasted ("x;y") columns.
    contact |>
      mutate(
        phone = as.character(phone),
        row_id = as.character(row_id)
      ) |>
      filter(!(row_id %in% c(email_check$row_id, phone_check$row_id)))
  ) |>
  # row_id was only needed for matching; drop it from the output
  select(-row_id)
# A tibble: 2 × 3
name phone email
<chr> <chr> <chr>
1 John 123;456 john_abc@gmail.com;john_xyz@gmail.com
2 John 789 john_pqr@gmail.com
# Alternative via graph components (graph_from_data_frame() and components()
# are igraph functions, so igraph must be attached). The first two selected
# columns (phone, email) form the edges; vertices that are connected receive
# the same membership id.
grp <- contact[, c(2, 3, 1)] |>
  graph_from_data_frame() |>
  components() |>
  getElement("membership")

# Collapse every group to the comma-separated unique values of each column.
aggregate(
  . ~ grp[contact$email],
  data = contact,
  FUN = function(x) toString(unique(x))
)
—— Henrik