我会采取以下方法:
library(data.table)
# Turn df into a data.table by reference so that columns can be added and
# removed in place.
setDT(df)
# Columns to split: every column whose name starts with "HLA".
cols <- names(df)[grep("^HLA", names(df))]
for (i in cols) {
  # Split each value on "/" and transpose, giving one vector per position
  # within the field.
  temp <- tstrsplit(df[[i]], split = "/")
  # Add the pieces as <column>_1, <column>_2, ... and then drop the original,
  # unsplit column.
  set(df, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
  set(df, j = i, value = NULL)
}
以下是结果:
# Show the table after the in-place split. The empty `[]` is presumably the
# data.table idiom for forcing a print after by-reference updates.
df[]
# id sub HLA_A1_1 HLA_A1_2 HLA_A1_3 HLA_A2_1 HLA_B1_1 HLA_B1_2 HLA_B2_1 HLA_B2_2 HLA_B2_3 HLA_C1_1 HLA_C1_2
# 1: HG00096 GBR 01:01:01:01 01:01:01:02N NA 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 2: HG00097 GBR 03:01:01:01 03:01:01:02N NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 3: HG00098 GBR 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
# 4: HG00099 GBR 03:01:01:01 NA NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:08:01 07:01:02
除了比被接受的答案更容易扩展(没有任何内容是硬编码的)之外,这个方法至少比它快两倍,而且比 tidyverse 方法快得多——后者需要先把数据转换成很长的格式,然后再转换回宽格式。
基准测试
为了了解性能差异,请尝试以下操作:
测试函数
#' Split every "HLA*" column of `df` on "/" into numbered columns.
#'
#' The table is modified by reference through `set()`, so pass a copy if the
#' caller's object must stay untouched.
#'
#' @param df A table with columns whose names start with "HLA".
#' @return `df` with each such column replaced by `<column>_1`,
#'   `<column>_2`, ... columns.
myfun <- function(df) {
  hla_cols <- grep("^HLA", names(df), value = TRUE)
  for (col_name in hla_cols) {
    # One vector per position inside the "/"-separated field.
    pieces <- tstrsplit(df[[col_name]], "/")
    piece_names <- sprintf("%s_%d", col_name, seq_along(pieces))
    set(df, j = piece_names, value = pieces)
    # The unsplit source column is no longer needed.
    set(df, j = col_name, value = NULL)
  }
  df[]
}
#' Split "/"-separated columns by going long and then wide again.
#'
#' All columns except the first two are gathered into key/value pairs, the
#' values are split into rows on "/", the pieces are numbered within each
#' (key, id) group, and the result is spread back out to one column per
#' numbered piece.
#'
#' @param df A table whose first two columns are kept as-is; it must have a
#'   column named `id`, which is used for grouping.
#' @return The wide table with one `<key>_<n>` column per split piece.
tidyfun <- function(df) {
  long <- gather(df, key, value, -c(1:2))
  long <- separate_rows(long, value, sep = "/")
  # Number the pieces within each original cell (one cell per key/id pair).
  numbered <- mutate(
    group_by(long, key, id),
    key2 = paste0(key, "_", seq_along(key))
  )
  numbered <- ungroup(numbered)
  spread(select(numbered, -key), key2, value)
}
#' Split one "/"-separated column of `df` into separate columns.
#'
#' @param df A data.frame-like object (data.frame, data.table or tibble).
#' @param col Position, or name, of the column to split.
#' @return The result of `colsplit()` (reshape2) for that column: one column
#'   per "/"-separated piece, named `<column>_A`, `<column>_B`, ...
getIt <- function(df, col) {
  # `[[` always yields the column vector, whereas `df[, col]` only does so
  # for a plain data.frame.
  values <- as.character(df[[col]])
  col_name <- if (is.character(col)) col else names(df)[col]
  # The widest row decides how many output columns are needed. The floor of
  # 1 keeps zero-row input from producing an invalid count.
  n_parts <- max(1L, lengths(strsplit(values, split = "/")))
  colsplit(
    string = values,
    pattern = "/",
    names = paste0(col_name, "_", LETTERS[seq_len(n_parts)])
  )
}
#' Split columns 3 to 6 of `dfdf` with `getIt()` and bind the results.
#'
#' @param dfdf A data.frame whose first two columns are kept as-is and whose
#'   columns 3 to 6 hold "/"-separated values.
#' @return The first two columns followed by the split columns, in order.
reshape2fun <- function(dfdf) {
  id_part <- dfdf[, 1:2]
  split_parts <- lapply(3:6, function(k) getIt(dfdf, k))
  do.call(cbind, c(list(id_part), split_parts))
}
4 行......
library(microbenchmark)
# Plain data.frame version of the data, used as the input to reshape2fun().
dfdf <- as.data.frame(df)
# myfun() changes its argument by reference (via set()), so every run is
# given a fresh copy().
# NOTE(review): reshape2fun() splits columns 3 to 6, so `df` is expected in
# its original, unsplit form here. If the in-place split earlier in this file
# was already run on `df`, re-create it first. Confirm.
microbenchmark(myfun(copy(df)), reshape2fun(dfdf), tidyfun(df))
10,000行......
# Stack 2500 copies of df into one table (the third argument to replicate()
# is simplify = FALSE, so a list of tables is returned).
biggerdf <- rbindlist(replicate(2500, df, FALSE))
# Plain data.frame version for reshape2fun(); this overwrites the earlier dfdf.
dfdf <- as.data.frame(biggerdf)
# myfun() gets a copy() each run because it modifies its input by reference.
microbenchmark(myfun(copy(biggerdf)), reshape2fun(dfdf), tidyfun(biggerdf), times = 10)
1,000,000 行......
# Stack 100 copies of biggerdf into one table.
BIGGERdf <- rbindlist(replicate(100, biggerdf, FALSE))
# Plain data.frame version for reshape2fun(); this overwrites the earlier dfdf.
dfdf <- as.data.frame(BIGGERdf)
# tidyfun() is timed once on its own instead of being repeated inside
# microbenchmark(), presumably because it is too slow at this size.
system.time(tidyfun(BIGGERdf))
# myfun() gets a copy() each run because it modifies its input by reference.
microbenchmark(myfun(copy(BIGGERdf)), reshape2fun(dfdf), times = 5)
# NOTE(review): the column being split is HLA_A2 but the new names use the
# HLA_A1_ prefix. Confirm which one was intended.
separate(
  data = mydata,
  col = HLA_A2,
  into = c("HLA_A1_1", "HLA_A1_2"),
  sep = "/"
)
将按照描述执行,但会跳过后面的任何字段,例如第二行。 - Stephen Henderson