对数据框列表中的每个数据框进行重新排列(将多行合并为一行)。

3

我有一个由两个数据框(dfA 和 dfB)组成的列表,它们的行数不同:

# Columns of data frame A: one ID column plus three value columns (3 rows)
IDA <- rep("a", 3)
Var1 <- c("1", "4", ".")
Var2 <- c("2", " ", "8")
Var3 <- c("3", "6", "9")

# Columns of data frame B: same layout, but only 2 rows
IDB <- rep("b", 2)
Var4 <- c("11", "44")
Var5 <- c("22", " ")
Var6 <- c("33", "66")

# Assemble both data frames and confirm their class
dfA <- data.frame(IDA = IDA, Var1 = Var1, Var2 = Var2, Var3 = Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB = IDB, Var4 = Var4, Var5 = Var5, Var6 = Var6)
is.data.frame(dfB)

# Collect them in an unnamed list and show it
from <- list(dfA, dfB)
from

# Confirm that `from` is a list
is.list(from)

# Show the elements one by one
from[[1]]
from[[2]]

# Flatten a single element row by row into a one-row character matrix
trnsp.dfA <- matrix(t(as.matrix(from[[1]])), nrow = 1)
trnsp.dfA
trnsp.dfB <- matrix(t(as.matrix(from[[2]])), nrow = 1)
trnsp.dfB

但是如何一次性地对列表中的每个数据框进行此操作呢? 如果我理解正确,您的代码将返回一个重新排列的数据框列表(以“宽”格式)。然后我需要将该列表转换为新的数据框。 (另一个问题是,列表中所有数据框的变量名称相似(即ID、Var1、Var2、Var3...),在这里我无法重现此问题。) 谢谢。 我的代码是:
# URL of the A_gen.txt alignment file in the ANHIG/IMGTHLA repository
genSeq <- "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/alignments/A_gen.txt"

# Download the file; every line becomes one element of a character vector
a <- readLines(con = genSeq)

# Diagnostics (uncomment to inspect):
# is.vector(a); typeof(a); length(a)

# Wrap the lines in a one-column data frame whose column is named "a"
b <- data.frame(a = a, stringsAsFactors = FALSE)

# Diagnostics (uncomment to inspect):
# is.data.frame(b); typeof(a); length(a)

# Install only the packages that are not available yet, so that re-running
# the script does not reinstall everything on every execution.
required_pkgs <- c("stringr", "stringi", "xlsx")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# Load the packages
library(stringr)
library(stringi)
library(xlsx)

# Keep rows 9 to 19762 of b. b has a single column, so this indexing drops
# the data frame and yields a plain character vector.
# NOTE(review): the row range is hard-coded for one snapshot of the remote
# file - confirm it still matches the current file layout.
bb <- b[9:19762, ]

# Diagnostics (uncomment to inspect):
# head(bb); tail(bb); length(bb); typeof(bb); is.vector(bb)

# Split every line into its individual characters (list of character vectors)
d <- strsplit(bb, split = "")

# Length of the longest line, i.e. the number of columns needed
# ( https://dev59.com/O2Up5IYBdhLWcg3wj4Hu#15201478 )
max.length <- max(lengths(d))

# Right-pad shorter elements with NA so that all have the same length
d <- lapply(d, function(x) c(x, rep(NA, max.length - length(x))))

# Diagnostics (uncomment to inspect):
# head(d); tail(d); length(d); typeof(d); is.vector(d)

# Stack the padded vectors into a matrix: one row per line, one column per
# character position. The result is assigned directly; previously the same
# matrix was built once with do.call(rbind, d), auto-printed and discarded,
# and then rebuilt through unlist()/matrix()/t().
dd <- do.call(rbind, d)

# Some diagnostics
# head(dd)
# tail(dd)
# is.matrix(dd)

# Convert the matrix dd into the data frame ddd (columns V1, V2, ...)
ddd <- as.data.frame(dd)

# Diagnostics (uncomment to inspect):
# head(ddd); tail(ddd); is.data.frame(ddd); typeof(ddd); length(ddd)
# class(ddd); str(ddd); names(ddd); nrow(ddd); ncol(ddd); summary(ddd)

# Add variable allel by concatenating the values of V1...V19.
# paste0() has no `sep` argument: the former `sep = " "` was pasted as one
# more piece and appended a trailing blank to every value, so it is dropped.
ddd <- transform(
  ddd,
  allel = do.call(paste0, unname(as.list(ddd[paste0("V", 1:19)])))
)

# Some diagnostics
# names(ddd)

# Move the last column (allel) to the first position
new_ordered <- ddd[c(ncol(ddd), 1:(ncol(ddd) - 1))]

# Diagnostics (uncomment to inspect):
# names(new_ordered); ncol(new_ordered)

# Drop the columns V1...V19, which are no longer needed
new_ordered <- new_ordered[!(names(new_ordered) %in% paste0("V", 1:19))]

# Some diagnostics
# ncol(new_ordered)
# nrow(new_ordered)

# Drop the rows in which both V50 and V100 are missing
# ( https://dev59.com/Vmsz5IYBdhLWcg3wR1vC ).
# Missing values are tested with is.na(). Comparing against the string "NA"
# only worked by accident: NA == "NA" evaluates to NA, and subset() discards
# rows whose condition is NA.
new_ordered <- subset(new_ordered, !(is.na(V50) & is.na(V100)))

# Diagnostics (uncomment to inspect):
# head(new_ordered); ncol(new_ordered); nrow(new_ordered)

# Remove all blanks from the allel names (base gsub(); no package required)
new_ordered$allel <- gsub(" ", "", new_ordered$allel, fixed = TRUE)




# The list of unique allels accordingly to LL*NN:NN(NL) template
#####

# Order the rows of new_ordered by allel, ascending
new_odrd_srtd <- new_ordered[with(new_ordered, order(allel)), ]

# Diagnostics (uncomment to inspect):
# head(new_odrd_srtd); typeof(new_odrd_srtd); is.data.frame(new_odrd_srtd)

# Distinct allel names, in sorted order, as a character vector
unique.allels <- as.character(unique(new_odrd_srtd[["allel"]]))

# Diagnostics (uncomment to inspect):
# unique.allels; length(unique.allels)

# Optional export into an MS Excel workbook:
# write.xlsx(unique.allels, file="d:/hla.xlsx", sheetName="01 unique.allels", append=TRUE)

# Reduce every allel name to the part described as LL*NN:NN(NL) and keep the
# distinct results. Pattern discussed at
# http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
# NOTE(review): `\\A` is presumably meant to match a literal "A" - confirm
# with the regex engine in use.
specific.HLA.protein <- unique(gsub(
  pattern = "^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$",
  replacement = "\\1",
  x = unique.allels
))

# Show them:
# specific.HLA.protein

# Their number is:
# length(specific.HLA.protein)

# Export  them into _the same_ MS Excel workbook
# write.xlsx(specific.HLA.protein, file="d:/hla.xlsx", sheetName="02 specific.HLA.protein", append=TRUE)













##################################################################################
# Plan
#
# convert multiple rows per subject into single row
# Create data frame with these long rows
# Concatenate values of each variable into corresponding single cells of a new row
#
#
# Example for https://dev59.com/NJ_ha4cB1Zd3GeqP59-5
#####

# data frame A
IDA <- c("a", "a", "a")
Var1 <- c("1", "4", ".")
Var2 <- c("2", " ", "8")
Var3 <- c("3", "6", "9")

# data frame B
IDB <- c("b", "b")
Var4 <- c("11", "44")
Var5 <- c("22", " ")
Var6 <- c("33", "66")

# Create data frames and check their structures
dfA <- data.frame(IDA, Var1, Var2, Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB, Var4, Var5, Var6)
is.data.frame(dfB)

# Create a list of data frames
from <- list(dfA, dfB)
from

# Check its type
is.list(from)

# Read each element of the list one by one
from[[1]]
from[[2]]

# Arrange only a single element of the list (ID column included):
trnsp.dfA <- t(c(t(from[[1]])))
trnsp.dfA
trnsp.dfB <- t(c(t(from[[2]])))
trnsp.dfB

# Do the same for every element of the list, but take the ID only once:
# the first cell of the first column, followed by the remaining columns read
# row by row. Flattening the ID column as well would repeat the ID once per
# source row and place IDs under the Var* column names.
l2 <- lapply(from, function(i) c(as.character(i[[1]][1]), c(t(i[-1]))))

# Pad the shorter vectors with NA up to the length of the longest one
l2 <- lapply(l2, `length<-`, max(lengths(l2)))

# One row per original data frame; first column is the ID, the rest Var1..VarN.
# seq_len() keeps the names correct even if only the ID column is present.
new_df <- setNames(
  data.frame(do.call(rbind, l2)),
  c("ID", paste0("Var", seq_len(max(lengths(l2)) - 1)))
)
new_df


# Some diagnostics: inspect the reshaped data frame new_df through a copy.
# Every call below is a top-level expression, so its result is shown when the
# script is run line by line.
diagnostic <- new_df
# First and last rows
head(diagnostic)
tail(diagnostic)
# Type and class checks
is.data.frame(diagnostic)
typeof(diagnostic)
# For a data frame, length() is the number of columns
length(diagnostic)
class(diagnostic)
# Compact structure overview and column names
str(diagnostic)
names(diagnostic)
# Dimensions
nrow(diagnostic)
ncol(diagnostic)
# Per-column summary
summary(diagnostic)


##################################################################################
# End of Example

# Keep the rows whose allel matches the pattern below (intended target:
# the A*01:01:01:01 allel).
# NOTE(review): in this pattern `01*` means "0 followed by any number of 1s"
# and `\\:*` means any number of colons, so names such as A*01:01:02 match as
# well - confirm that this is intended.
new_odrd_srtd_sbst <- new_odrd_srtd[
  grepl("A\\*01:01:01*\\:*[0-9A-Za-z]", new_odrd_srtd$allel), ,
  drop = FALSE
]
# A regular expression for the pattern with spaces plus extra info:
# new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("^.*(\\A\\*[0-9A-Za-z]*\\:0[1-2]).*$", allel) )
head(new_odrd_srtd_sbst)

# Distinct allel names that survived the filter
unique(new_odrd_srtd_sbst$allel)




# Append variable allelGroup_specific.HLA.protein as a pasted copy of allel
new_odrd_srtd_sbst <- transform(
  new_odrd_srtd_sbst,
  allelGroup_specific.HLA.protein = paste0(allel)
)

# Move the new (last) column to the first position
new_odrd_srtd_sbst_added_ordrd <- new_odrd_srtd_sbst[
  c(ncol(new_odrd_srtd_sbst), 1:(ncol(new_odrd_srtd_sbst) - 1))
]

# Reduce the copied name to the part described as A*NN:NN(NL).
# Pattern discussed here:
# http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
new_odrd_srtd_sbst_added_ordrd[["allelGroup_specific.HLA.protein"]] <- gsub(
  pattern = "^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$",
  replacement = "\\1",
  x = new_odrd_srtd_sbst_added_ordrd[["allelGroup_specific.HLA.protein"]]
)

# Diagnostic
is.data.frame(new_odrd_srtd_sbst_added_ordrd)
typeof(new_odrd_srtd_sbst_added_ordrd)


# Split the data frame into a list of data frames, one per distinct allel value
# https://dev59.com/e2Ml5IYBdhLWcg3wR1WD
ndf <- split(
  x = new_odrd_srtd_sbst_added_ordrd,
  f = new_odrd_srtd_sbst_added_ordrd[["allel"]]
)
# Peek at rows 1-36 and columns 1-25 of the first group
ndf[[1]][seq_len(36), seq_len(25)]

# Diagnostic
is.data.frame(ndf)
typeof(ndf)
class(ndf)
length(ndf)

# From this step I fail to step further...

请问您能否使您的示例可重现?(使用 dput() 函数) - Sotos
我已经完成了。谢谢你的建议。 - abc
1
所以你想要什么样的输出呢?你是想得到一个列表,其中每个元素都是一个只有一行的数据框(DataFrame),还是想得到一个数据框(DataFrame),其中每一行都对应一个数据框?最重要的是,为什么你需要这样奇怪的数据结构呢? - David Arenburg
请为给定的示例添加预期输出。 - Ronak Shah
1
尽管进行了多次编辑,我仍不清楚该问题的要求是什么。 "如果我理解正确,你的代码..." 为什么是你的?作者在和谁交流? "这里我无法复现这个问题..." 是什么问题?回到开头,你只是想找一种输出Dfa和Dfb(数据框)每个元素而无需全部指定的方法吗?在最后一部分中您展示了两个“单个”示例,您要寻找什么样的期望输出?同样,您只是想为所有索引执行此操作而无需列出它们所有吗? - TMWP
@Sotos,d.b,Sathish:你们的答案都很棒。但是我的“ID”变量(在我的真实示例中我有两个ID字段)每当重新组织数据框的新行排列到新的数据框的所需单行时,就会重复出现。我已经自己尝试了24小时以上来解决这个问题,但失败了。请修复这个错误。我讨厌我的编码。 - abc
3个回答

3
这里有一个可能性,
# For each data frame: the ID from its first cell, then all remaining columns
# flattened row by row
l2 <- lapply(from, function(df) {
  c(as.character(df[1, 1]), as.vector(t(df[-1])))
})
# Pad every vector with NA up to the length of the longest one
l2 <- lapply(l2, function(v) `length<-`(v, max(lengths(l2))))

# Bind the vectors as rows and name the columns ID, Var1, Var2, ...
new_df <- setNames(
  data.frame(do.call(rbind, l2)),
  c("ID", paste0("Var", seq(max(lengths(l2)) - 1)))
)

new_df
#  ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1  a    1    2    3    4         6    .    8    9
#2  b   11   22   33   44        66 <NA> <NA> <NA>

当然,你可以避免使用 i[1,1] 的连接操作,这并不是你的要求,而是我认为可以应用在这里的一个额外操作。所以,通过避免这个操作,保留你原本的转置函数,你就能得到:

# Flatten each data frame (ID column included) row by row into a one-row matrix
l2 <- lapply(from, function(df) matrix(t(as.matrix(df)), nrow = 1))
# Pad every element with NA up to the length of the longest one
l2 <- lapply(l2, function(m) `length<-`(m, max(lengths(l2))))

# Bind the elements as rows and name the columns ID, Var1, Var2, ...
new_df <- setNames(
  data.frame(do.call(rbind, l2)),
  c("ID", paste0("Var", seq(max(lengths(l2)) - 1)))
)

new_df
#  ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11
#1  a    1    2    3    a    4         6    a    .     8     9
#2  b   11   22   33    b   44        66 <NA> <NA>  <NA>  <NA>

请按以下三个步骤尝试。

首先,创建不含 ID 列的数据框:
# Flatten each data frame without its first (ID) column into a one-row matrix
l3 <- lapply(from, function(df) matrix(t(as.matrix(df[-1])), nrow = 1))
# Pad every element with NA up to the length of the longest one
l3 <- lapply(l3, function(m) `length<-`(m, max(lengths(l3))))

# Bind the elements as rows and name the columns Var1, Var2, ...
new_df1 <- setNames(
  data.frame(do.call(rbind, l3)),
  paste0("Var", seq(max(lengths(l3))))
)

new_df1
#  Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1  1    2    3    4         6    .    8    9
#2 11   22   33   44        66 <NA> <NA> <NA>

提取所有唯一的ID。
# One ID per data frame: the first value of its first column.
# vapply() guarantees a character vector with one entry per data frame;
# sapply(..., unique(...)) silently returns a list as soon as one data frame
# holds more than one distinct ID.
i1 <- vapply(from, function(i) as.character(i[[1]][1]), character(1))
i1
#[1] "a" "b"

将它们绑在一起,

# Put the IDs in front of the flattened values as a column named IDs
final_df1 <- cbind(IDs = i1, new_df1)

# Show the combined result
final_df1
#  IDs Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1   a  1    2    3    4         6    .    8    9
#2   b 11   22   33   44        66 <NA> <NA> <NA>

你的代码在示例列表上运行得很完美 :),但是在真实数据上出现了错误 Error in i[1, 1] : incorrect number of dimensions。有什么提示吗?谢谢 - abc
也许可以用unique?例如:lapply(from, function(i) as.vector(c(unique(as.character(i[,1])), t(c(t(i[-1])))))) - Sotos
嗯,它还是不起作用。Sotos,请您逐步解释一下 function(i) as.vector(c(as.character(i[1,1]), t(c(t(i[-1]))))) 是做什么的? - abc
这些 data.frames,它们是存储在您的驱动器中还是作为 R 中带有名称模式的对象? - Mario GS
@Sotos,你用我的真实数据集创建的“new_df1”返回了一个ID列表... 当我使用typeof()函数检查我的真实列表时,它返回[1] "list" - abc

3

按照您的示例:

# data.table provides as.data.table(), rbindlist() and fread() used below
library(data.table)
# Create a list of data frames (dfA and dfB come from the question's example)
from <- list(dfA, dfB)
# Show the list
from
[[1]]
  IDA Var1 Var2 Var3
1   a    1    2    3
2   a    4         6
3   a    .    8    9

[[2]]
  IDB Var4 Var5 Var6
1   b   11   22   33
2   b   44        

# rbind all the elements in the list of data.tables
    # Flatten each data frame row by row into a one-row data.table
    out <- lapply(from, function(x){as.data.table(t(c(t(x))))} )
    # Stack the one-row tables; fill = TRUE pads the shorter rows with NA
    out <- rbindlist(out, fill =  TRUE)
    out
       V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
    1:  a  1  2  3  a  4     6  a   .   8   9
    2:  b 11 22 33  b 44    66 NA  NA  NA  NA

# If the files are stored on your drive, you can call them by bulk, and then `rbindlist`:

    # `pattern` is a regular expression: the dot is escaped and the match is
    # anchored at the end, so only names ending in ".csv" are returned
    # (a bare ".csv" would also match a name such as "mycsv.txt").
    files <- list.files(pattern = "\\.csv$")
    files <- lapply(files, fread)

谢谢您的回答。但是当我运行 library(data.table) 时,出现了 Error in library(data.table) : there is no package called ‘data.table’ 的错误提示。当我运行 install.packages(data.table) 时,又出现了 Error in install.packages : object 'data.table' not found 的错误提示。 - abc
1
我不知道你在使用哪个版本的R,你可以更新你的答案并附上R.Version()$version.string。但通常安装包时需要加引号:install.packages("data.table", dependencies = T),请尝试一下。 - Mario GS
我尝试了。谢谢你的建议。虽然它起作用了,但是像其他例子一样,我在长行中多次看到我的ID几次。 - abc
@stan,抱歉,你的问题太长了,我有点迷失。你能指出你真正的数据在哪里吗? - Mario GS
我觉得我没有正确理解你的需求。例如,在这一行中,1: a 1 2 3 a 4 6 a. 8 9中,id“a”重复了两次。从我的理解来看,您只需要一次。例如:1:a 1 2 3 4 6.8 9? - Mario GS

1
我觉得你可以使用lapply来遍历列表中的所有data.frame,以执行你已经在每个单独的data.frame上执行的操作。只需确保对每个向量进行子集处理,使输出中的列数等于具有最大元素数量的data.frame的元素数量。这个最大数量(本例中的max_length)可以通过对每个data.frame进行展开,使用lengths获取元素数量,然后使用max获取最大元素数量来获得。
# Number of cells in the largest data frame of the list
max_length <- max(lengths(lapply(from, unlist)))
# Flatten each data frame row by row, cut or pad to max_length, bind as rows
do.call(
  rbind,
  lapply(from, function(df) {
    cells <- as.vector(t(df))
    cells[1:max_length]
  })
)
#     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a"  "1"  "2"  "3"  "a"  "4"  " "  "6"  "a"  "."   "8"   "9"  
#[2,] "b"  "11" "22" "33" "b"  "44" " "  "66" NA   NA    NA    NA

UPDATE

# Same as before, but the ID is taken once from the first cell and only the
# remaining columns are flattened row by row
do.call(
  rbind,
  lapply(from, function(df) {
    cells <- c(as.character(df[1, 1]), as.vector(t(df[, -1])))
    cells[1:max_length]
  })
)
#     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a"  "1"  "2"  "3"  "4"  " "  "6"  "."  "8"  "9"   NA    NA   
#[2,] "b"  "11" "22" "33" "44" " "  "66" NA   NA   NA    NA    NA   

当然可以,但我希望我的IDA和IDB仅在每行的开头出现一次。这段代码将许多行重新排列为一行,但不幸的是,在新的行中,每个ID会重复多次。如何纠正这段代码? - abc

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接