我有一个列表from
数据框(dfA
和dfB
),行数不同:
# data frame A
IDA <- c("a", "a", "a")
Var1 <- c("1","4",".")
Var2 <- c("2"," ","8")
Var3 <- c("3","6","9")
# data frame B
IDB <- c("b", "b")
Var4 <- c("11","44")
Var5 <- c("22"," ")
Var6 <- c("33","66")
# Create data frames and check their structures
dfA <- data.frame(IDA, Var1, Var2, Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB, Var4, Var5, Var6)
is.data.frame(dfB)
# Create a list of data frames
from <- list(dfA, dfB)
from
# Check its type
is.list(from)
# Read each elements of the list one by one
from[[1]]
from[[2]]
# Arrange only any single element of the list to get the desired structure:
trnsp.dfA <- t(c(t(from[[1]])))
trnsp.dfA
trnsp.dfB <- t(c(t(from[[2]])))
trnsp.dfB
但是如何一次对列表中的每个数据帧执行此操作? 如果我理解正确,您的代码将返回重新排列的数据帧列表(以&#34;宽&#34;格式)。然后我需要将列表转换为新的数据框。
(另一个问题是列表中的所有数据帧都有类似命名的变量(列表中每个df的id ID,Var1,Var2,Var3 ...)。这里我不能重现这个问题。)
谢谢。
我的代码是:
genSeq <- c('https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/alignments/A_gen.txt')
# Read raw data as character vector
a <- readLines(genSeq)
# Some diagnostics
# is.vector(a)
# typeof(a)
# length(a)
# Convert vector a to data frame b
b <- as.data.frame(a, stringsAsFactors = FALSE)
# is.data.frame(b)
# typeof(a)
# length(a)
# Install some packages
install.packages("stringr")
install.packages("stringi")
install.packages("xlsx")
# Load the packages
library(stringr)
library(stringi)
library(xlsx)
# Read the lines with nucleotide sequences
bb <- b[c(9:19762),]
# Some diagnostics
# head(bb)
# tail(bb)
# length(bb)
# typeof(bb)
# is.vector(bb)
# Split lines
d <- strsplit(bb, split = "")
# Some diagnostics
# head(d)
# tail(d)
# length(d)
# typeof(d)
# is.vector(d)
# Count number of variables ( http://stackoverflow.com/a/15201478/1009306 )
max.length <- max(sapply(d, length))
# Add NA values to list elements when the lists are shorter than others
d <- lapply(d, function(x) {c(x, rep(NA, max.length-length(x)))})
# Combine all elements
do.call(rbind, d)
# Some diagnostics
# head(d)
# tail(d)
# length(d)
# typeof(d)
# is.vector(d)
# Transform matrix
dd <- t(matrix(unlist(d),ncol=length(d)))
# Some diagnostics
# head(dd)
# tail(dd)
# is.matrix(dd)
# Transform existing dd matrix into ddd data frame
ddd <- as.data.frame(dd)
# Some diagnostics
# head(ddd)
# tail(ddd)
# is.data.frame(ddd)
# typeof(ddd)
# length(ddd)
# class(ddd)
# str(ddd)
# names(ddd)
# nrow(ddd)
# ncol(ddd)
# summary(ddd)
# Add new variable allel by concatenating values in existing variables V1...v19
ddd <- transform(ddd, allel = paste0(ddd$V1, ddd$V2, ddd$V3, ddd$V4, ddd$V5, ddd$V6, ddd$V7, ddd$V8, ddd$V9, ddd$V10, ddd$V11, ddd$V12, ddd$V13, ddd$V14, ddd$V15, ddd$V16, ddd$V17, ddd$V18, ddd$V19, sep = " "))
# Some diagnostics
# names(ddd)
# Reorder variable allel to be the first
new_ordered <- ddd[c(length(ddd),c(1:(length(ddd)-1)))]
# Some diagnostics
# names(new_ordered)
# ncol(new_ordered)
# Remove unnecessary variables V1...V19
new_ordered$V1 <- NULL
new_ordered$V2 <- NULL
new_ordered$V3 <- NULL
new_ordered$V4 <- NULL
new_ordered$V5 <- NULL
new_ordered$V6 <- NULL
new_ordered$V7 <- NULL
new_ordered$V8 <- NULL
new_ordered$V9 <- NULL
new_ordered$V10 <- NULL
new_ordered$V11 <- NULL
new_ordered$V12 <- NULL
new_ordered$V13 <- NULL
new_ordered$V14 <- NULL
new_ordered$V15 <- NULL
new_ordered$V16 <- NULL
new_ordered$V17 <- NULL
new_ordered$V18 <- NULL
new_ordered$V19 <- NULL
# Some diagnostics
# ncol(new_ordered)
# nrow(new_ordered)
# Remove rows containing NA ( http://stackoverflow.com/q/8005154/1009306 )
new_ordered <- subset(new_ordered, !(V50 == "NA" & V100 == "NA"))
# Some diagnostics
# head(new_ordered)
# ncol(new_ordered)
# nrow(new_ordered)
# Shrink whitespaces in allel names with the help of library(stringr)'s function:
new_ordered$allel <- gsub(" ", "", new_ordered$allel)
# The list of unique allels accordingly to LL*NN:NN(NL) template
#####
# Sort new_ordered data frame in an ascending order by allel variable
new_odrd_srtd <- new_ordered[order(new_ordered$allel),]
# Some diagnostics
# head(new_odrd_srtd)
# typeof(new_odrd_srtd)
# is.data.frame(new_odrd_srtd)
# The list of unique allel names
unique.allels <- unique(new_odrd_srtd$allel)
# Let the list to be a character vector
unique.allels <- as.character(unique.allels)
# Show them:
# unique.allels
# Their number is:
# length(unique.allels)
# Export them into MS Excel workbook:
# write.xlsx(unique.allels, file="d:/hla.xlsx", sheetName="01 unique.allels", append=TRUE)
# Extract the part of an allel name considering specific HLA protein only: LL*NN:NN(NL).
# The final point for the pattern of interest is cleared at http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
specific.HLA.protein <- unique(gsub("^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$", "\\1", unique.allels))
# Show them:
# specific.HLA.protein
# Their number is:
# length(specific.HLA.protein)
# Export them into _the same_ MS Excel workbook
# write.xlsx(specific.HLA.protein, file="d:/hla.xlsx", sheetName="02 specific.HLA.protein", append=TRUE)
##################################################################################
# Plan
#
# convert multiple rows per subject into single row
# Create data frame with these long rows
# Concatenate values of each variable into corresponding single cells of a new row
#
#
# Example for http://stackoverflow.com/q/42711357
#####
# data frame A
IDA <- c("a", "a", "a")
Var1 <- c("1","4",".")
Var2 <- c("2"," ","8")
Var3 <- c("3","6","9")
# data frame B
IDB <- c("b", "b")
Var4 <- c("11","44")
Var5 <- c("22"," ")
Var6 <- c("33","66")
# Create data frames and check their structures
dfA <- data.frame(IDA, Var1, Var2, Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB, Var4, Var5, Var6)
is.data.frame(dfB)
# Create a list of data frames
from <- list(dfA, dfB)
from
# Check its type
is.list(from)
# Read each elements of the list one by one
from[[1]]
from[[2]]
# Arrange only any single element of the list to get the desired structure:
trnsp.dfA <- t(c(t(from[[1]])))
trnsp.dfA
trnsp.dfB <- t(c(t(from[[2]])))
trnsp.dfB
l2 <- lapply(from, function(i) t(c(t(i))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))
new_df <- setNames(data.frame(do.call(rbind, l2)), c('ID', paste0('Var', seq(max(lengths(l2))-1))))
new_df
# Some diagnostics
diagnostic <- new_df
head(diagnostic)
tail(diagnostic)
is.data.frame(diagnostic)
typeof(diagnostic)
length(diagnostic)
class(diagnostic)
str(diagnostic)
names(diagnostic)
nrow(diagnostic)
ncol(diagnostic)
summary(diagnostic)
##################################################################################
# End of Example
# Select strings only for A*01:01:01:01 allel
new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("A\\*01:01:01*\\:*[0-9A-Za-z]", allel) )
# A regular expression for the pattern with spaces plus extra info:
# new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("^.*(\\A\\*[0-9A-Za-z]*\\:0[1-2]).*$", allel) )
head(new_odrd_srtd_sbst)
unique(new_odrd_srtd_sbst$allel)
# Add new vaiable allelGroup_specific.HLA.protein by copying values in existing variable allel
new_odrd_srtd_sbst <- transform(new_odrd_srtd_sbst, allelGroup_specific.HLA.protein = paste0(new_odrd_srtd_sbst$allel))
# Reorder variables
new_odrd_srtd_sbst_added_ordrd <- new_odrd_srtd_sbst[c(length(new_odrd_srtd_sbst), c(1:(length(new_odrd_srtd_sbst)-1)))]
# Extract the part of an allel name considering specific HLA protein only: A*NN:NN(NL).
# The final point for the pattern of interest is cleared here: http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
new_odrd_srtd_sbst_added_ordrd$allelGroup_specific.HLA.protein <- gsub("^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$", "\\1", new_odrd_srtd_sbst_added_ordrd$allelGroup_specific.HLA.protein)
# Diagnostic
is.data.frame(new_odrd_srtd_sbst_added_ordrd)
typeof(new_odrd_srtd_sbst_added_ordrd)
# Split dataframe into a list of data frames based on a value in allel variable
# http://stackoverflow.com/q/18527051
ndf <- split(new_odrd_srtd_sbst_added_ordrd, new_odrd_srtd_sbst_added_ordrd$allel)
ndf[[1]][1:36,1:25]
# Diagnostic
is.data.frame(ndf)
typeof(ndf)
class(ndf)
length(ndf)
# From this step I fail to step further...
答案 0 :(得分:3)
这是一种可能性,
l2 <- lapply(from, function(i) as.vector(c(as.character(i[1,1]), t(c(t(i[-1]))))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))
new_df <- setNames(data.frame(do.call(rbind, l2)),
c('ID', paste0('Var', seq(max(lengths(l2))-1))))
new_df
# ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1 a 1 2 3 4 6 . 8 9
#2 b 11 22 33 44 66 <NA> <NA> <NA>
你当然可以避免与i[1,1]
串联,这不符合你的要求,而是我的想法可以在这里应用。因此,通过避免这种情况并保持原始的转置功能,你得到了
l2 <- lapply(from, function(i) t(c(t(i))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))
new_df <- setNames(data.frame(do.call(rbind, l2)),
c('ID', paste0('Var', seq(max(lengths(l2))-1))))
new_df
# ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11
#1 a 1 2 3 a 4 6 a . 8 9
#2 b 11 22 33 b 44 66 <NA> <NA> <NA> <NA>
分三步尝试。
首先创建没有ID的数据框
l3 <- lapply(from, function(i) t(c(t(i[-1]))))
l3 <- lapply(l3, `length<-`, max(lengths(l3)))
new_df1 <- setNames(data.frame(do.call(rbind, l3)),
paste0('Var', seq(max(lengths(l3)))))
new_df1
# Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1 1 2 3 4 6 . 8 9
#2 11 22 33 44 66 <NA> <NA> <NA>
提取所有唯一ID
i1 <- sapply(from, function(i) unique(as.character(i[[1]])))
i1
#[1] "a" "b"
将他们捆绑在一起,
final_df1 <- cbind(IDs = i1, new_df1)
final_df1
# IDs Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1 a 1 2 3 4 6 . 8 9
#2 b 11 22 33 44 66 <NA> <NA> <NA>
答案 1 :(得分:3)
按照你的例子:
library(data.table)
# Create a list of data frames
from <- list(dfA, dfB)
from
[[1]]
IDA Var1 Var2 Var3
1 a 1 2 3
2 a 4 6
3 a . 8 9
[[2]]
IDB Var4 Var5 Var6
1 b 11 22 33
2 b 44
# rbind all the elements in the list of data.tables
out <- lapply(from, function(x){as.data.table(t(c(t(x))))} )
out <- rbindlist(out, fill = TRUE)
out
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
1: a 1 2 3 a 4 6 a . 8 9
2: b 11 22 33 b 44 66 NA NA NA NA
# If the files are stored on your drive, you can call them by bulk, and then `rbindlist`:
files <- list.files(pattern = ".csv")
files <- lapply(files, fread)
答案 2 :(得分:1)
我觉得您可以使用lapply
来迭代data.frame
中的所有list
来执行您在每个data.frame
上已经执行的操作。只需确保对每个向量进行子集化,使得输出中的列数等于data.frame
中具有最大元素数的元素数。通过展开每个max_length
,使用data.frame
获取元素数量,然后使用lengths
获取数量,可以获得此最大数量(此示例中为max
)最大元素。
max_length = max(lengths(lapply(from, unlist)))
do.call(rbind, lapply(from, function(df)
t(c(t(df)))[1:max_length]))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a" "1" "2" "3" "a" "4" " " "6" "a" "." "8" "9"
#[2,] "b" "11" "22" "33" "b" "44" " " "66" NA NA NA NA
<强>更新强>
do.call(rbind, lapply(from, function(df)
c(as.character(df[1,1]), t(c(t(df[,-1]))))[1:max_length]))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a" "1" "2" "3" "4" " " "6" "." "8" "9" NA NA
#[2,] "b" "11" "22" "33" "44" " " "66" NA NA NA NA NA