我有一个数据框,包含两列:一列是基因符号,另一列是功能途径。途径列中有重复值,因为每个途径都包含许多基因。我想重新整理这个数据集,使每一列对应一个单独的途径,列中的每一行是属于该途径的一个基因。
初始数据框:
# Input in long format: one row per (pathway, gene symbol) pair, so pathway
# values repeat (four genes in p1, three in p2).
data.frame(pathway = c("p1", "p1", "p1", "p1", "p2", "p2", "p2"),
gene.symbol = c("G1", "G2", "G3", "G4", "G33", "G43", "G10"))
所需数据框:
# Desired output in wide format: one column per pathway, one gene per row.
# p2 has only three genes, so its fourth row is the empty string "".
data.frame(p1 = c("G1", "G2", "G3", "G4"), p2 = c("G33", "G43", "G10",
""))
我知道各列的长度并不都相同;缺少的位置用空白值填充比用 NA 更好。
答案 0 :(得分:0)
这可能看起来有点复杂,但它先把数据转换为列表,然后再转回 data.frame,从而得到所需的输出:
# Build a named list holding one character vector of gene symbols per pathway.
# `df` is the two-column data frame from the question (pathway, gene.symbol).
# Coerce to character so the list holds plain strings even if the column is a
# factor.
df$gene.symbol <- as.character(df$gene.symbol)
# split() does the grouping in one call instead of growing a list in a loop.
# Setting the factor levels to unique() keeps the pathways in order of first
# appearance (split() would otherwise sort them). Rows whose pathway is NA are
# dropped; with `df$pathway == pw` they showed up as NA genes in every pathway.
pw_order <- unique(as.character(df$pathway))
pw_list <- split(df$gene.symbol, factor(df$pathway, levels = pw_order))
pw_list
$p1
[1] "G1" "G2" "G3" "G4"
$p2
[1] "G33" "G43" "G10"
# Lay the per-pathway gene vectors out as columns of a character matrix that
# is pre-filled with "", so shorter pathways end in blanks rather than NA.
gene_counts <- lengths(pw_list)
reordered <- matrix("", nrow = max(0L, gene_counts), ncol = length(pw_list))
colnames(reordered) <- names(pw_list)
for (pw in names(pw_list)) {
  n <- length(pw_list[[pw]])
  # seq_len(n) is empty when n is 0, whereas 1:n would count down to c(1, 0).
  reordered[seq_len(n), pw] <- pw_list[[pw]]
}
# stringsAsFactors = FALSE keeps the columns as character on R < 4.0 as well.
reordered <- as.data.frame(reordered, stringsAsFactors = FALSE)
reordered
p1 p2
1 G1 G33
2 G2 G43
3 G3 G10
4 G4
**修改**
稍微简洁的版本:
# Concise variant: group the genes by pathway, stretch every group to the
# length of the longest one, and bind the groups together as columns.
# `df` is the two-column data frame from the question (pathway, gene.symbol).
df$gene.symbol <- as.character(df$gene.symbol)
# Levels in order of first appearance keep the columns in input order.
pw_list <- split(
  df$gene.symbol,
  factor(df$pathway, levels = unique(as.character(df$pathway)))
)
n_rows <- max(0L, lengths(pw_list))
# Indexing past the end of a vector yields NA, which is blanked out below.
# lapply() always returns a list, so every pathway stays its own column;
# sapply() collapsed the result to a plain vector when each pathway had
# exactly one gene, which made the names() assignment fail.
padded <- lapply(pw_list, function(genes) genes[seq_len(n_rows)])
reordered <- data.frame(padded, check.names = FALSE, stringsAsFactors = FALSE)
reordered[is.na(reordered)] <- ""
names(reordered) <- names(pw_list)
答案 1 :(得分:0)
这是另一种选择。
这是代码。
# Example input: long format, one row per (pathway, gene) pair.
# stringsAsFactors = FALSE keeps the symbols as plain strings on R < 4.0 too,
# so they can be padded with "" below.
mydf <- data.frame(pathway = c("p1", "p1", "p1", "p1", "p2", "p2", "p2"),
                   gene.symbol = c("G1", "G2", "G3", "G4", "G33", "G43", "G10"),
                   stringsAsFactors = FALSE)

# Resize a vector to length `n`, filling any newly added positions.
#
# x:    vector to resize.
# n:    target length; defaults to the script-level `max.length`, so the
#       original one-argument call keeps working.
# fill: value for the added positions; the default NA matches what a bare
#       `length(x) <- n` produces.
# Returns `x` with length `n` (truncated when `n` is shorter than `x`).
set_to_max_length <- function(x, n = max.length, fill = NA) {
  n_orig <- length(x)
  length(x) <- n
  # Only positions beyond the original length are overwritten.
  x[seq_len(n) > n_orig] <- fill
  x
}

# 1. split into a named list: one vector of genes per pathway
mydf.split <- split(mydf$gene.symbol, mydf$pathway)
# 2.a length of the longest pathway
max.length <- max(lengths(mydf.split))
# 2.b pad each pathway to that length; the question asks for blanks, not NA
mydf.split.2 <- lapply(mydf.split, set_to_max_length,
                       n = max.length, fill = "")
# 3. combine back into a data frame; check.names = FALSE leaves the pathway
#    names exactly as they are in the data
mydf.wide <- data.frame(mydf.split.2, check.names = FALSE,
                        stringsAsFactors = FALSE)
mydf.wide
**修改**
这是使用tidyverse的另一种选择 - 更简洁:
library(tidyverse)
# Example input: long format, one row per (pathway, gene) pair.
# stringsAsFactors = FALSE keeps the symbols as plain strings on R < 4.0 too,
# so the "" fill value below has a matching type.
mydf <- data.frame(pathway = c("p1", "p1", "p1", "p1", "p2", "p2", "p2"),
                   gene.symbol = c("G1", "G2", "G3", "G4", "G33", "G43", "G10"),
                   stringsAsFactors = FALSE)
mydf %>%
  group_by(pathway) %>%
  # position of each gene within its pathway; this becomes the output row
  mutate(rownum = row_number()) %>%
  ungroup() %>%
  # pivot_wider() replaces the superseded spread(); values_fill puts "" rather
  # than NA where a pathway has fewer genes than the longest one
  pivot_wider(names_from = pathway, values_from = gene.symbol,
              values_fill = list(gene.symbol = "")) %>%
  # drop the helper column by name rather than by position
  select(-rownum)