每个基因分组(OG1、OG2)中都包含相同的一组生物体。
每个生物体在给定组中具有一个或多个基因。然而,每种生物的基因数量因组而异。在下面的例子中,P.fragile在OG1中有3个基因,但在OG2中只有2个基因。
为了比较所有基因,我需要重新排列这张表:在每个组内,把一种生物的每个基因与其他生物的所有基因逐一组合,并逐行列出。期望的输出格式见下文。
生物体名称可以在输出中省略,因为gene_ID已包含生物体名称的一部分。我使用dplyr包对数据进行分组:
# Group the rows of `data` by its `group` column (dplyr).
group_by(.data = data, group)
但由于每个生物体在每个基因组中都有不同数量的基因,我被困住了。
输入表:
# Example input: one row per gene, with the organism it belongs to and the
# group (OG1 / OG2) it was assigned to. All three columns are character.
df <- data.frame(
  gene_ID = c(
    "PF_1", "PF_2", "PF_3",
    "PI_1", "PI_2", "PI_3",
    "PB_1", "PB_2",
    "PFa_1", "PFa_2",
    "PIa_1", "PIa_2",
    "PBa_1", "PBa_2", "PBa_3"
  ),
  # Gene counts per organism differ between the two groups.
  organism = rep(
    c("P. fragile", "P. inui", "P. berghei", "P. fragile", "P. inui", "P. berghei"),
    times = c(3, 3, 2, 2, 2, 3)
  ),
  group = rep(c("OG1", "OG2"), times = c(8, 7)),
  stringsAsFactors = FALSE
)
输出表:
group
OG1 PF_1 PI_1 PB_1
OG1 PF_1 PI_1 PB_2
OG1 PF_1 PI_2 PB_1
OG1 PF_1 PI_2 PB_2
OG1 PF_1 PI_3 PB_1
OG1 PF_1 PI_3 PB_2
OG1 PF_2 PI_1 PB_1
OG1 PF_2 PI_1 PB_2
OG1 PF_2 PI_2 PB_1
OG1 PF_2 PI_2 PB_2
OG1 PF_2 PI_3 PB_1
OG1 PF_2 PI_3 PB_2
OG1 PF_3 PI_1 PB_1
OG1 PF_3 PI_1 PB_2
OG1 PF_3 PI_2 PB_1
OG1 PF_3 PI_2 PB_2
OG1 PF_3 PI_3 PB_1
OG1 PF_3 PI_3 PB_2
OG2 PFa_1 PIa_1 PBa_1
OG2 PFa_1 PIa_1 PBa_2
OG2 PFa_1 PIa_1 PBa_3
OG2 PFa_1 PIa_2 PBa_1
OG2 PFa_1 PIa_2 PBa_2
OG2 PFa_1 PIa_2 PBa_3
OG2 PFa_2 PIa_1 PBa_1
OG2 PFa_2 PIa_1 PBa_2
OG2 PFa_2 PIa_1 PBa_3
OG2 PFa_2 PIa_2 PBa_1
OG2 PFa_2 PIa_2 PBa_2
OG2 PFa_2 PIa_2 PBa_3
答案 0 :(得分:2)
你可以这样做:先按生物把表拆分成若干子表,再按 group 把这些子表连接(join)起来,从而得到每个组内所有基因的组合。
# Build every cross-organism gene combination within each group.
# `inner_join()` comes from dplyr; `imap()`, `reduce()` and `%>%` from purrr.
library(dplyr)
library(purrr)
df1 %>%
  # One sub-table per organism (a list named by organism).
  split(.$organism) %>%
  # Keep gene_ID (column 1) and group (column 3); rename the gene_ID column
  # to the organism name so each sub-table contributes a distinct column.
  imap(~ setNames(.x[c(1, 3)], c(.y, "group"))) %>%
  # Joining on `group` pairs every gene of one organism with every gene of the
  # others inside the same group. The key is given explicitly instead of being
  # guessed from shared column names.
  # NOTE(review): recent dplyr versions may warn about a many-to-many
  # relationship here; that fan-out is the intended result.
  reduce(inner_join, by = "group")
# P. berghei group P. fragile P. inui
# 1 PB_1 OG1 PF_1 PI_1
# 2 PB_1 OG1 PF_1 PI_2
# 3 PB_1 OG1 PF_1 PI_3
# 4 PB_1 OG1 PF_2 PI_1
# 5 PB_1 OG1 PF_2 PI_2
# 6 PB_1 OG1 PF_2 PI_3
# 7 PB_1 OG1 PF_3 PI_1
# 8 PB_1 OG1 PF_3 PI_2
# 9 PB_1 OG1 PF_3 PI_3
# 10 PB_2 OG1 PF_1 PI_1
# 11 PB_2 OG1 PF_1 PI_2
# 12 PB_2 OG1 PF_1 PI_3
# 13 PB_2 OG1 PF_2 PI_1
# 14 PB_2 OG1 PF_2 PI_2
# 15 PB_2 OG1 PF_2 PI_3
# 16 PB_2 OG1 PF_3 PI_1
# 17 PB_2 OG1 PF_3 PI_2
# 18 PB_2 OG1 PF_3 PI_3
# 19 PBa_1 OG2 PFa_1 PIa_1
# 20 PBa_1 OG2 PFa_1 PIa_2
# 21 PBa_1 OG2 PFa_2 PIa_1
# 22 PBa_1 OG2 PFa_2 PIa_2
# 23 PBa_2 OG2 PFa_1 PIa_1
# 24 PBa_2 OG2 PFa_1 PIa_2
# 25 PBa_2 OG2 PFa_2 PIa_1
# 26 PBa_2 OG2 PFa_2 PIa_2
# 27 PBa_3 OG2 PFa_1 PIa_1
# 28 PBa_3 OG2 PFa_1 PIa_2
# 29 PBa_3 OG2 PFa_2 PIa_1
# 30 PBa_3 OG2 PFa_2 PIa_2
数据:
# Example data for the answer: one row per gene with its organism and group.
# Organism names are quoted because they contain a space. They are written
# without trailing blanks, since whitespace inside quotes is kept verbatim by
# read.table() and would end up in the organism names (and column names).
df1 <- read.table(text="gene_ID organism group
PF_1 'P. fragile' OG1
PF_2 'P. fragile' OG1
PF_3 'P. fragile' OG1
PI_1 'P. inui' OG1
PI_2 'P. inui' OG1
PI_3 'P. inui' OG1
PB_1 'P. berghei' OG1
PB_2 'P. berghei' OG1
PFa_1 'P. fragile' OG2
PFa_2 'P. fragile' OG2
PIa_1 'P. inui' OG2
PIa_2 'P. inui' OG2
PBa_1 'P. berghei' OG2
PBa_2 'P. berghei' OG2
PBa_3 'P. berghei' OG2", header = TRUE, stringsAsFactors = FALSE)
这是一个仅使用基础 R 和 magrittr 管道的版本:
# Base R + magrittr variant: split by organism, reshape each piece, then
# merge the pieces on their shared `group` column.
df %>%
  split(.$organism) %>%
  Map(
    f = function(piece, organism_name) {
      # Keep columns 1 and 3 and label the first one with the organism name.
      picked <- piece[c(1, 3)]
      names(picked) <- c(organism_name, "group")
      picked
    },
    .,
    names(.)
  ) %>%
  Reduce(f = merge)
答案 1 :(得分:0)
它比较冗长,但不依赖任何其他包。
# Base-R only: for every group, list all combinations that take one gene from
# each organism, then stack the per-group tables into `combined.genes`.
#
# Reads `df` with columns gene_ID, organism and group.
# Result columns: `group`, then one column per organism (named after the
# organism, in order of first appearance). Rows are sorted by the organism
# columns from left to right.
# Works for any number of organisms per group, not just three.
# NOTE(review): stacking with rbind() assumes every group contains the same
# set of organisms, as stated in the question.
groups <- unique(as.character(df$group))

per.group <- lapply(groups, function(g) {
  current.group <- df[df$group == g, c("gene_ID", "organism")]
  organisms <- as.character(current.group$organism)

  # Genes per organism; explicit factor levels keep the organisms in order of
  # appearance instead of alphabetical order.
  genes <- split(
    as.character(current.group$gene_ID),
    factor(organisms, levels = unique(organisms))
  )

  # Cartesian product of the per-organism gene vectors.
  combos <- expand.grid(genes, KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE)

  # Sort by all organism columns, left to right, however many there are.
  combos <- combos[do.call(order, unname(as.list(combos))), , drop = FALSE]

  # check.names = FALSE keeps organism names such as "P. fragile" intact.
  data.frame(group = g, combos, check.names = FALSE, stringsAsFactors = FALSE)
})

# Bind once at the end instead of growing a data frame inside a loop.
combined.genes <- if (length(per.group) > 0) {
  do.call(rbind, per.group)
} else {
  data.frame()
}
rownames(combined.genes) <- NULL
combined.genes