R将列中的值组合成行

时间:2018-02-09 15:28:08

标签: r

在每个基因分组(OG1、OG2)中,都包含相同的几种生物体。

每个生物体在给定组中具有一个或多个基因。然而,每种生物的基因数量因组而异。在下面的例子中,P.fragile在OG1中有3个基因,但在OG2中只有2个基因。

为了比较所有基因,我需要重新排列表:在一组中,生物体的每个基因应该与其他生物的所有基因组合连续列出。我提供了输出的外观。

生物体名称可以在输出中省略,因为 gene_ID 已包含生物体名称的一部分。我使用 dplyr 包对数据进行分组:

    # Attempted first step: group the rows of `data` by its `group` column.
    group_by(data,group)

但由于每个生物体在每个基因组中都有不同数量的基因,我被困住了。

输入表:

   # Example input: one row per gene, with the organism it belongs to and the
   # group (OG1 or OG2) it was assigned to. Gene counts per organism differ
   # between the two groups.
   df <- data.frame(
     gene_ID = c(
       paste0("PF_", 1:3), paste0("PI_", 1:3), paste0("PB_", 1:2),
       paste0("PFa_", 1:2), paste0("PIa_", 1:2), paste0("PBa_", 1:3)
     ),
     organism = rep(
       c("P. fragile", "P. inui", "P. berghei",
         "P. fragile", "P. inui", "P. berghei"),
       times = c(3, 3, 2, 2, 2, 3)
     ),
     group = rep(c("OG1", "OG2"), times = c(8, 7)),
     stringsAsFactors = FALSE
   )

输出表:

    group           
    OG1  PF_1   PI_1    PB_1
    OG1  PF_1   PI_1    PB_2
    OG1  PF_1   PI_2    PB_1
    OG1  PF_1   PI_2    PB_2
    OG1  PF_1   PI_3    PB_1
    OG1  PF_1   PI_3    PB_2
    OG1  PF_2   PI_1    PB_1
    OG1  PF_2   PI_1    PB_2
    OG1  PF_2   PI_2    PB_1
    OG1  PF_2   PI_2    PB_2
    OG1  PF_2   PI_3    PB_1
    OG1  PF_2   PI_3    PB_2
    OG1  PF_3   PI_1    PB_1
    OG1  PF_3   PI_1    PB_2
    OG1  PF_3   PI_2    PB_1
    OG1  PF_3   PI_2    PB_2
    OG1  PF_3   PI_3    PB_1
    OG1  PF_3   PI_3    PB_2
    OG2  PFa_1  PIa_1   PBa_1
    OG2  PFa_1  PIa_1   PBa_2
    OG2  PFa_1  PIa_1   PBa_3
    OG2  PFa_1  PIa_2   PBa_1
    OG2  PFa_1  PIa_2   PBa_2
    OG2  PFa_1  PIa_2   PBa_3
    OG2  PFa_2  PIa_1   PBa_1
    OG2  PFa_2  PIa_1   PBa_2
    OG2  PFa_2  PIa_1   PBa_3
    OG2  PFa_2  PIa_2   PBa_1
    OG2  PFa_2  PIa_2   PBa_2
    OG2  PFa_2  PIa_2   PBa_3

2 个答案:

答案 0 :(得分:2)

你可以这样做:先按生物体把表格拆分成若干子表,再按 group 把所有生物体的子表连接(join)起来。

# Split the table by organism, rename each piece's gene column after its
# organism, then join the pieces on `group`. Joining on `group` alone pairs
# every gene of one organism with every gene of the others within a group.
library(purrr) # imap() and reduce(); the package name is "purrr"
library(dplyr) # inner_join()

# NOTE: dplyr >= 1.1.0 warns about the many-to-many match, which is intended
# here; on those versions add relationship = "many-to-many" to silence it.
df1 %>%
  split(.$organism) %>%
  imap(~ setNames(.x[c(1, 3)], c(.y, "group"))) %>%
  reduce(~ inner_join(.x, .y, by = "group"))

# P. berghei group P. fragile P. inui   
# 1        PB_1   OG1       PF_1       PI_1
# 2        PB_1   OG1       PF_1       PI_2
# 3        PB_1   OG1       PF_1       PI_3
# 4        PB_1   OG1       PF_2       PI_1
# 5        PB_1   OG1       PF_2       PI_2
# 6        PB_1   OG1       PF_2       PI_3
# 7        PB_1   OG1       PF_3       PI_1
# 8        PB_1   OG1       PF_3       PI_2
# 9        PB_1   OG1       PF_3       PI_3
# 10       PB_2   OG1       PF_1       PI_1
# 11       PB_2   OG1       PF_1       PI_2
# 12       PB_2   OG1       PF_1       PI_3
# 13       PB_2   OG1       PF_2       PI_1
# 14       PB_2   OG1       PF_2       PI_2
# 15       PB_2   OG1       PF_2       PI_3
# 16       PB_2   OG1       PF_3       PI_1
# 17       PB_2   OG1       PF_3       PI_2
# 18       PB_2   OG1       PF_3       PI_3
# 19      PBa_1   OG2      PFa_1      PIa_1
# 20      PBa_1   OG2      PFa_1      PIa_2
# 21      PBa_1   OG2      PFa_2      PIa_1
# 22      PBa_1   OG2      PFa_2      PIa_2
# 23      PBa_2   OG2      PFa_1      PIa_1
# 24      PBa_2   OG2      PFa_1      PIa_2
# 25      PBa_2   OG2      PFa_2      PIa_1
# 26      PBa_2   OG2      PFa_2      PIa_2
# 27      PBa_3   OG2      PFa_1      PIa_1
# 28      PBa_3   OG2      PFa_1      PIa_2
# 29      PBa_3   OG2      PFa_2      PIa_1
# 30      PBa_3   OG2      PFa_2      PIa_2

数据

# Example input parsed from inline text: one row per gene with its organism
# and group. Organism names are quoted because they contain a space; the
# padding inside the quotes of 'P. inui   ' is part of the value.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
df1 <- read.table(text="gene_ID   organism     group
PF_1      'P. fragile'   OG1
PF_2      'P. fragile'   OG1
PF_3      'P. fragile'   OG1
PI_1      'P. inui   '   OG1
PI_2      'P. inui   '   OG1
PI_3      'P. inui   '   OG1
PB_1      'P. berghei'   OG1
PB_2      'P. berghei'   OG1
PFa_1     'P. fragile'   OG2
PFa_2     'P. fragile'   OG2
PIa_1     'P. inui   '   OG2
PIa_2     'P. inui   '   OG2
PBa_1     'P. berghei'   OG2
PBa_2     'P. berghei'   OG2
PBa_3     'P. berghei'   OG2", header = TRUE, stringsAsFactors = FALSE)

这是一个仅使用基本R和magrittr管道的版本:

# Same approach with base R functions chained by the magrittr pipe: split by
# organism, keep the gene and group columns (renaming the gene column to the
# organism's name), then merge all pieces on their shared `group` column.
df %>%
  split(.$organism) %>%
  Map(
    f = function(piece, organism_name) {
      setNames(piece[c(1, 3)], c(organism_name, "group"))
    },
    .,
    names(.)
  ) %>%
  Reduce(f = merge)

答案 1 :(得分:0)

它很冗长,但不依赖任何额外的包。

# Build every cross-organism gene combination per group with base R only.
#
# Input:  `df`, a data frame with columns `gene_ID`, `organism` and `group`.
# Output: `combined.genes`, a data frame with a `group` column followed by one
#         gene column per organism (named after the organism, in order of
#         first appearance in `df`). Each row takes one gene from every
#         organism of that group. Groups keep their order of first appearance
#         and rows are sorted by the gene columns within each group.
groups <- unique(df$group)
organisms <- unique(df$organism)

genes.by.group <- lapply(groups, function(current.group) {
  in.group <- df$group %in% current.group
  # Fixed factor levels give every group the same columns in the same order,
  # so the per-group tables can be stacked; an organism with no gene in this
  # group leaves that group without any combination.
  genes <- split(
    df$gene_ID[in.group],
    factor(df$organism[in.group], levels = organisms)
  )
  # expand.grid() produces the Cartesian product directly, for any number of
  # organisms and any number of genes per organism.
  combos <- expand.grid(
    genes,
    KEEP.OUT.ATTRS = FALSE,
    stringsAsFactors = FALSE
  )
  # Sort by every gene column, left to right; unname() keeps organism names
  # from being matched against order()'s own argument names.
  combos <- combos[do.call(order, unname(as.list(combos))), , drop = FALSE]
  data.frame(
    group = rep(current.group, nrow(combos)),
    combos,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
})

# Stack once at the end instead of growing the result inside a loop.
combined.genes <- if (length(genes.by.group) > 0) {
  do.call(rbind, genes.by.group)
} else {
  data.frame()
}
rownames(combined.genes) <- NULL
combined.genes