Question

我有一个如下所示的数据框：

>head(df)

chrom   pos strand  ref alt A_pos   A_neg   C_pos   C_neg   G_pos   G_neg   T_pos   T_neg
chr1    2283161 -   G   A   3       1       2       0       0       0        0      0
chr1    2283161 -   G   A   3       1       2       0       0       0        0      0
chr1    2283313 -   G   C   0       0       0       0       0       0        0      0
chr1    2283313 -   G   C   0       0       0       0       0       0        0      0
chr1    2283896 -   G   A   0       0       0       0       0       0        0      0
chr1    2283896 +   G   A   0       0       0       0       0       0        0      0

我想根据列'strand'、'ref'和'alt'的值，从第6至13列（A_pos ... T_neg）中提取值。例如，在第1行：strand ='-'，ref ='G'，alt ='A'，所以我应该从G_neg和A_neg中提取值。同样，在第6行：strand ='+'，ref ='G'，alt ='A'，所以我应该从G_pos和A_pos获取值。我打算在提取这些值后进行卡方检验（这些是我的观察值，我还有另一组预期值），但那是另一回事了。

所以逻辑有点像：

if(df$strand=="+")
do
  print:paste(df$ref,"pos",sep="_") #extract value in column df$ref_pos
  print:paste(df$alt,"pos",sep="_") #extract value in column df$alt_pos

else if(df$strand=="-")
do
  print:paste(df$ref,"neg",sep="_") #extract value in column df$ref_neg
  print:paste(df$alt,"neg",sep="_") #extract value in column df$alt_neg

在这里，我尝试使用粘贴'ref'和'alt'中的值来获取所需的列名。例如，如果strand ='+'和ref ='G'，它将从列G_pos中获取值。

数据框实际上很大，所以我排除了使用for循环。我不知道如何才能使代码尽可能高效。任何帮助/建议将不胜感激。

谢谢！

Answer 1

不是很优雅，但能完成任务：

# Suffix of the count column to read for each strand symbol.
strand.map <- c("-" = "_neg", "+" = "_pos")

# Keep the five descriptive columns and append, for every row, the counts found
# in the <ref><suffix> and <alt><suffix> columns.
cbind(
  df[1:5],
  do.call(
    rbind,
    lapply(
      # One single-row data frame per input row (chrom and pos dropped).
      split(df[-(1:2)], seq_len(nrow(df))),
      function(x) {
        # Look the suffix up by strand *label*. Indexing a named vector with a
        # factor uses the factor's integer codes, which only gives the right
        # answer when the levels happen to be ordered "-", "+".
        suffix <- strand.map[[as.character(x[["strand"]])]]
        c(
          ref = x[, paste0(x[["ref"]], suffix)],
          alt = x[, paste0(x[["alt"]], suffix)]
        )
      }
    )
  )
)

我们遍历数据框中的每一行，并应用一个根据strand，ref和alt提取值的函数。这会产生：

  chrom     pos strand ref alt ref alt
1  chr1 2283161      -   G   A   0   1
2  chr1 2283161      -   G   A   0   1
3  chr1 2283313      -   G   C   0   0
4  chr1 2283313      -   G   C   0   0
5  chr1 2283896      -   G   A   0   0
6  chr1 2283896      +   G   A   0   0

另一种方法是使用melt，但数据的格式让这相当麻烦：我们需要连续melt两次，并且需要创建一个唯一的id列，以便在计算完成后能够重建数据框。

# NOTE(review): melt()/dcast() with value.name/variable.name are presumably the
# reshape2 versions and must already be attached -- confirm.

# Unique row id so the wide layout can be rebuilt after the two melts.
df$id <- seq_len(nrow(df))

# First melt: one row per (input row, count column).
# Second melt: one row per (input row, count column, ref-or-alt base).
df.mlt <-
  melt(
    melt(df, id.vars = c("id", "chrom", "pos", "strand", "ref", "alt")),
    measure.vars = c("ref", "alt"), value.name = "base",
    variable.name = "alt_or_ref"
  )

# Keep only the rows whose count column is the one named by base + strand
# suffix, then spread ref/alt back into columns. The suffix is looked up by
# strand label: a factor index would be treated as integer codes.
dcast(
  subset(
    df.mlt,
    paste0(base, strand.map[as.character(strand)]) == variable
  ),
  id + chrom + pos + strand ~ alt_or_ref,
  value.var = "value"
)

产生：

  id chrom     pos strand ref alt
1  1  chr1 2283161      -   0   1
2  2  chr1 2283161      -   0   1
3  3  chr1 2283313      -   0   0
4  4  chr1 2283313      -   0   0
5  5  chr1 2283896      -   0   0
6  6  chr1 2283896      +   0   0

Answer 2

另一种看起来可行的方案，至少对样本数据有效：

# "neg"/"pos" suffix for every row, derived from the strand label.
tmp <- ifelse(as.character(DF$strand) == "-", "neg", "pos")

# Coerce only the count columns, and only once. Matrix-indexing the whole data
# frame would turn every column into strings on each call, leaving as.integer()
# to parse the numbers back.
counts <- as.matrix(DF[grep("^[ACGT]_(pos|neg)$", names(DF))])

# For ref and alt in turn, pick one cell per row: (row i, column <base>_<suffix>).
# vapply() pins the result to one integer per input row.
vapply(
  DF[c("ref", "alt")],
  function(x) {
    cells <- cbind(
      seq_len(nrow(DF)),
      match(paste(x, tmp, sep = "_"), colnames(counts))
    )
    as.integer(counts[cells])
  },
  integer(nrow(DF))
)
#     ref alt
#[1,]   0   1
#[2,]   0   1
#[3,]   0   0
#[4,]   0   0
#[5,]   0   0
#[6,]   0   0

DF：

# Sample data from the question: categorical columns are factors, position and
# per-base strand counts are integers, row names are the default 1..6.
DF <- data.frame(
  chrom  = factor(rep("chr1", 6L)),
  pos    = c(2283161L, 2283161L, 2283313L, 2283313L, 2283896L, 2283896L),
  strand = factor(c("-", "-", "-", "-", "-", "+"), levels = c("-", "+")),
  ref    = factor(rep("G", 6L)),
  alt    = factor(c("A", "A", "C", "C", "A", "A"), levels = c("A", "C")),
  A_pos  = c(3L, 3L, 0L, 0L, 0L, 0L),
  A_neg  = c(1L, 1L, 0L, 0L, 0L, 0L),
  C_pos  = c(2L, 2L, 0L, 0L, 0L, 0L),
  C_neg  = rep(0L, 6L),
  G_pos  = rep(0L, 6L),
  G_neg  = rep(0L, 6L),
  T_pos  = rep(0L, 6L),
  T_neg  = rep(0L, 6L)
)

Answer 3

另一种方式

# Extract the ref and alt counts for one row.
#
# x: a named vector holding "strand", "ref", "alt" and the <base>_pos /
#    <base>_neg count entries for a single row.
# Returns a 1 x 2 numeric matrix (ref count, alt count) without dimnames.
testFunc <- function(x) {
  # Anything other than "-" is treated as the positive strand.
  suffix <- "pos"
  if (x["strand"] == "-") {
    suffix <- "neg"
  }
  ref_count <- as.numeric(x[paste0(x["ref"], "_", suffix)])
  alt_count <- as.numeric(x[paste0(x["alt"], "_", suffix)])
  # deparse.level = 0 keeps the result free of column names.
  cbind(ref_count, alt_count, deparse.level = 0)
}
# Run testFunc over the rows of df, transpose so that each input row becomes
# one output row, then label the two columns.
temp <- apply(X = df, MARGIN = 1, FUN = testFunc)
temp <- t(temp)
colnames(temp) <- c("ref", "alt")

Answer 4

使用[非常]快速的data.table库：

library(data.table)

# Read the table with data.table's fast reader.
df <- fread("df.txt")

# Overwrite ref and alt in place with the name of the count column to read for
# that row: <base>_neg on the "-" strand, <base>_pos otherwise.
df[, `:=`(
  ref = ifelse(strand == "-", paste0(ref, "_", "neg"), paste0(ref, "_", "pos")),
  alt = ifelse(strand == "-", paste0(alt, "_", "neg"), paste0(alt, "_", "pos"))
)]

# strand is now encoded in ref/alt, so drop it.
df[, strand := NULL]

# Long format: one row per (input row, count column).
dfm <- melt(
  df,
  id.vars = c("chrom", "pos", "ref", "alt"),
  variable.name = "mycol",
  value.name = "value"
)

# Keep only the rows whose count column is the one named by ref or by alt.
dfm[mycol == ref | mycol == alt]

如何基于多个其他列提取列的值

4 个答案: