如何计算多列中具有相同数字的连续行的频率

时间:2015-12-05 13:17:49

标签: r

我有一个数据集如下:

# Example input: a structure() literal with class "data.frame" and
# row.names = c(NA, 20L).
# NOTE(review): the list below defines 20 columns (chr, leftPos, chr.1,
# leftPos.1, ASample, Sample1..Sample15), but `.Names` supplies only 19 names
# and omits `ASample`. From the fifth column on, the supplied names are
# therefore shifted by one relative to the list tags -- confirm before use.
structure(list(chr = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 
1, 0, 0, 1, 1, 1, 1), leftPos = c(240000, 1080000, 1200000, 1320000, 
1440000, 1800000, 2400000, 2520000, 3120000, 3360000, 3480000, 
3600000, 3720000, 4200000, 4560000, 4920000, 5040000, 5160000, 
5280000, 6e+06), chr.1 = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 
1, 0, 0, 1, 1, 1, 1, 1), leftPos.1 = c(240000, 1080000, 1200000, 
1320000, 1440000, 1800000, 2400000, 2520000, 3120000, 3360000, 
3480000, 3600000, 3720000, 4200000, 4560000, 4920000, 5040000, 
5160000, 5280000, 6e+06), ASample = c(0, 
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0), Sample1 = c(0, 
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1), Sample2 = c(0, 
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1), Sample3 = c(0, 
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1), Sample4 = c(0, 
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample5 = c(0, 
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample6 = c(0, 
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample7 = c(0, 
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1), Sample8 = c(0, 
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample9 = c(0, 
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1), Sample10 = c(0, 
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample11 = c(0, 
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample12 = c(0, 
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample13 = c(0, 
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0), Sample14 = c(0, 
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample15 = c(0, 
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1)), .Names = c("chr", 
"leftPos", "chr.1", "leftPos.1", "Sample1", 
"Sample2", 
"Sample3", "Sample4", 
"Sample5", "Sample6", 
"Sample7", "Sample8", 
"Sample9", "Sample10", 
"Sample11", "Sample12", 
"Sample13", "Sample14", 
"Sample15"), row.names = c(NA, 
20L), class = "data.frame")

我需要计算每列的行数,其中有多个相同的1或-1

我希望能够计算每列的连续行数,按chr分组,在染色体中有三个连续的1或-1(名为chr的列)。

理想的输出类似于(不是从上面的输入数据中获取)

chr numberOfConsecutive1s FreqSample1  FreqSample2  FreqSample3 etc
1          2                3           2               14
1          3                5           2               2
1          4                5           0               6
1          5                4           3               5
1          6                3           0               3
1          7                7           5               7
1          8                5           0               2
1          9                54          2               6
1          10               34          77              7
2          2                6           4               2
2          3                23          34              34
2          4                5           37              2
2          5                55          24              22
2          6                2           0               11
2          7                3           14              5
2          8                2           5               77
2          9                5           23              34
2          10               5           11              34
3          1                32          0               2

到目前为止,我已经尝试过以下操作,只需将非连续的1转换为0,所以我只剩下连续的1。我不知道如何根据所需的输出计算它们。

dx<-DAT_list2res
# Find the rows of one column that are NOT part of a run of consecutive 1s.
#
# A row belongs to a run when it and a neighbouring row both hold the value 1
# in column `colNr` and both share the same value in the first column of `dx`
# (the grouping column). Only the value 1 is matched; -1 is not.
#
# Args:
#   colNr: index (or name) of the column of `dx` to inspect.
#   dx:    data frame whose first column is the grouping column.
#
# Returns:
#   Integer vector of row indices of `dx` that are outside every such run.
#   Empty when `dx` has no rows.
f0 <- function(colNr, dx) {
  col <- dx[, colNr]
  ones <- which(col == 1)                   # Rows holding a 1.
  same_value <- which(diff(col) == 0)       # Row i equals row i + 1.
  same_group <- which(diff(dx[, 1]) == 0)   # Rows i and i + 1 share a group.
  # Row i starts a pair when row i + 1 is a 1, both rows are equal, and both
  # rows are in the same group.
  pair_starts <- intersect(ones - 1, intersect(same_value, same_group))
  in_run <- union(pair_starts, pair_starts + 1)
  # seq_len() rather than 1:nrow(dx): the latter is c(1, 0) for zero rows.
  setdiff(seq_len(nrow(dx)), in_run)
}
# Set to 0, in every column from the third onward, each entry whose row index
# is returned by f0() for that column.
#
# Args:
#   dx: data frame; columns 1 and 2 are left untouched.
#
# Returns:
#   The modified copy of `dx`.
g <- function(dx) {
  # seq_len(ncol(dx))[-(1:2)] is empty when dx has fewer than three columns,
  # whereas 3:ncol(dx) would count down and index a missing column.
  for (i in seq_len(ncol(dx))[-(1:2)]) {
    dx[f0(i, dx), i] <- 0
  }
  dx
}
dx<-g(dx)

修改

我也按照bramtayl的建议尝试了这个:

# Attempt following bramtayl's suggestion. The asker reports that this fails
# with an index-out-of-range error.
# NOTE(review): the column positions 5:190 passed to gather() are hard-coded;
# presumably they must match the width of `consecFreq` -- confirm.
result = 
  consecFreq %>%
  # Drop the original `chr` column.
  select(-chr) %>%
  # Stack the columns at positions 5:190 into a key column `variable` and a
  # value column that is (re)named `chr`.
  gather(variable, chr,  5:190) %>%
  group_by(variable) %>%
  # ID: compare each value with the previous one (lag), turn the leading NA
  # into FALSE, and take the cumulative sum, so ID increases by one every time
  # the value changes within a `variable` group.
  mutate(ID = 
           chr %>%
           lag %>%
           `!=`(chr) %>%
           plyr::mapvalues(NA, FALSE) %>%
           cumsum) %>%
  # Number of rows per (variable, chr, ID) combination, stored in `n`.
  count(variable, chr, ID) %>%
  rename(numberOfConsecutive1s = n) %>%
  # Number of (variable, chr) combinations per row count.
  count(variable, chr, numberOfConsecutive1s) %>%
  # One column per `variable`; missing combinations are filled with 0.
  spread(variable, n, fill = 0)

但它给了我一个"索引超出范围"(index out of range)错误。如果我去掉`spread`那一行,也会得到奇怪的输出,所以我不确定这是否是答案。

2 个答案:

答案 0 :(得分:1)

我想你想要这样的东西:

library(dplyr)
library(tidyr)

# Chunks (runs of rows) must be strictly longer than this to be kept.
min_chunk_length <- 1

# Per chromosome and sample, keep the rows whose value is 1 or -1 and that
# sit in a chunk longer than `min_chunk_length`, then count them and spread
# the counts into one column per sample.
#
# `non_zero` is TRUE for both 1 and -1, so a chunk may mix the two values.
# NOTE(review): the data are still grouped by `chunk_ID` when count() runs,
# so the tally may be per chunk rather than per chromosome/sample -- confirm
# against the desired output.
result <-
  data %>%
  rename(chromosome = chr) %>%
  select(chromosome, Sample1:Sample15) %>%
  gather(sample, value, Sample1:Sample15) %>%
  group_by(chromosome, sample) %>%
  mutate(
    non_zero = value %in% c(1, -1),
    # chunk_ID increases by one each time `non_zero` flips within a group.
    # lag() yields NA for the first row, which coalesce() turns into FALSE.
    chunk_ID =
      non_zero %>%
      lag() %>%
      `!=`(non_zero) %>%
      coalesce(FALSE) %>%
      cumsum()
  ) %>%
  # Keep only the rows that are 1 or -1. The original `filter(non_zero = TRUE)`
  # passed a named argument instead of a condition, which dplyr rejects.
  filter(non_zero) %>%
  group_by(chromosome, sample, chunk_ID) %>%
  mutate(length_of_chunk = n()) %>%
  filter(length_of_chunk > min_chunk_length) %>%
  count(chromosome, sample) %>%
  spread(sample, n, fill = 0)

答案 1 :(得分:1)

**REVISED**

根据澄清,这种方法对每个染色体使用`rle`函数来查找连续1或-1的游程(run),然后用`table`来统计每种长度的游程个数。对于没有某个长度计数的样本,结果会是`NA`,因此代码的最后一步会将`NA`转换为0。最后,您的`structure`输入似乎存在问题,因为`structure`的`.Names`部分缺少`Cytospongex10_SLX.9395.FastSeqK.fq.gz.res`。这会导致所有列名称发生移位,最后一列的名称变为`NA`,这可能会导致执行中出现问题。

下面的代码为输入数据(即`data` data.frame)指定了正确的列名,然后如上所述计算频率。

df

# Page text fused onto this line by the extractor: "Similar to the above, but
# using" -- presumably it introduces the `dplyr` fragment that follows this
# block rather than the base-R code below.

# Assign the full set of 20 column names, including the fifth one.
colnames(data) <- c(
  "chr", "leftPos", "chr.1", "leftPos.1",
  "Cytospongex10_SLX.9395.FastSeqK.fq.gz.res",
  "Sample1", "Sample2", "Sample3", "Sample4", "Sample5",
  "Sample6", "Sample7", "Sample8", "Sample9", "Sample10",
  "Sample11", "Sample12", "Sample13", "Sample14", "Sample15"
)

chr_labels <- sort(unique(data$chr))
key_cols <- c("chr", "numberOfConsecutive1s")

# Zero-row template so that rbind() always returns the three expected columns,
# even when no chromosome has a run of 1 / -1 for a sample.
empty_freqs <- data.frame(
  chr = chr_labels[0],
  numberOfConsecutive1s = integer(0),
  Freq = integer(0)
)

# One row per (chr, run length); one "Freq<sample>" column per sample column
# (every column after the first five).
sampl_freqs <- NULL
for (sampl in colnames(data)[-(1:5)]) {
  per_chr <- lapply(chr_labels, function(chr) {
    # Runs of identical consecutive values within this chromosome.
    runs <- rle(data[data$chr == chr, sampl])
    # `lengths` spelled out; `runs$length` only worked via partial matching.
    run_lengths <- runs$lengths[runs$values %in% c(-1, 1)]
    counts <- table(run_lengths)
    # Built column by column so that an empty table gives a zero-row frame
    # instead of a "differing number of rows" error.
    data.frame(
      chr = rep(chr, length(counts)),
      numberOfConsecutive1s = as.integer(names(counts)),
      Freq = as.vector(counts)
    )
  })
  freqs <- do.call(rbind, c(list(empty_freqs), per_chr))
  colnames(freqs)[3] <- paste0("Freq", sampl)

  # Full outer join keeps run lengths seen in only some samples. Starting
  # from NULL avoids a dummy seed row that would have to be removed later.
  sampl_freqs <- if (is.null(sampl_freqs)) {
    freqs
  } else {
    merge(sampl_freqs, freqs, by = key_cols, all = TRUE)
  }
}

# To convert NA's to 0 (only in the count columns).
freq_cols <- setdiff(colnames(sampl_freqs), key_cols)
sampl_freqs[freq_cols] <- lapply(sampl_freqs[freq_cols], function(x) {
  x[is.na(x)] <- 0L
  x
})

dplyr