我有一个数据集如下:
structure(list(chr = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
1, 0, 0, 1, 1, 1, 1), leftPos = c(240000, 1080000, 1200000, 1320000,
1440000, 1800000, 2400000, 2520000, 3120000, 3360000, 3480000,
3600000, 3720000, 4200000, 4560000, 4920000, 5040000, 5160000,
5280000, 6e+06), chr.1 = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
1, 0, 0, 1, 1, 1, 1, 1), leftPos.1 = c(240000, 1080000, 1200000,
1320000, 1440000, 1800000, 2400000, 2520000, 3120000, 3360000,
3480000, 3600000, 3720000, 4200000, 4560000, 4920000, 5040000,
5160000, 5280000, 6e+06), ASample = c(0,
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0), Sample1 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1), Sample2 = c(0,
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1), Sample3 = c(0,
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1), Sample4 = c(0,
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample5 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample6 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample7 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1), Sample8 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample9 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1), Sample10 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample11 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample12 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample13 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0), Sample14 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample15 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1)), .Names = c("chr",
"leftPos", "chr.1", "leftPos.1", "Sample1",
"Sample2",
"Sample3", "Sample4",
"Sample5", "Sample6",
"Sample7", "Sample8",
"Sample9", "Sample10",
"Sample11", "Sample12",
"Sample13", "Sample14",
"Sample15"), row.names = c(NA,
20L), class = "data.frame")
我需要计算每列的行数,其中有多个相同的1或-1
我希望能够计算每列的连续行数,按chr分组,在染色体中有三个连续的1或-1(名为chr
的列)。
理想的输出类似于(不是从上面的输入数据中获取)
chr numberOfConsecutive1s FreqSample1 FreqSample2 FreqSample3 etc
1 2 3 2 14
1 3 5 2 2
1 4 5 0 6
1 5 4 3 5
1 6 3 0 3
1 7 7 5 7
1 8 5 0 2
1 9 54 2 6
1 10 34 77 7
2 2 6 4 2
2 3 23 34 34
2 4 5 37 2
2 5 55 24 22
2 6 2 0 11
2 7 3 14 5
2 8 2 5 77
2 9 5 23 34
2 10 5 11 34
3 1 32 0 2
到目前为止,我已经尝试过以下操作,只需将非连续的1转换为0,所以我只剩下连续的1。我不知道如何根据所需的输出计算它们。
dx<-DAT_list2res
f0 <- function( colNr, dx )
{
col <- dx[,colNr]
n1 <- which( col == 1 ) # The `1`-rows.
d0 <- which( diff(col) == 0 ) # Consecutive entries are equal.
dc0 <- which( diff(dx[,1]) == 0 ) # Same chromosome.
m <- intersect( n1-1, intersect( d0, dc0 ) )
return ( setdiff( 1:nrow(dx), union(m,m+1) ) )
}
g <- function( dx )
{
for ( i in 3:ncol(dx) ) { dx[f0(i,dx),i] <- 0 }
return ( dx )
}
dx<-g(dx)
修改
我也按照bramtayl的建议尝试了这个:
result =
consecFreq %>%
select(-chr) %>%
gather(variable, chr, 5:190) %>%
group_by(variable) %>%
mutate(ID =
chr %>%
lag %>%
`!=`(chr) %>%
plyr::mapvalues(NA, FALSE) %>%
cumsum) %>%
count(variable, chr, ID) %>%
rename(numberOfConsecutive1s = n) %>%
count(variable, chr, numberOfConsecutive1s) %>%
spread(variable, n, fill = 0)
但它给了我一个超出范围的指数&#39;错误。如果我忽略了扩散线,我也得到一个奇数输出,所以我不确定这是答案
答案 0 :(得分:1)
我想你想要这样的东西:
library(dplyr)
library(tidyr)
min_chunk_length = 1
result =
data %>%
rename(chromosome = chr) %>%
select(chromosome, Sample1:Sample15) %>%
gather(sample, value, Sample1:Sample15) %>%
group_by(chromosome, sample) %>%
mutate(non_zero = value %in% c(1, -1),
chunk_ID =
non_zero %>%
lag %>%
`!=`(non_zero) %>%
plyr::mapvalues(NA, FALSE) %>%
cumsum) %>%
filter(non_zero = TRUE) %>%
group_by(chromosome, sample, chunk_ID) %>%
mutate(length_of_chunk = n()) %>%
filter(length_of_chunk > min_chunk_length) %>%
count(chromosome, sample) %>%
spread(sample, n, fill = 0)
答案 1 :(得分:1)
<强> REVISED 强>
根据澄清,这种方法使用每个染色体的rle
函数来查找连续1或-1的运行,然后table
来计算每个值的运行次数。对于没有特定值计数的样本,这会给NA
,因此如果有用,代码的最后一行会将NA's
转换为0's
。最后,您的structure
输入似乎存在问题,因为Cytospongex10_SLX.9395.FastSeqK.fq.gz.res
的{{1}}部分缺少.Names
。这会导致所有列名称被移位,最后一列名称为structure
,这可能会导致执行中出现问题。
下面的代码为输入数据指定了正确的名称(在NA
data.frame
中),然后如上所述计算频率。
df
与上述类似,但使用 colnames(data) <- c("chr",
"leftPos", "chr.1", "leftPos.1", "Cytospongex10_SLX.9395.FastSeqK.fq.gz.res", "Sample1",
"Sample2",
"Sample3", "Sample4",
"Sample5", "Sample6",
"Sample7", "Sample8",
"Sample9", "Sample10",
"Sample11", "Sample12",
"Sample13", "Sample14",
"Sample15")
chr_labels <- sort(unique(data$chr))
sampl_freqs <- data.frame(chr=1, numberOfConsecutive1s=1, count=0)
for( sampl in colnames(data)[-(1:5)]) {
freqs <- data.frame()
for( chr in chr_labels ) {
runs <- rle(data[data$chr == chr,sampl])
freqs_chr <- data.frame(chr=chr, table(runs$length[runs$values %in% c(-1,1)], dnn = "numberOfConsecutive1s") )
freqs <- rbind(freqs, freqs_chr)
}
sampl_freqs <- merge.data.frame(sampl_freqs, freqs, by = c("chr","numberOfConsecutive1s"), all=TRUE)
colnames(sampl_freqs) <- c(head(colnames(sampl_freqs),-1),paste("Freq",sampl,sep=""))
}
# clean up from sampl_freqs definition
sampl_freqs <- sampl_freqs[,-3]
# To convert NA's to 0
sampl_freqs <- data.frame(sampl_freqs[,1:2], sapply(sampl_freqs[,-(1:2)], function(x) ifelse(is.na(x), 0, x)))
dplyr