Question

我的数据框如下：

     chr   leftPos        ZScore1    ZScore2    ZScore3    ZScore4
      1     24352           34         43          19         43
      1     53534           2          1           -1         -9
      2      34            -15         7           -9         -18
      3     3443           -100        -4          4          -9
      3     3445           -100        -1          6          -1
      3     3667            5          -5          9           5
      3      7882          -8          -9          1           3

我想只保留那些具有相同chr且具有ZScore的相邻列的行在同一方向上的行。换句话说，如果该chr之前或之后的行具有相同的符号（正或负），则应该保留一行。我想在列名中运行ZS的所有列，以便输出最终只是满足每行标准的行数。

对于一列，代码应该导致：

     chr   leftPos         ZScore
      1     24352           34
      1     53534           2
      3     3443           -100
      3     3445           -100

但最终输出应该是

         ZScore1    ZScore2    ZScore3    ZScore4
nrow        4         6          4          4


 I have tried bits of code but Im not even really sure how to approach this.

我想我会按照chr进行分组，然后查看上面的行是否与当前行的正面或负面相同，然后查看下面的行是否与当前行的方向相同。然后移动到该chr的下一行。

Answer 1

使用包dplyr

尝试此操作

library(dplyr)

<强> 数据

df <- data.frame(chr=c(1, 1, 2, 3, 3, 3, 3),
             leftPos=c(24352, 53534, 34, 3443, 3445, 3667, 7882),
             ZScore=c(34, 2, -15, -100, -100, 5, -8))

<强> 代码

df %>% group_by(chr) %>% 
   filter(sign(ZScore)==sign(lag(ZScore)) | sign(ZScore)==sign(lead(ZScore))) %>% 
   ungroup

Answer 2

这是一个可能的data.table解决方案，它使用dev version

中的rleid

setDT(df)[, indx := .N, by = .(chr, rleid(sign(ZScore)))][indx > 1L]
#    chr leftPos ZScore indx
# 1:   1   24352     34    2
# 2:   1   53534      2    2
# 3:   3    3443   -100    2
# 4:   3    3445   -100    2

修改（根据新数据）

indx <- paste0('ZScore', 1:4)
temp <- setDT(df)[, lapply(.SD, function(x) rleid(sign(x))), .SDcols = indx, by = chr]

Res <- setNames(numeric(length(indx)), indx)
for (i in indx) Res[i] <- length(temp[, .I[.N > 1L], by = c("chr", i)]$V1)
Res
# ZScore1 ZScore2 ZScore3 ZScore4 
#       4       6       4       4

Answer 3

使用devel版本data.table的选项（类似于@dimitris_ps帖子中的方法）。安装devel版本的说明是here

library(data.table)#v1.9.5
na.omit(setDT(df)[, {tmp= sign(ZScore)
  .SD[tmp==shift(tmp) | tmp==shift(tmp, type='lead')] },
             by=chr])
#     chr leftPos ZScore
#1:   1   24352     34
#2:   1   53534      2
#3:   3    3443   -100
#4:   3    3445   -100

更新

我们可以创建一个函数

 f1 <- function(dat, ZCol){
    na.omit(as.data.table(dat)[, {tmp = sign(eval(as.name(ZCol)))
     .SD[tmp==shift(tmp) | tmp==shift(tmp, type='lead')]},
    by=chr])[, list(.N)]}

 nm1 <- paste0('ZScore', 1:4)
 setnames(do.call(cbind,lapply(nm1, function(x) f1(df1, x))), nm1)[]
 #   ZScore1 ZScore2 ZScore3 ZScore4
 #1:       4       6       4       4

或者我们可以使用set

 res <- as.data.table(matrix(0, ncol=4, nrow=1, 
                  dimnames=list(NULL, nm1)))
 for(j in seq_along(nm1)){
   set(res, i=NULL, j=j, value=f1(df1,nm1[j]))
  }
 res
 #   ZScore1 ZScore2 ZScore3 ZScore4
 #1:       4       6       4       4

数据

df <- structure(list(chr = c(1L, 1L, 2L, 3L, 3L, 3L, 3L),
leftPos = c(24352L, 
53534L, 34L, 3443L, 3445L, 3667L, 7882L), ZScore = c(34L, 2L, 
-15L, -100L, -100L, 5L, -8L)), .Names = c("chr", "leftPos", "ZScore"
), class = "data.frame", row.names = c(NA, -7L))

 df1 <- structure(list(chr = c(1L, 1L, 2L, 3L, 3L, 3L, 3L),
 leftPos = c(24352L, 
 53534L, 34L, 3443L, 3445L, 3667L, 7882L), ZScore1 = c(34L, 2L, 
 -15L, -100L, -100L, 5L, -8L), ZScore2 = c(43L, 1L, 7L, -4L, -1L, 
 -5L, -9L), ZScore3 = c(19L, -1L, -9L, 4L, 6L, 9L, 1L),
 ZScore4 = c(43L, 
 -9L, -18L, -9L, -1L, 5L, 3L)), .Names = c("chr", "leftPos",
  "ZScore1", "ZScore2", "ZScore3", "ZScore4"), class = "data.frame",
  row.names = c(NA, -7L))

Answer 4

这是一个完成你想要的循环。没有奇特的包装。只需检查后面的行和前面的行是否匹配 - 如果它们匹配，则继续，否则，剥去行并检查相同的位置。

chr = c(1,1,2,3,3,3,3,3)
mat = cbind(chr,rnorm(8))

i = 1
while(i <= nrow(mat)){
  if (mat[max(i-1,1),1] != mat[i,1] & mat[min(nrow(mat),i+1),1] != mat[i,1]){
    mat = mat[-i,]
  } else {
    i = i+1
  }
}

如何根据上面或下面的行条件删除行

4 个答案:

更新

数据