如何在数据框中按组计算相邻行之间的差值以及因子水平的数量

时间:2014-04-08 19:49:56

标签: r

我有一个很大的数据框。我想只针对某一列等于 1 的行(例如 df$X1x8 == 1),计算 "pos" 列中当前行与上一行之间的差值,然后针对 "chr" 列的每个水平求出这些差值的最大值,并求出 nlevels(df$bin)(即 bin 的水平数)。希望 R 专家能帮我解决这个问题,非常感谢。示例:以 X1x8 列为例(第 4 列到第 8 列的取值只能是 0 或 1)

# Example data: one row per marker (`mi`) with its chromosome (`chr`) and
# position (`pos`), five 0/1 indicator columns (X1x3 ... X2x4), plus `sum`
# and `bin`. Parsed from inline whitespace-separated text.
df <- read.table(
  text = " mi   chr pos X1x3    X1x4    X1x8    X2x3    X2x4    sum bin
S000001 1   12578   0   1   1   0   1   3   1
S000003 1   96483   0   0   1   0   0   1   2
S000007 1   238450  1   0   1   1   0   3   3
S000010 1   298018  0   0   1   0   0   1   3
S000015 1   471895  0   1   0   1   1   3   4
S000022 1   704591  0   0   1   0   0   1   5
S000023 1   757938  1   0   1   1   1   4   5
S000002 2   47285   0   0   1   0   1   2   6
S000005 2   145243  0   1   1   0   0   2   6
S000009 2   246104  1   0   1   1   0   3   7
S000011 2   319046  0   1   1   0   0   2   7
S000018 2   566163  1   0   0   1   1   3   9",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Asker's attempt for a single indicator column (X1x8).
# Sort by chromosome, then by position within each chromosome, so that
# consecutive rows are adjacent positions.
df <- df[order(df$chr,df$pos),] # ensure rows are in chr/pos order
# Keep only the rows where the X1x8 indicator is not 0.
dat <- df[!(df$X1x8==0),] # only rows with X1x8 == 1 take part in the calculation
# Keep columns 1-3, 6 and 10 (mi, chr, pos, X1x8, bin in the example data).
dat = dat[,c(1:3,6,10)] # column 6 (X1x8) is just the example working column
# Number of distinct bin values among the kept rows.
bin <- nlevels(as.factor(dat$bin))
# Intent: within each level of "chr", the first dat$pos should become the first
# dat$diff value, and the remaining values should come from the diff() below.
# NOTE(review): this diff() runs over all kept rows at once, so it also takes a
# difference across the chr boundary; its result is printed, not stored.
data.frame(diff(as.matrix(dat$pos))) 
# NOTE(review): no `diff` column is ever assigned to `dat` in this snippet, so
# dat$diff is NULL here and max() returns -Inf with a warning.
max(dat$diff)

我期望的结果如下所示:
#chr  1x3   1x4 1x8 2x3 2x4 
#gab    1   519488  459317  406573  286043  459317 
#bin    1   2   2   4   3   3 
#gab    2   320059  173803  100861  320059 518878 
#bin    2   2   2   2   2   2

1 个答案:

答案 0 :(得分:1)

编辑:

既然你提供了实际的预期输出(我已据此更新了你的问题),下面是更新后的代码:

# Build a summary table with, for every chromosome (`chr`) and every indicator
# column, (a) the largest gap between consecutive `pos` values among the rows
# where the indicator equals 1 and (b) the number of distinct `bin` values
# among those rows. Expects `df` to have columns `chr`, `pos`, `bin` and
# indicator columns whose names start with "X".

# One data frame per chromosome; split() names each piece after its chr value.
dat <- split(df, df$chr)
# Indicator columns. The pattern is anchored so it agrees with the "^X" prefix
# stripped from the output headers below.
vars <- grep("^X", names(df), value = TRUE)

n_chr <- length(dat)
n_var <- length(vars)

# Preallocated result: one row per (chr, indicator) pair, filled chr-major.
dat2 <- data.frame(gap = numeric(n_chr * n_var), bin = numeric(n_chr * n_var))
k <- 0
# Iterate over the actual group names rather than 1:length(dat), so chromosome
# labels that are not exactly 1..n (e.g. 1 and 3, or "X") are handled as well.
for (i in names(dat)) {
  grp <- dat[[i]]
  # diff() is only meaningful when positions are in ascending order.
  grp <- grp[order(grp$pos), ]
  for (j in vars) {
    k <- k + 1
    # which() drops NA indicator values instead of selecting NA rows.
    hit <- which(grp[[j]] == 1)
    gaps <- diff(grp$pos[hit])
    # Fewer than two matching rows means there is no gap; record NA rather
    # than the -Inf (plus warning) that max() returns for empty input.
    dat2$gap[k] <- if (length(gaps) > 0) max(gaps) else NA_real_
    dat2$bin[k] <- nlevels(as.factor(grp$bin[hit]))
  }
}

# Row labels "gab <chr>" / "bin <chr>", two per chromosome, taken from the
# same group names (and order) that were used to fill dat2.
dat3 <- data.frame(chr = paste(c("gab", "bin"), rep(names(dat), each = 2)))
# Reshape dat2: split it into per-chromosome pieces, bind them side by side
# and transpose, so each chromosome contributes a gap row followed by a bin
# row, with one column per indicator.
vals <- t(do.call(cbind, split(dat2, gl(n_chr, n_var))))
dat3 <- cbind(dat3, vals)
names(dat3) <- c("chr", gsub("^X", "", vars))
row.names(dat3) <- NULL
dat3
##    chr    1x3    1x4    1x8    2x3    2x4
##1 gab 1 519488 459317 406573 286043 459317
##2 bin 1      2      2      4      3      3
##3 gab 2 320059 173803 100861 320059 518878
##4 bin 2      2      2      2      2      2