是否有一个工具可以在R中进行以下合并?

时间:2014-09-16 15:53:17

标签: r merge

我正在处理间隔:

chr1:004336501-004336560   0.3437   
chr1:004340069-004340128   0.3437   
chr1:004350335-004350394   0.3437   
chr1:004354213-004354272   0.3218   
chr1:004380332-004380391   0.3218   
chr1:004481060-004481119   0.3218   
chr1:004488728-004488787   0.3607   
...

我想得到以下内容:

chr1  004336501  004350394  0.3437
chr1  004354213  004481119  0.3218
...

如果 R 中没有现成的函数可以做到这一点,我会感到很惊讶。由于文件很大,我不想在 R 中使用循环。如果有任何可供我进一步展开的建议,我将不胜感激。

谢谢!

4 个答案:

答案 0 :(得分:5)

您也可以尝试:

 # data.table provides the DT[i, j, by] syntax and setkey() used below.
 # devtools provides source_gist(). cSplit() is not defined in this snippet;
 # presumably it is defined by the sourced gist -- TODO confirm.
 library(data.table)
 library(devtools)
 source_gist(11380733)

 # Updated based on @Ananda Mahto's comments.
 # Split V1 on ":" or "-" (a regex, hence fixed = FALSE); the pieces are read
 # back below as columns V1_1, V1_2 and V1_3. For each V2 value keep the first
 # row's V1_1 and V1_2 and the last row's V1_3 (.N is the group's row count).
 # NOTE(review): by = V2 pools every row that shares a V2 value, whether or not
 # the rows are adjacent -- confirm that this is the intended merge.
 DT <- cSplit(df, "V1", "[:-]", fixed = FALSE)[,
          list(chr = V1_1[1], First = V1_2[1], Last = V1_3[.N]), by = V2]
 # Key the result on V2, which also sorts the rows by V2.
 setkey(DT,V2)

 # Print the merged table.
 DT
 #      V2  chr     First     Last
 #1: 0.3218 chr1 004354213 004481119
 #2: 0.3437 chr1 004336501 004350394
 #3: 0.3607 chr1 004488728 004488787

或者使用正则表达式先把多个分隔符统一替换为单个分隔符。

 # Rewrite every ":" in V1 as "-" first, so that the single separator "-" is
 # enough for cSplit(). Then, for each V2 value, keep the first row's V1_1 and
 # V1_2 and the last row's V1_3 (.N is the group's row count).
 # cSplit() is not defined in this snippet; presumably it has already been
 # loaded elsewhere -- TODO confirm.
 DT1 <- cSplit(transform(df, V1=gsub(":", "-", V1)),
             "V1", "-")[,list(Chr=V1_1[1], ColN1=V1_2[1], ColN2=V1_3[.N]), by=V2]
 # Key the result on V2, which also sorts the rows by V2.
 setkey(DT1, V2)
  # Print the merged table.
  DT1
  #      V2  Chr     ColN1     ColN2
  #1: 0.3218 chr1 004354213 004481119
  #2: 0.3437 chr1 004336501 004350394
  #3: 0.3607 chr1 004488728 004488787

数据

 # Sample input: seven "chr:start-end" labels (V1) and their numeric values (V2).
 # stringsAsFactors = FALSE keeps V1 as a character column on every R version.
 df <- data.frame(
   V1 = c(
     "chr1:004336501-004336560",
     "chr1:004340069-004340128",
     "chr1:004350335-004350394",
     "chr1:004354213-004354272",
     "chr1:004380332-004380391",
     "chr1:004481060-004481119",
     "chr1:004488728-004488787"
   ),
   V2 = c(0.3437, 0.3437, 0.3437, 0.3218, 0.3218, 0.3218, 0.3607),
   stringsAsFactors = FALSE
 )

答案 1 :(得分:4)

除非你的矩阵取子集技巧特别高超,否则我看不出有什么不用循环的办法。

如果您的原始数据为 df,可以按以下方式操作:
# Break every "chr:start-end" label into its three pieces, then bucket the
# pieces by the value in V2. as.character() keeps strsplit() working when V1
# was read in as a factor.
s <- split(strsplit(as.character(df$V1), "[:-]"), df$V2)
# One row per V2 value: chromosome and start of the group's first interval,
# end of its last interval. do.call(rbind, ...) yields a matrix even when
# there is only a single group, whereas Reduce(rbind, ...) would hand back a
# bare vector and make the cbind() below build the wrong shape.
merged <- do.call(rbind, lapply(s, function(x) {
  x <- do.call(rbind, x)
  c(x[1, 1], x[1, 2], x[nrow(x), 3])
}))
# Append the V2 value of each group as a fourth column.
cb <- cbind(merged, names(s))
# Drop the dimnames so the data frame gets default column names.
data.frame(unname(cb))
#     X1        X2        X3     X4
# 1 chr1 004354213 004481119 0.3218
# 2 chr1 004336501 004350394 0.3437
# 3 chr1 004488728 004488787 0.3607

答案 2 :(得分:3)

这是一个基于 dplyr 和 tidyr 的思路:
library(dplyr)
library(tidyr)
> dat
                       V1     V2
    1 004336501-004336560 0.3437
    2 004340069-004340128 0.3437
    3 004350335-004350394 0.3437
    4 004354213-004354272 0.3218
    5 004380332-004380391 0.3218
    6 004481060-004481119 0.3218
    7 004488728-004488787 0.3607

# Split each "start-end" label into two columns, take the min of the starts
# and the max of the ends within every V2 value, then glue the two back into
# a single "start-end" label.
dat %>%
  separate(V1, c("V1a", "V1b")) %>%
  group_by(V2) %>%
  summarise(V1a = min(V1a), V1b = max(V1b)) %>%
  unite(V1, V1a, V1b, sep = "-")

      V2                  V1
1 0.3218 004354213-004481119
2 0.3437 004336501-004350394
3 0.3607 004488728-004488787

答案 3 :(得分:1)

在意识到 nrussel 指出的问题之后,我更新了答案。我相信下面的代码现在应该能够正确工作,尽管其中用到的函数有些凌乱。

> dat
                        V1     V2
1 chr1:004336501-004336560 0.3437
2 chr1:004340069-004340128 0.3437
3 chr1:004350335-004350394 0.3437
4 chr1:004354213-004354272 0.3218
5 chr1:004380332-004380391 0.3218
6 chr1:004481060-004481119 0.3218
7 chr1:004488728-004488787 0.3607

# library() stops with an error right here when stringr is not installed;
# require() would only warn and return FALSE, deferring the failure to the
# first stringr call.
library(stringr)
# Split data based upon V2: one data frame per distinct value in column 2.
dat_split <- split(dat, f = as.factor(dat[, 2]))

# Collapse one V2 group into a single merged interval.
#
# dat_split: the rows of one group; column 1 holds "chr:start-end" labels and
#            column 2 holds the group's V2 value.
# Returns a character vector: the chromosome of the first row, the lowest
# start, the highest end, and the distinct V2 value(s) of the group.
find_matches <- function(dat_split){
  # Split on ":" or "-" only. The alternation form is used instead of a
  # bracket class starting with "[:", which the regex engine behind stringr
  # may read as the start of a POSIX class -- TODO confirm.
  x <- str_split_fixed(dat_split[, 1], ":|-", 3)
  starts <- x[, 2]
  ends <- x[, 3]
  # Rank the coordinates numerically but return the original strings: zero
  # padding is preserved, and coordinates of unequal width (e.g. "900" vs
  # "1000") are no longer ordered lexicographically.
  c(
    x[1, 1],
    starts[which.min(as.numeric(starts))],
    ends[which.max(as.numeric(ends))],
    unique(dat_split[, 2])
  )
}

# Run find_matches() on every V2 group, then stack the per-group vectors into
# one character matrix. rbind() labels the rows with the group names, so
# those row names are cleared afterwards.
per_group <- lapply(dat_split, find_matches)
out <- do.call(rbind, per_group)
rownames(out) <- NULL

> out
     [,1]   [,2]        [,3]        [,4]    
[1,] "chr1" "004354213" "004481119" "0.3218"
[2,] "chr1" "004336501" "004350394" "0.3437"
[3,] "chr1" "004488728" "004488787" "0.3607"