我正在处理间隔:
chr1:004336501-004336560 0.3437
chr1:004340069-004340128 0.3437
chr1:004350335-004350394 0.3437
chr1:004354213-004354272 0.3218
chr1:004380332-004380391 0.3218
chr1:004481060-004481119 0.3218
chr1:004488728-004488787 0.3607
...
我想得到以下内容:
chr1 004336501 004350394 0.3437
chr1 004354213 004481119 0.3218
...
如果R里没有现成的函数能做到这一点,我会感到惊讶。因为文件很大,我不想在R中使用循环。非常感谢任何建议。
谢谢!
答案 0 :(得分:5)
您也可以尝试:
library(data.table)
library(devtools)
# NOTE(review): source_gist() downloads and runs code from a GitHub gist at
# run time; presumably this gist is what defines cSplit() used below -- confirm,
# and consider using a reviewed, pinned copy instead.
source_gist(11380733)
#Updated based on @Ananda Mahto's comments
# cSplit() is asked to split column V1 on ":" or "-" (a regex character class,
# hence fixed = FALSE); the resulting pieces are referenced below as V1_1,
# V1_2 and V1_3. Within each V2 group the expression keeps the first V1_1, the
# first V1_2 and the last V1_3 (.N is the number of rows in the group).
# NOTE(review): this relies on the rows of each group being in ascending
# coordinate order, as they are in the sample data.
DT <- cSplit(df, "V1", "[:-]", fixed = FALSE)[,
list(chr = V1_1[1], First = V1_2[1], Last = V1_3[.N]), by = V2]
# Key the result by V2; setkey() also sorts the rows by that key.
setkey(DT,V2)
DT
# V2 chr First Last
#1: 0.3218 chr1 004354213 004481119
#2: 0.3437 chr1 004336501 004350394
#3: 0.3607 chr1 004488728 004488787
或者使用正则表达式(regex),先将多个分隔符统一为单个分隔符:
# gsub() first replaces every ":" in V1 with "-" so that only one delimiter
# remains; cSplit() is then asked to split V1 on "-". Within each V2 group the
# expression keeps the first V1_1, the first V1_2 and the last V1_3 (.N is the
# number of rows in the group).
DT1 <- cSplit(transform(df, V1=gsub(":", "-", V1)),
"V1", "-")[,list(Chr=V1_1[1], ColN1=V1_2[1], ColN2=V1_3[.N]), by=V2]
# Key the result by V2; setkey() also sorts the rows by that key.
setkey(DT1, V2)
DT1
# V2 Chr ColN1 ColN2
#1: 0.3218 chr1 004354213 004481119
#2: 0.3437 chr1 004336501 004350394
#3: 0.3607 chr1 004488728 004488787
# Sample input: the seven intervals from the question.
# V1 holds "chr:start-end" strings, V2 the value attached to each interval.
df <- data.frame(
  V1 = c(
    "chr1:004336501-004336560",
    "chr1:004340069-004340128",
    "chr1:004350335-004350394",
    "chr1:004354213-004354272",
    "chr1:004380332-004380391",
    "chr1:004481060-004481119",
    "chr1:004488728-004488787"
  ),
  V2 = c(0.3437, 0.3437, 0.3437, 0.3218, 0.3218, 0.3218, 0.3607),
  # Keep V1 as character on every R version (the default before R 4.0 was
  # to convert strings to factors).
  stringsAsFactors = FALSE
)
答案 1 :(得分:4)
除非你的矩阵取子集技巧非常高超,否则我看不出有什么不用循环的办法。
假设您的原始数据为 df:
# Base R approach: parse every "chr:start-end" string, group the parsed pieces
# by the value in V2, and collapse each group to one row.
# split() orders the groups by the sorted distinct values of df$V2.
s <- split(strsplit(df$V1, "[:-]"), df$V2)
# For each group: chromosome and start of the first row, end of the last row.
rows <- lapply(s, function(x) {
  x <- do.call(rbind, x)
  c(x[1, 1], x[1, 2], x[nrow(x), 3])
})
# do.call(rbind, ...) always yields a matrix. Reduce(rbind, ...) returns the
# bare vector when there is only one group, which made cbind() build a 3 x 2
# matrix instead of a single 1 x 4 row, and it also regrows the matrix once
# per group.
cb <- cbind(do.call(rbind, rows), names(s))
data.frame(unname(cb))
# X1 X2 X3 X4
# 1 chr1 004354213 004481119 0.3218
# 2 chr1 004336501 004350394 0.3437
# 3 chr1 004488728 004488787 0.3607
答案 2 :(得分:3)
这是一个基于dplyr和tidyr的思路:
library(dplyr)
library(tidyr)
> dat
V1 V2
1 004336501-004336560 0.3437
2 004340069-004340128 0.3437
3 004350335-004350394 0.3437
4 004354213-004354272 0.3218
5 004380332-004380391 0.3218
6 004481060-004481119 0.3218
7 004488728-004488787 0.3607
# Split V1 into two columns (V1a, V1b), take min() of V1a and max() of V1b
# within each V2 group, then join the two back together with "-".
# NOTE(review): separate() is called without convert = TRUE, so V1a/V1b
# presumably stay character and min()/max() compare them as strings; that
# matches numeric order only while all coordinates are zero-padded to the
# same width -- confirm for real data.
dat %>%
  separate(V1, c("V1a", "V1b")) %>%
  group_by(V2) %>%
  summarise(V1a = min(V1a), V1b = max(V1b)) %>%
  unite(V1, V1a, V1b, sep = "-")
V2 V1
1 0.3218 004354213-004481119
2 0.3437 004336501-004350394
3 0.3607 004488728-004488787
答案 3 :(得分:1)
在意识到nrussel指出的问题之后,这是我更新后的答案。我相信下面的做法现在应该可以正确工作,尽管其中用到的函数略显繁琐。
> dat
V1 V2
1 chr1:004336501-004336560 0.3437
2 chr1:004340069-004340128 0.3437
3 chr1:004350335-004350394 0.3437
4 chr1:004354213-004354272 0.3218
5 chr1:004380332-004380391 0.3218
6 chr1:004481060-004481119 0.3218
7 chr1:004488728-004488787 0.3607
require(stringr)
# Partition dat into a list of data frames, one per distinct value found in
# its second column.
dat_split <- split(dat, f = as.factor(dat[[2]]))
# Collapse one group of intervals into a single spanning interval.
#
# dat_split: data frame for one group. Column 1 holds "chr:start-end" strings,
#   column 2 holds the value shared by the rows of the group.
# Returns a character vector: the chromosome of the first row, the smallest
#   start, the largest end, and the unique value(s) of column 2.
# Stops with an error if any interval does not split into exactly three parts.
find_matches <- function(dat_split) {
  # Split on ":" and "-" only. The earlier character class "[//:|//-]" also
  # matched "/" and "|". Base strsplit() is enough here, so the function does
  # not depend on stringr being attached.
  parts <- strsplit(as.character(dat_split[[1]]), "[:-]")
  if (any(lengths(parts) != 3L)) {
    stop("expected intervals of the form 'chr:start-end'", call. = FALSE)
  }
  x <- do.call(rbind, parts)
  # Compare coordinates as numbers: min()/max() on the strings are only right
  # while every coordinate is zero-padded to the same width ("900" sorts after
  # "1000" as text). Indexing back into x keeps the original formatting, such
  # as leading zeros, in the result.
  start <- x[which.min(as.numeric(x[, 2])), 2]
  end <- x[which.max(as.numeric(x[, 3])), 3]
  c(x[1, 1], start, end, unique(dat_split[[2]]))
}
# Run find_matches() on every group and stack the returned vectors into one
# character matrix, one row per group.
per_group <- lapply(dat_split, find_matches)
out <- do.call(rbind, per_group)
# rbind() labels the rows with the group names; drop them.
rownames(out) <- NULL
> out
[,1] [,2] [,3] [,4]
[1,] "chr1" "004354213" "004481119" "0.3218"
[2,] "chr1" "004336501" "004350394" "0.3437"
[3,] "chr1" "004488728" "004488787" "0.3607"