我有一个数据框,其中的 Start 和 End 列是位点坐标,用于界定区间;
sampleID 列表示不同的个体。
示例输入将是:
sampleID Start End
S1 10 20
S2 15 20
S3 5 15
S4 15 25
示例输出是:
Start End sampleIDs count
15 20 S1,S2,S4 3
10 15 S1,S3 2
5 10 S3 1
20 25 S4 1
输出显示各 sampleID 之间的最小重叠区间,并按包含该区间的 sampleID 的数量(count)排序。请注意,这些区间是交集而不是并集,即最小重叠。
在R中有没有一种有效的方法呢?
我在下面放了一个更大的输入数据框;它同样并不完整,但足以说明对可扩展性的需求。
structure(list(sampleID = c("S.A1.A0SB", "S.A1.A0SD", "S.A1.A0SE",
"S.A1.A0SF", "S.A1.A0SH", "S.A1.A0SJ", "S.A1.A0SK", "S.A1.A0SO",
"S.A1.A0SO", "S.A1.A0SO", "S.A1.A0SO", "S.A1.A0SP", "S.A1.A0SP",
"S.A1.A0SP", "S.A1.A0SP", "S.A1.A0SP", "S.A1.A0SP", "S.A1.A0SP",
"S.A1.A0SP", "S.A2.A04N", "S.A2.A04Q", "S.A2.A04U", "S.A2.A04U",
"S.A2.A04U", "S.A2.A04U", "S.A2.A04U", "S.A2.A04U", "S.A2.A04U",
"S.A2.A04U", "S.A2.A04U", "S.A2.A04U", "S.A2.A04V", "S.A2.A04W",
"S.A2.A04Y", "S.A2.A04Y", "S.A2.A04Y", "S.A2.A0CK", "S.A2.A0CL",
"S.A2.A0CL", "S.A2.A0CL", "S.A2.A0CL", "S.A2.A0CL", "S.A2.A0CL",
"S.A2.A0CO", "S.A2.A0CP", "S.A2.A0CU", "S.A2.A0CW", "S.A2.A0CZ",
"S.A2.A0CZ", "S.A2.A0D1", "S.A2.A0D2", "S.A2.A0D2", "S.A2.A0D2",
"S.A2.A0D2", "S.A2.A0D2", "S.A2.A0D3", "S.A2.A0D4", "S.A2.A0EM",
"S.A2.A0EO", "S.A2.A0EO", "S.A2.A0ET", "S.A2.A0EX", "S.A2.A0EX",
"S.A2.A0EX", "S.A2.A0SW", "S.A2.A0SW", "S.A2.A0SX", "S.A2.A0SX",
"S.A2.A0SX", "S.A2.A0SX", "S.A2.A0SX", "S.A2.A0SY", "S.A2.A0T0",
"S.A2.A0T0", "S.A2.A0T0", "S.A2.A0T0", "S.A2.A0T2", "S.A2.A0T3",
"S.A2.A0T3", "S.A2.A0T3", "S.A2.A0T5", "S.A2.A0T5", "S.A2.A0T5",
"S.A2.A0T5", "S.A2.A0T7", "S.A2.A0T7", "S.A2.A0YC", "S.A2.A0YC",
"S.A2.A0YC", "S.A2.A0YD", "S.A2.A0YD", "S.A2.A0YE", "S.A2.A0YE",
"S.A2.A0YF", "S.A2.A0YF", "S.A2.A0YG", "S.A2.A0YH", "S.A2.A0YH",
"S.A2.A0YI", "S.A2.A0YK"), Start = c(61949885L, 14267730L, 155824310L,
61934790L, 45924211L, 102529319L, 162513149L, 51815687L, 80466481L,
116281984L, 123138522L, 60345L, 8866808L, 11707881L, 28154465L,
38352136L, 50457227L, 74874773L, 106301415L, 146302036L, 170198898L,
60345L, 5188432L, 12147403L, 16475012L, 34606495L, 42058455L,
78861145L, 89338676L, 190742772L, 190953557L, 61960972L, 146256066L,
12006772L, 102364297L, 117352205L, 3970428L, 60345L, 55855976L,
140288130L, 143825638L, 152172182L, 193601448L, 3959916L, 141061438L,
182730173L, 85483972L, 48649406L, 117438564L, 171199568L, 60345L,
8933583L, 41810481L, 56447761L, 60041687L, 21999782L, 165040863L,
160272760L, 61960972L, 98726948L, 194106553L, 38102115L, 45117006L,
69922067L, 27068426L, 61964568L, 60345L, 34165785L, 79359090L,
137778574L, 196897088L, 4588788L, 48924900L, 182637122L, 185982713L,
197683775L, 60345L, 60345L, 36157091L, 75901451L, 4588300L, 8896114L,
61960972L, 113218206L, 151910714L, 161570016L, 45731451L, 97773946L,
126685000L, 76434706L, 146256066L, 97773946L, 129775858L, 146257307L,
151910714L, 16263872L, 36154008L, 122011351L, 45734818L, 104890278L
), End = c(61968443L, 14500744L, 155858773L, 61963289L, 70473655L,
102854965L, 162623881L, 53767689L, 80473220L, 123055274L, 126487533L,
8820401L, 11571603L, 28148832L, 38347851L, 48581848L, 72694005L,
89508500L, 106347238L, 146309472L, 171216876L, 3079965L, 9796508L,
14448975L, 25426775L, 40270066L, 78613247L, 86491493L, 89426646L,
190858147L, 190980956L, 61963289L, 146367621L, 12010549L, 102616251L,
119730589L, 3972228L, 55763091L, 117372597L, 140853186L, 149511080L,
157470885L, 197896118L, 3976417L, 141181684L, 182829544L, 85722302L,
52140767L, 117976887L, 171348970L, 8917703L, 41700718L, 56329310L,
59429073L, 151645022L, 22000946L, 165095753L, 160317817L, 61963289L,
98734044L, 194143040L, 39488541L, 45133774L, 73275553L, 61960484L,
62624801L, 33315307L, 75276213L, 102197415L, 137787173L, 197896118L,
4607133L, 52203291L, 185970173L, 190759854L, 197896118L, 93519478L,
36153983L, 75394435L, 87892515L, 4607133L, 9089254L, 61963289L,
113257213L, 152987815L, 162447684L, 45763977L, 97787341L, 128128083L,
76464137L, 146277132L, 97787849L, 129806236L, 146260941L, 151978186L,
16389026L, 36156863L, 122022417L, 45765778L, 104987434L), length = c(18558L,
233014L, 34463L, 28499L, 24549444L, 325646L, 110732L, 1952002L,
6739L, 6773290L, 3349011L, 8760056L, 2704795L, 16440951L, 10193386L,
10229712L, 22236778L, 14633727L, 45823L, 7436L, 1017978L, 3019620L,
4608076L, 2301572L, 8951763L, 5663571L, 36554792L, 7630348L,
87970L, 115375L, 27399L, 2317L, 111555L, 3777L, 251954L, 2378384L,
1800L, 55702746L, 61516621L, 565056L, 5685442L, 5298703L, 4294670L,
16501L, 120246L, 99371L, 238330L, 3491361L, 538323L, 149402L,
8857358L, 32767135L, 14518829L, 2981312L, 91603335L, 1164L, 54890L,
45057L, 2317L, 7096L, 36487L, 1386426L, 16768L, 3353486L, 34892058L,
660233L, 33254962L, 41110428L, 22838325L, 8599L, 999030L, 18345L,
3278391L, 3333051L, 4777141L, 212343L, 93459133L, 36093638L,
39237344L, 11991064L, 18833L, 193140L, 2317L, 39007L, 1077101L,
877668L, 32526L, 13395L, 1443083L, 29431L, 21066L, 13903L, 30378L,
3634L, 67472L, 125154L, 2855L, 11066L, 30960L, 97156L)), .Names = c("sampleID",
"Start", "End", "length"), row.names = c(18L, 130L, 252L, 420L,
707L, 921L, 1310L, 2173L, 2181L, 2191L, 2193L, 2585L, 2587L,
2592L, 2594L, 2596L, 2598L, 2600L, 2602L, 2762L, 3217L, 3896L,
3898L, 3901L, 3903L, 3905L, 3911L, 3913L, 3915L, 3940L, 3942L,
4422L, 4647L, 5131L, 5135L, 5137L, 5336L, 5479L, 5481L, 5488L,
5492L, 5498L, 5500L, 6080L, 6178L, 6749L, 7529L, 8218L, 8224L,
8924L, 9198L, 9200L, 9202L, 9204L, 9206L, 9487L, 9652L, 9825L,
10010L, 10012L, 10839L, 11487L, 11489L, 11491L, 12297L, 12299L,
12445L, 12447L, 12450L, 12452L, 12469L, 12650L, 12786L, 12794L,
12796L, 12798L, 13317L, 13510L, 13512L, 13514L, 13964L, 13968L,
13976L, 13978L, 14370L, 14372L, 14573L, 14577L, 14583L, 14956L,
14958L, 15084L, 15086L, 15296L, 15298L, 15495L, 15753L, 15755L,
15934L, 16343L), class = "data.frame")
答案 0 :(得分:1)
这是一种使用交叉连接的方法。这是非常低效的。
library(dplyr)
# Split the coordinate axis into elementary intervals ("envelopes") bounded by
# consecutive distinct Start/End values, then record which rows of `data`
# cover each envelope. `data` must provide sampleID, Start and End columns.
# NOTE: the cross join below materialises nrow(data) * nrow(envelopes) rows
# before filtering, so this approach does not scale to large inputs.

# every distinct breakpoint (any Start or End value), in ascending order
times <-
  data %>%
  select(time = Start) %>%
  bind_rows(data %>% select(time = End)) %>%
  distinct() %>%
  arrange(time)

# create a to-from table: row i runs from breakpoint i to breakpoint i + 1
envelopes <-
  times %>%
  rename(start_time.envelope = time) %>%
  slice(-n()) %>%
  bind_cols(
    times %>%
      rename(end_time.envelope = time) %>%
      slice(-1)
  ) %>%
  # row_number() rather than 1:n(): 1:n() evaluates to c(1, 0) when there are
  # no envelopes, which makes mutate() fail on empty input
  mutate(envelope_ID = row_number())

# cross join: merge() with zero `by` columns returns the Cartesian product;
# then keep only pairs whose intersection has positive length
join_table <-
  data %>%
  merge(envelopes, by = NULL) %>%
  filter(
    pmax(Start, start_time.envelope) <
      pmin(End, end_time.envelope)
  )

# summarize: one row per covered envelope with the ";"-separated sample IDs
# and n, the number of matching rows of `data` (a sample with several rows
# covering the same envelope is counted once per row)
summary <-
  join_table %>%
  group_by(envelope_ID) %>%
  summarize(
    sampleIDs = sampleID %>% paste(collapse = ";"),
    n = n()
  ) %>%
  # explicit key: avoids the "Joining with `by = ...`" guess and message
  left_join(envelopes, by = "envelope_ID")
答案 1 :(得分:1)
我对 data.table 不太熟悉,但您可以尝试在此基础上继续完善:
library(data.table)
# Toy input from the question: one interval (Start, End) per sample
data <- fread("sampleID Start End
S1 10 20
S2 15 20
S3 5 15
S4 15 25")

# foverlaps() needs its second table keyed on the interval columns
setkey(data, Start, End)

# Elementary intervals between consecutive distinct breakpoints
# (breakpoints are computed once and reused for both columns)
breakpoints <- sort(unique(c(data$Start, data$End)))
startsEnds <- data.table(
  Start = head(breakpoints, -1),
  End = tail(breakpoints, -1)
)

# type = "within": keep samples whose interval fully contains the elementary
# interval. nomatch = NULL drops elementary intervals covered by no sample;
# with the default (nomatch = NA) such a gap would be reported as
# sampleIDs "NA" with count 1.
overlaps <- foverlaps(startsEnds, data, type = "within", nomatch = NULL)

# One row per elementary interval: comma-separated sample IDs (a plain
# character column) and their count, most-shared intervals first
(dt <- overlaps[,
  .(sampleIDs = paste(sampleID, collapse = ","), count = .N),
  by = .(Start = i.Start, End = i.End)
][order(-count)])
#    Start End sampleIDs count
# 1:    15  20  S1,S2,S4     3
# 2:    10  15     S3,S1     2
# 3:     5  10        S3     1
# 4:    20  25        S4     1
您可能想要调整startsEnds
表的构造方式。