我正在以这种格式处理基因组数据:
chr start end length abs_summit pileup X.log10.pvalue. fold_enrichment X.log10.qvalue. name
chr1 29017 29389 373 29358 31 28.59002 11.7551 24.95703 7_peak_2
chr1 569886 569978 93 569924 1334 334.59555 3.66639 329.13641 7_peak_13
chr1 713775 714591 817 714238 63 57.55214 14.98049 53.18887 7_peak_16
chr1 1009170 1009766 597 1009354 57 29.6026 6.49704 25.93788 7_peak_38
chr1 1013682 1014753 1072 1014285 45 22.68048 6.00323 19.24049 7_peak_39
chr1 1051283 1052033 751 1051691 49 34.32018 9.31181 30.51424 7_peak_43
chr1 1071957 1072489 533 1072064 36 20.45083 6.56582 17.09022 7_peak_46
chr1 1079500 1080408 909 1079994 36 21.25546 6.87813 17.8657 7_peak_47
chr1 1085553 1085793 241 1085681 32 20.59002 7.39226 17.22433 7_peak_48
chr1 1092859 1093875 1017 1092953 55 32.86424 7.69885 29.10045 7_peak_49
chr1 1098076 1098442 367 1098157 51 25.19468 6.00704 21.67023 7_peak_50
chr1 1167340 1167771 432 1167457 46 34.94157 10.2791 31.11741 7_peak_57
chr1 1310568 1311013 446 1310739 75 61.06957 12.93319 56.63967 7_peak_73
chr1 1334658 1335005 348 1334903 41 32.4828 10.54771 28.73031 7_peak_74
chr1 1368673 1368922 250 1368819 39 20.83713 6.22806 17.46213 7_peak_77
chr1 1407006 1407170 165 1407136 29 23.68931 9.70474 20.21472 7_peak_81
chr1 1446997 1447660 664 1447477 35 25.84261 9.0858 22.29687 7_peak_83
chr1 1550552 1551647 1096 1550765 42 27.55648 8.18824 23.95619 7_peak_87
chr1 1562564 1563038 475 1562809 45 27.52078 7.59892 23.92145 7_peak_88
chr1 1623807 1625030 1224 1624276 59 40.35566 9.39971 36.38159 7_peak_96
chr1 1655573 1656140 568 1655902 44 38.03923 12.27166 34.12801 7_peak_98
chr1 1677697 1678421 725 1677814 46 30.71495 8.58012 27.01606 7_peak_101
chr1 1690209 1690798 590 1690462 55 37.97549 9.38048 34.06614 7_peak_102
chr1 1850605 1851273 669 1850915 58 30.82379 6.7014 27.12157 7_peak_108
chr1 1981599 1982178 580 1981750 44 29.74246 8.62567 26.07388 7_peak_109
chr1 2121014 2121503 490 2121181 44 25.97852 7.22808 22.42829 7_peak_115
chr1 2130779 2131029 251 2130922 57 30.68925 6.78891 26.99122 7_peak_118
chr1 2158733 2159503 771 2159309 52 35.02846 8.9443 31.2017 7_peak_123
chr1 2322758 2323284 527 2323118 47 34.27391 9.75263 30.46929 7_peak_129
chr1 2343877 2344464 588 2344122 45 23.81217 6.35414 20.33326 7_peak_131
chr1 2457479 2458104 626 2457738 41 27.63569 8.43239 24.03328 7_peak_136
chr1 2507171 2507610 440 2507387 40 22.07389 6.50842 18.65457 7_peak_141
chr1 2517776 2518527 752 2517982 79 54.66531 10.1156 50.35995 7_peak_144
chr1 3104749 3105340 592 3105042 39 26.23199 8.29302 22.67383 7_peak_168
chr1 3339907 3340297 391 3340051 61 47.4887 11.4835 43.33681 7_peak_183
chr1 3541145 3541844 700 3541432 33 22.2239 7.90376 18.79962 7_peak_194
chr1 3712982 3713209 228 3713146 25 21.03679 9.46547 17.65467 7_peak_204
chr1 3773318 3774375 1058 3773903 71 64.18323 15.20667 59.69656 7_peak_206
chr1 3816748 3818236 1489 3817402 58 40.40163 9.61359 36.42624 7_peak_210
chr1 6052087 6052758 672 6052606 55 44.57815 11.90162 40.49594 7_peak_218
chr1 6086130 6086460 331 6086283 26 21.8022 9.58904 18.39271 7_peak_220
chr1 6259449 6259894 446 6259711 48 42.85861 13.27342 38.81911 7_peak_223
chr1 6453259 6454267 1009 6453833 36 25.9895 8.89626 22.43882 7_peak_236
chr1 6639866 6640271 406 6640031 44 35.75193 11.19049 31.90473 7_peak_243
chr1 6673060 6674146 1087 6673629 61 46.69005 11.1878 42.55659 7_peak_248
chr1 6844434 6845552 1119 6845378 72 58.2036 12.66662 53.8277 7_peak_252
chr1 6882651 6882812 162 6882746 21 22.99598 11.9154 19.54511 7_peak_255
chr1 7325838 7326444 607 7326032 32 24.22423 9.10923 20.73225 7_peak_258
chr1 7338199 7338451 253 7338410 23 20.28285 9.65393 16.92857 7_peak_259
chr1 7843899 7844833 935 7844068 50 38.84025 10.87309 34.90662 7_peak_266
chr1 7945594 7945913 320 7945805 37 40.12659 16.04772 36.15866 7_peak_267
chr1 8013883 8014418 536 8014328 29 24.7682 10.29467 21.25742 7_peak_269
chr1 8021299 8021991 693 8021619 78 76.90004 18.15693 72.21448 7_peak_270
chr1 8763179 8763705 527 8763447 45 41.29927 13.54395 37.30036 7_peak_297
chr1 8877609 8877845 237 8877792 24 20.69754 9.58204 17.32788 7_peak_299
chr1 9222907 9223400 494 9223017 44 30.50605 8.92885 26.81356 7_peak_310
chr1 9294465 9295131 667 9294997 34 23.79729 8.38562 20.31876 7_peak_316
chr1 9488859 9489215 357 9489096 33 35.37181 14.91643 31.53497 7_peak_323
chr1 9599244 9600007 764 9599346 38 30.08358 10.27689 26.40452 7_peak_325
重要的列是chr、start和end。对于每条染色体,我想每100kb划分一个区间(bin),根据起始位置(start)把每一行归入相应的区间,然后统计每个区间中的行数,以便比较不同样本之间的分布。
我不知道如何定义这些区间。我看到cut函数很适合做这件事,但由于我没有预先定义好的切分点,而且每条染色体的切分点都不同,我不确定它是否是合适的函数。
# Width of one bin in base pairs (100 kb).
bin_size <- 100000
# unique() works whether chr is a factor or a character vector; levels()
# returns NULL for a character column, so the loop body would never run.
for (x in unique(as.character(df$chr))) { # For each chromosome
  # Restrict to the rows of this chromosome: max(df$end) over the whole table
  # would give every chromosome the bin count of the longest one.
  chr_end <- max(df$end[df$chr == x])
  # Round up so the last, partially filled bin is counted too.
  number_groups <- ceiling(chr_end / bin_size) # Number of bins
  # How to use cut here?
}
答案 0 :(得分:0)
修订建议: 数据:
# Example data: 59 peaks on chr1 with integer start/end coordinates (bp).
# Built with data.frame() instead of a raw structure() call; chr stays a
# character column and row names are the automatic 1..59.
test <- data.frame(
  chr = rep("chr1", 59L),
  start = c(
    29017L, 569886L, 713775L, 1009170L, 1013682L, 1051283L, 1071957L,
    1079500L, 1085553L, 1092859L, 1098076L, 1167340L, 1310568L, 1334658L,
    1368673L, 1407006L, 1446997L, 1550552L, 1562564L, 1623807L, 1655573L,
    1677697L, 1690209L, 1850605L, 1981599L, 2121014L, 2130779L, 2158733L,
    2322758L, 2343877L, 2457479L, 2507171L, 2517776L, 3104749L, 3339907L,
    3541145L, 3712982L, 3773318L, 3816748L, 6052087L, 6086130L, 6259449L,
    6453259L, 6639866L, 6673060L, 6844434L, 6882651L, 7325838L, 7338199L,
    7843899L, 7945594L, 8013883L, 8021299L, 8763179L, 8877609L, 9222907L,
    9294465L, 9488859L, 9599244L
  ),
  end = c(
    29389L, 569978L, 714591L, 1009766L, 1014753L, 1052033L, 1072489L,
    1080408L, 1085793L, 1093875L, 1098442L, 1167771L, 1311013L, 1335005L,
    1368922L, 1407170L, 1447660L, 1551647L, 1563038L, 1625030L, 1656140L,
    1678421L, 1690798L, 1851273L, 1982178L, 2121503L, 2131029L, 2159503L,
    2323284L, 2344464L, 2458104L, 2507610L, 2518527L, 3105340L, 3340297L,
    3541844L, 3713209L, 3774375L, 3818236L, 6052758L, 6086460L, 6259894L,
    6454267L, 6640271L, 6674146L, 6845552L, 6882812L, 7326444L, 7338451L,
    7844833L, 7945913L, 8014418L, 8021991L, 8763705L, 8877845L, 9223400L,
    9295131L, 9489215L, 9600007L
  ),
  stringsAsFactors = FALSE
)
使用seq获取间隔向量:
# Bin breakpoints in kilobases: 0, 100, 200, ..., 250000 (one break per 100 kb).
# The upper limit was chosen for chr1; it could be generalized for all chr.
bins_kb <- seq(0, 250000, by = 100)
使用mutate + cut获取间隔:
# Express start in kilobases, then label each row with the interval of
# bins_kb that its start falls into (cut() defaults: right-closed intervals).
df_w_intervals <- test %>%
  mutate(
    start_kb = start / 1000,
    bins = cut(start_kb, bins_kb)
  )
head(df_w_intervals)
chr start end start_kb bins
1 chr1 29017 29389 29.017 (0,100]
2 chr1 569886 569978 569.886 (500,600]
3 chr1 713775 714591 713.775 (700,800]
4 chr1 1009170 1009766 1009.170 (1000,1100]
5 chr1 1013682 1014753 1013.682 (1000,1100]
6 chr1 1051283 1052033 1051.283 (1000,1100]
答案 1 :(得分:0)
您可以使用aggregate。使用thisisg提供的数据:
aggregate(end ~ chr + start%/%100000, data=test, FUN=length)
## chr start%/%1e+05 end
## 1 chr1 0 1
## 2 chr1 5 1
## 3 chr1 7 1
## 4 chr1 10 8
...
然后可以在结果中更改列名。这里的end列就是计数,因为end是公式左侧的名称。任何列都可以放在左侧,因为我们只是用length来计算元素的数量。