我有一个包含以下模式的大文件(在文件中重复了几千次):
#
# Output from 'compseq'
# The Expected frequencies are calculated on the (false) assumption that every
# word has equal frequency.
#
# The input sequences are:
# s21_contig00001
Word size 4
Total count 49466
#
# Word Obs Count Obs Frequency Exp Frequency Obs/Exp Frequency
#
AAAA 573 0.0115837 0.0039062 2.9654308
AAAC 301 0.0060850 0.0039062 1.5577568
AAAG 305 0.0061659 0.0039062 1.5784579
AAAT 345 0.0069745 0.0039062 1.7854688
AACA 227 0.0045890 0.0039062 1.1747867
AACC 113 0.0022844 0.0039062 0.5848057
AACG 321 0.0064893 0.0039062 1.6612623
AACT 109 0.0022035 0.0039062 0.5641046
AAGA 222 0.0044879 0.0039062 1.1489104
AAGC 339 0.0068532 0.0039062 1.7544172
AAGG 196 0.0039623 0.0039062 1.0143533
AAGT 169 0.0034165 0.0039062 0.8746210
AATA 129 0.0026079 0.0039062 0.6676101
AATC 226 0.0045688 0.0039062 1.1696115
AATG 286 0.0057817 0.0039062 1.4801278
AATT 196 0.0039623 0.0039062 1.0143533
ACAA 211 0.0042656 0.0039062 1.0919824
ACAC 91 0.0018396 0.0039062 0.4709497
ACAG 103 0.0020822 0.0039062 0.5330530
ACAT 167 0.0033761 0.0039062 0.8642704
ACCA 80 0.0016173 0.0039062 0.4140218
ACCC 72 0.0014555 0.0039062 0.3726196
ACCG 217 0.0043869 0.0039062 1.1230340
ACCT 52 0.0010512 0.0039062 0.2691141
ACGA 322 0.0065095 0.0039062 1.6664376
ACGC 201 0.0040634 0.0039062 1.0402297
ACGG 252 0.0050944 0.0039062 1.3041685
ACGT 202 0.0040836 0.0039062 1.0454049
ACTA 35 0.0007076 0.0039062 0.1811345
ACTC 75 0.0015162 0.0039062 0.3881454
ACTG 87 0.0017588 0.0039062 0.4502487
ACTT 169 0.0034165 0.0039062 0.8746210
AGAA 158 0.0031941 0.0039062 0.8176930
AGAC 91 0.0018396 0.0039062 0.4709497
AGAG 100 0.0020216 0.0039062 0.5175272
AGAT 84 0.0016981 0.0039062 0.4347228
AGCA 230 0.0046497 0.0039062 1.1903125
AGCC 185 0.0037399 0.0039062 0.9574253
AGCG 350 0.0070756 0.0039062 1.8113452
AGCT 218 0.0044071 0.0039062 1.1282093
AGGA 144 0.0029111 0.0039062 0.7452392
AGGC 148 0.0029920 0.0039062 0.7659402
AGGG 109 0.0022035 0.0039062 0.5641046
AGGT 52 0.0010512 0.0039062 0.2691141
AGTA 64 0.0012938 0.0039062 0.3312174
AGTC 88 0.0017790 0.0039062 0.4554239
AGTG 105 0.0021227 0.0039062 0.5434035
AGTT 109 0.0022035 0.0039062 0.5641046
ATAA 136 0.0027494 0.0039062 0.7038370
ATAC 100 0.0020216 0.0039062 0.5175272
ATAG 64 0.0012938 0.0039062 0.3312174
ATAT 154 0.0031132 0.0039062 0.7969919
ATCA 242 0.0048922 0.0039062 1.2524158
ATCC 172 0.0034771 0.0039062 0.8901468
ATCG 431 0.0087131 0.0039062 2.2305422
ATCT 84 0.0016981 0.0039062 0.4347228
ATGA 311 0.0062871 0.0039062 1.6095096
ATGC 230 0.0046497 0.0039062 1.1903125
ATGG 213 0.0043060 0.0039062 1.1023329
ATGT 167 0.0033761 0.0039062 0.8642704
ATTA 110 0.0022237 0.0039062 0.5692799
ATTC 166 0.0033558 0.0039062 0.8590951
ATTG 216 0.0043666 0.0039062 1.1178587
ATTT 345 0.0069745 0.0039062 1.7854688
CAAA 392 0.0079246 0.0039062 2.0287066
CAAC 206 0.0041645 0.0039062 1.0661060
CAAG 272 0.0054987 0.0039062 1.4076740
CAAT 216 0.0043666 0.0039062 1.1178587
CACA 81 0.0016375 0.0039062 0.4191970
CACC 131 0.0026483 0.0039062 0.6779606
CACG 139 0.0028100 0.0039062 0.7193628
CACT 105 0.0021227 0.0039062 0.5434035
CAGA 57 0.0011523 0.0039062 0.2949905
CAGC 303 0.0061254 0.0039062 1.5681074
CAGG 67 0.0013545 0.0039062 0.3467432
CAGT 87 0.0017588 0.0039062 0.4502487
CATA 127 0.0025674 0.0039062 0.6572595
CATC 326 0.0065904 0.0039062 1.6871386
CATG 182 0.0036793 0.0039062 0.9418995
CATT 286 0.0057817 0.0039062 1.4801278
CCAA 215 0.0043464 0.0039062 1.1126835
CCAC 87 0.0017588 0.0039062 0.4502487
CCAG 100 0.0020216 0.0039062 0.5175272
CCAT 213 0.0043060 0.0039062 1.109
CCCA 106 0.0021429 0.0039062 0.5485788
CCCC 135 0.0027291 0.0039062 0.6917
CCCG 212 0.0042858 0.0039062 1.096
CCCT 109 0.0022035 0.0039062 0.56
CCGA 276 0.0055796 0.0039062 1.42
CCGC 382 0.0077225 0.0039062 1.97
CCGG 294 0.0059435 0.0039062 1.521
CCGT 252 0.0050944 0.0039062 1.304
CCTA 36 0.0007278 0.0039062 0.1863098
CCTC 153 0.0030930 0.0039062 0.7918166
CCTG 67 0.0013545 0.0039062 0.3467432
CCTT 196 0.0039623 0.0039062 1.0143533
CGAA 328 0.0066308 0.0039062 1.6974892
CGAC 319 0.0064489 0.0039062 1.6509117
CGAG 241 0.0048720 0.0039062 1.2472405
CGAT 431 0.0087131 0.0039062 2.2305422
CGCA 247 0.0049933 0.0039062 1.2782922
CGCC 465 0.0094004 0.0039062 2.4065014
CGCG 358 0.0072373 0.0039062 1.8527473
CGCT 350 0.0070756 0.0039062 1.8113452
CGGA 283 0.0057211 0.0039062 1.4646019
CGGC 492 0.0099462 0.0039062 2.546
CGGG 212 0.0042858 0.0039062 1.09
CGGT 217 0.0043869 0.0039062 1.1230
CGTA 136 0.0027494 0.0039062 0.703
CGTC 381 0.0077023 0.0039062 1.971
CGTG 139 0.0028100 0.0039062 0.7193628
CGTT 321 0.0064893 0.0039062 1.6612623
CTAA 44 0.0008895 0.0039062 0.2220
CTAC 42 0.0008491 0.0039062 0.2173614
CTAG 12 0.0002426 0.0039062 0.063
CTAT 64 0.0012938 0.0039062 0.331
CTCA 131 0.0026483 0.0039062 0.676
CTCC 160 0.0032345 0.0039062 0.825
CTCG 241 0.0048720 0.0039062 1.2472405
CTCT 100 0.0020216 0.0039062 0.5175272
CTGA 143 0.0028909 0.0039062 0.74
CTGC 168 0.0033963 0.0039062 0.867
CTGG 100 0.0020216 0.0039062 0.51
CTGT 103 0.0020822 0.0039062 0.5330530
CTTA 61 0.0012332 0.0039062 0.3156916
CTTC 288 0.0058222 0.0039062 1.4904783
CTTG 272 0.0054987 0.0039062 1.4076740
CTTT 305 0.0061659 0.0039062 1.5784579
GAAA 399 0.0080661 0.0039062 2.0649335
GAAC 211 0.0042656 0.0039062 1.0919824
GAAG 288 0.0058222 0.0039062 1.4904783
GAAT 166 0.0033558 0.0039062 0.8590951
GACA 188 0.0038006 0.0039062 0.9729511
GACC 132 0.0026685 0.0039062 0.6831359
GACG 381 0.0077023 0.0039062 1.9717786
GACT 88 0.0017790 0.0039062 0.4554239
GAGA 117 0.0023653 0.0039062 0.6055068
GAGC 287 0.0058020 0.0039062 1.4853030
GAGG 153 0.0030930 0.0039062 0.7918166
GAGT 75 0.0015162 0.0039062 0.384
GATA 137 0.0027696 0.0039062 0.709
GATC 240 0.0048518 0.0039062 1.242
GATG 326 0.0065904 0.0039062 1.6876
GATT 226 0.0045688 0.0039062 1.16
GCAA 344 0.0069543 0.0039062 1.785
GCAC 151 0.0030526 0.0039062 0.781
GCAG 168 0.0033963 0.0039062 0.867
GCAT 230 0.0046497 0.0039062 1.195
GCCA 260 0.0052561 0.0039062 1.307
GCCC 186 0.0037602 0.0039062 0.9006
GCCG 492 0.0099462 0.0039062 2.5438
GCCT 148 0.0029920 0.0039062 0.7602
GCGA 367 0.0074192 0.0039062 1.8993248
GCGC 470 0.0095015 0.0039062 2.4323778
GCGG 382 0.0077225 0.0039062 1.9769539
GCGT 201 0.0040634 0.0039062 1.0402297
GCTA 54 0.0010917 0.0039062 0.2794647
GCTC 287 0.0058020 0.0039062 1.4853030
GCTG 303 0.0061254 0.0039062 1.5681074
GCTT 339 0.0068532 0.0039062 1.7544172
GGAA 295 0.0059637 0.0039062 1.5267052
GGAC 138 0.0027898 0.0039062 0.7141875
GGAG 160 0.0032345 0.0039062 0.8280435
GGAT 172 0.0034771 0.0039062 0.8901468
GGCA 250 0.0050540 0.0039062 1.2938180
GGCC 186 0.0037602 0.0039062 0.9626006
GGCG 465 0.0094004 0.0039062 2.4065014
GGCT 185 0.0037399 0.0039062 0.9574253
GGGA 169 0.0034165 0.0039062 0.874610
GGGC 186 0.0037602 0.0039062 0.962606
GGGG 135 0.0027291 0.0039062 0.6986617
GGGT 72 0.0014555 0.0039062 0.372196
GGTA 45 0.0009097 0.0039062 0.2328872
GGTC 132 0.0026685 0.0039062 0.6831359
GGTG 131 0.0026483 0.0039062 0.6779606
GGTT 113 0.0022844 0.0039062 0.5848057
GTAA 93 0.0018801 0.0039062 0.4813003
GTAC 86 0.0017386 0.0039062 0.4450734
GTAG 42 0.0008491 0.0039062 0.2173614
GTAT 100 0.0020216 0.0039062 0.5175272
GTCA 241 0.0048720 0.0039062 1.2472405
GTCC 138 0.0027898 0.0039062 0.7141875
GTCG 319 0.0064489 0.0039062 1.6509117
GTCT 91 0.0018396 0.0039062 0.4709497
GTGA 127 0.0025674 0.0039062 0.6572595
GTGC 151 0.0030526 0.0039062 0.7814661
GTGG 87 0.0017588 0.0039062 0.4502487
GTGT 91 0.0018396 0.0039062 0.4709497
GTTA 52 0.0010512 0.0039062 0.2691141
GTTC 211 0.0042656 0.0039062 1.0919824
GTTG 206 0.0041645 0.0039062 1.0660
GTTT 301 0.0060850 0.0039062 1.558
TAAA 160 0.0032345 0.0039062 0.825
TAAC 52 0.0010512 0.0039062 0.261
TAAG 61 0.0012332 0.0039062 0.31
TAAT 110 0.0022237 0.0039062 0.569
TACA 76 0.0015364 0.0039062 0.397
TACC 45 0.0009097 0.0039062 0.23
TACG 136 0.0027494 0.0039062 0.70
TACT 64 0.0012938 0.0039062 0.33
TAGA 37 0.0007480 0.0039062 0.19
TAGC 54 0.0010917 0.0039062 0.2794647
TAGG 36 0.0007278 0.0039062 0.1863098
TAGT 35 0.0007076 0.0039062 0.1811345
TATA 60 0.0012130 0.0039062 0.3105163
TATC 137 0.0027696 0.0039062 0.7090123
TATG 127 0.0025674 0.0039062 0.6572595
TATT 129 0.0026079 0.0039062 0.6676101
TCAA 316 0.0063882 0.0039062 1.6353859
TCAC 127 0.0025674 0.0039062 0.6572595
TCAG 143 0.0028909 0.0039062 0.7400639
TCAT 311 0.0062871 0.0039062 1.6095096
TCCA 169 0.0034165 0.0039062 0.8746210
TCCC 169 0.0034165 0.0039062 0.8746210
TCCG 283 0.0057211 0.0039062 1.4646019
TCCT 144 0.0029111 0.0039062 0.7452392
TCGA 354 0.0071564 0.0039062 1.8320463
TCGC 367 0.0074192 0.0039062 1.8993248
TCGG 276 0.0055796 0.0039062 1.4283750
TCGT 322 0.0065095 0.0039062 1.6664376
TCTA 37 0.0007480 0.0039062 0.1914851
TCTC 117 0.0023653 0.0039062 0.6055068
TCTG 57 0.0011523 0.0039062 0.2949905
TCTT 222 0.0044879 0.0039062 1.1489104
TGAA 283 0.0057211 0.0039062 1.4646019
TGAC 241 0.0048720 0.0039062 1.2472405
TGAG 131 0.0026483 0.0039062 0.6779606
TGAT 242 0.0048922 0.0039062 1.2524158
TGCA 166 0.0033558 0.0039062 0.8590951
TGCC 250 0.0050540 0.0039062 1.2938180
TGCG 247 0.0049933 0.0039062 1.2782922
TGCT 230 0.0046497 0.0039062 1.1903125
TGGA 169 0.0034165 0.0039062 0.8746210
TGGC 260 0.0052561 0.0039062 1.3455707
TGGG 106 0.0021429 0.0039062 0.5485788
TGGT 80 0.0016173 0.0039062 0.4140218
TGTA 76 0.0015364 0.0039062 0.3933207
TGTC 188 0.0038006 0.0039062 0.9729511
TGTG 81 0.0016375 0.0039062 0.4191970
TGTT 227 0.0045890 0.0039062 1.1747867
TTAA 110 0.0022237 0.0039062 0.5692799
TTAC 93 0.0018801 0.0039062 0.4813003
TTAG 44 0.0008895 0.0039062 0.2277120
TTAT 136 0.0027494 0.0039062 0.7038370
TTCA 283 0.0057211 0.0039062 1.4646019
TTCC 295 0.0059637 0.0039062 1.5267052
TTCG 328 0.0066308 0.0039062 1.6974892
TTCT 158 0.0031941 0.0039062 0.8176930
TTGA 316 0.0063882 0.0039062 1.6353859
TTGC 344 0.0069543 0.0039062 1.7802935
TTGG 215 0.0043464 0.0039062 1.1126835
TTGT 211 0.0042656 0.0039062 1.0919824
TTTA 160 0.0032345 0.0039062 0.8280435
TTTC 399 0.0080661 0.0039062 2.0649335
TTTG 392 0.0079246 0.0039062 2.0287066
TTTT 573 0.0115837 0.0039062 2.9654308
Other 0 0.0000000 0.0000000 10000000000.0000000
#
# Output from 'compseq'
#
# The Expected frequencies are calculated on the (false) assumption that every
# word has equal frequency.
#
# The input sequences are:
# s21_contig00002
Word size 4
Total count 29078
#
# Word Obs Count Obs Frequency Exp Frequency Obs/Exp Frequency
#
AAAA 364 0.0125181 0.0039062 3.2046221
AAAC 202 0.0069468 0.0039062 1.7783892
AAAG 170 0.0058463 0.0039062 1.4966641
AAAT 227 0.0078066 0.0039062 1.9984868
AACA 178 0.0061215 0.0039062 1.5670954
AACC 87 0.0029920 0.0039062 0.7659399
AACG 168 0.0057776 0.0039062 1.4790563
AACT 82 0.0028200 0.0039062 0.7219204
AAGA 146 0.0050210 0.0039062 1.2853704
AAGC 188 0.0064654 0.0039062 1.6551345
AAGG 142 0.0048834 0.0039062 1.2501548
AAGT 87 0.0029920 0.0039062 0.7659399
AATA 150 0.0051585 0.0039062 1.3205860
AATC 153 0.0052617 0.0039062 1.3469977
AATG 160 0.0055024 0.0039062 1.4086251
AATT 140 0.0048146 0.0039062 1.2325469
ACAA 183 0.0062934 0.0039062 1.6111149
ACAC 72 0.0024761 0.0039062 0.6338813
ACAG 92 0.0031639 0.0039062 0.8099594
ACAT 122 0.0041956 0.0039062 1.0740766
ACCA 71 0.0024417 0.0039062 0.6250774
ACCC 46 0.0015820 0.0039062 0.4049797
ACCG 122 0.0041956 0.0039062 1.0740766
ACCT 42 0.0014444 0.0039062 0.3697641
ACGA 138 0.0047459 0.0039062 1.2149391
ACGC 89 0.0030607 0.0039062 0.7835477
ACGG 102 0.0035078 0.0039062 0.8979985
ACGT 82 0.0028200 0.0039062 0.7219204
ACTA 40 0.0013756 0.0039062 0.3521563
ACTC 46 0.0015820 0.0039062 0.4049797
ACTG 64 0.0022010 0.0039062 0.5634500
ACTT 87 0.0029920 0.0039062 0.7659399
AGAA 140 0.0048146 0.0039062 1.2325469
AGAC 56 0.0019259 0.0039062 0.4930188
AGAG 61 0.0020978 0.0039062 0.5370383
AGAT 77 0.0026481 0.0039062 0.6779008
AGCA 145 0.0049866 0.0039062 1.2765665
AGCC 103 0.0035422 0.0039062 0.9068024
AGCG 170 0.0058463 0.0039062 1.4966641
AGCT 86 0.0029576 0.0039062 0.7571360
AGGA 118 0.0040581 0.0039062 1.0388610
AGGC 91 0.0031295 0.0039062 0.8011555
AGGG 84 0.0028888 0.0039062 0.7395282
AGGT 42 0.0014444 0.0039062 0.3697641
AGTA 47 0.0016163 0.0039062 0.4137836
AGTC 46 0.0015820 0.0039062 0.4049797
AGTG 62 0.0021322 0.0039062 0.5458422
AGTT 82 0.0028200 0.0039062 0.7219204
ATAA 120 0.0041268 0.0039062 1.0564688
ATAC 86 0.0029576 0.0039062 0.7571360
ATAG 76 0.0026137 0.0039062 0.6690969
ATAT 170 0.0058463 0.0039062 1.4966641
ATCA 141 0.0048490 0.0039062 1.2413508
ATCC 117 0.0040237 0.0039062 1.0300571
ATCG 204 0.0070156 0.0039062 1.7959970
ATCT 77 0.0026481 0.0039062 0.6779008
ATGA 197 0.0067749 0.0039062 1.7343696
ATGC 122 0.0041956 0.0039062 1.0740766
ATGG 147 0.0050554 0.0039062 1.2941743
ATGT 122 0.0041956 0.0039062 1.0740766
ATTA 85 0.0029232 0.0039062 0.7483321
ATTC 153 0.0052617 0.0039062 1.3469977
ATTG 138 0.0047459 0.0039062 1.2149391
ATTT 227 0.0078066 0.0039062 1.9984868
CAAA 234 0.0080473 0.0039062 2.0601142
CAAC 136 0.0046771 0.0039062 1.1973313
CAAG 155 0.0053305 0.0039062 1.3646055
CAAT 138 0.0047459 0.0039062 1.2149391
CACA 81 0.0027856 0.0039062 0.7131164
CACC 88 0.0030263 0.0039062 0.7747438
CACG 72 0.0024761 0.0039062 0.6338813
CACT 62 0.0021322 0.0039062 0.5458422
CAGA 52 0.0017883 0.0039062 0.4578032
CAGC 152 0.0052273 0.0039062 1.3381938
CAGG 55 0.0018915 0.0039062 0.4842149
CAGT 64 0.0022010 0.0039062 0.5634500
CATA 108 0.0037141 0.0039062 0.9508219
CATC 194 0.0066717 0.0039062 1.7079579
CATG 126 0.0043332 0.0039062 1.1092922
CATT 160 0.0055024 0.0039062 1.4086251
CCAA 144 0.0049522 0.0039062 1.2677626
CCAC 71 0.0024417 0.0039062 0.6250774
CCAG 63 0.0021666 0.0039062 0.5546461
CCAT 147 0.0050554 0.0039062 1.2941743
CCCA 77 0.0026481 0.0039062 0.6779008
CCCC 94 0.0032327 0.0039062 0.8275672
CCCG 81 0.0027856 0.0039062 0.7131164
CCCT 84 0.0028888 0.0039062 0.7395282
CCGA 110 0.0037829 0.0039062 0.9684297
CCGC 167 0.0057432 0.0039062 1.4702524
CCGG 110 0.0037829 0.0039062 0.9684297
CCGT 102 0.0035078 0.0039062 0.8979985
CCTA 49 0.0016851 0.0039062 0.4313914
CCTC 90 0.0030951 0.0039062 0.7923516
CCTG 55 0.0018915 0.0039062 0.4842149
CCTT 142 0.0048834 0.0039062 1.2501548
CGAA 162 0.0055712 0.0039062 1.4262329
CGAC 101 0.0034734 0.0039062 0.8891946
CGAG 96 0.0033015 0.0039062 0.8451750
CGAT 204 0.0070156 0.0039062 1.7959970
CGCA 94 0.0032327 0.0039062 0.8275672
CGCC 183 0.0062934 0.0039062 1.6111149
CGCG 120 0.0041268 0.0039062 1.0564688
CGCT 170 0.0058463 0.0039062 1.4966641
CGGA 116 0.0039893 0.0039062 1.0212532
CGGC 171 0.0058807 0.0039062 1.5054681
CGGG 81 0.0027856 0.0039062 0.7131164
CGGT 122 0.0041956 0.0039062 1.0740766
CGTA 61 0.0020978 0.0039062 0.5370383
CGTC 110 0.0037829 0.0039062 0.9684297
CGTG 72 0.0024761 0.0039062 0.6338813
CGTT 168 0.0057776 0.0039062 1.4790563
CTAA 47 0.0016163 0.0039062 0.4137836
CTAC 46 0.0015820 0.0039062 0.4049797
CTAG 20 0.0006878 0.0039062 0.1760781
CTAT 76 0.0026137 0.0039062 0.6690969
CTCA 70 0.0024073 0.0039062 0.6162735
CTCC 109 0.0037485 0.0039062 0.9596258
CTCG 96 0.0033015 0.0039062 0.8451750
CTCT 61 0.0020978 0.0039062 0.5370383
CTGA 71 0.0024417 0.0039062 0.6250774
CTGC 97 0.0033359 0.0039062 0.8539790
CTGG 63 0.0021666 0.0039062 0.5546461
CTGT 92 0.0031639 0.0039062 0.8099594
CTTA 69 0.0023729 0.0039062 0.6074696
CTTC 169 0.0058120 0.0039062 1.4878602
CTTG 155 0.0053305 0.0039062 1.3646055
CTTT 170 0.0058463 0.0039062 1.4966641
GAAA 247 0.0084944 0.0039062 2.1745650
GAAC 126 0.0043332 0.0039062 1.1092922
GAAG 169 0.0058120 0.0039062 1.4878602
GAAT 153 0.0052617 0.0039062 1.3469977
GACA 110 0.0037829 0.0039062 0.9684297
GACC 60 0.0020634 0.0039062 0.5282344
GACG 110 0.0037829 0.0039062 0.9684297
GACT 46 0.0015820 0.0039062 0.4049797
GAGA 93 0.0031983 0.0039062 0.8187633
GAGC 107 0.0036798 0.0039062 0.9420180
GAGG 90 0.0030951 0.0039062 0.7923516
GAGT 46 0.0015820 0.0039062 0.4049797
GATA 80 0.0027512 0.0039062 0.7043125
GATC 112 0.0038517 0.0039062 0.9860376
GATG 194 0.0066717 0.0039062 1.7079579
GATT 153 0.0052617 0.0039062 1.3469977
GCAA 172 0.0059151 0.0039062 1.5142720
GCAC 73 0.0025105 0.0039062 0.6426852
GCAG 97 0.0033359 0.0039062 0.8539790
GCAT 122 0.0041956 0.0039062 1.0740766
GCCA 146 0.0050210 0.0039062 1.2853704
GCCC 81 0.0027856 0.0039062 0.7131164
GCCG 171 0.0058807 0.0039062 1.5054681
GCCT 91 0.0031295 0.0039062 0.8011555
GCGA 151 0.0051929 0.0039062 1.3293899
GCGC 160 0.0055024 0.0039062 1.4086251
GCGG 167 0.0057432 0.0039062 1.4702524
GCGT 89 0.0030607 0.0039062 0.7835477
GCTA 57 0.0019602 0.0039062 0.5018227
GCTC 107 0.0036798 0.0039062 0.9420180
GCTG 152 0.0052273 0.0039062 1.3381938
GCTT 188 0.0064654 0.0039062 1.6551345
GGAA 188 0.0064654 0.0039062 1.6551345
GGAC 66 0.0022698 0.0039062 0.5810578
GGAG 109 0.0037485 0.0039062 0.9596258
GGAT 117 0.0040237 0.0039062 1.0300571
GGCA 133 0.0045739 0.0039062 1.1709196
GGCC 70 0.0024073 0.0039062 0.6162735
GGCG 183 0.0062934 0.0039062 1.6111149
GGCT 103 0.0035422 0.0039062 0.9068024
GGGA 115 0.0039549 0.0039062 1.0124493
GGGC 81 0.0027856 0.0039062 0.7131164
GGGG 94 0.0032327 0.0039062 0.8275672
GGGT 46 0.0015820 0.0039062 0.4049797
GGTA 46 0.0015820 0.0039062 0.4049797
GGTC 60 0.0020634 0.0039062 0.5282344
GGTG 88 0.0030263 0.0039062 0.7747438
GGTT 87 0.0029920 0.0039062 0.7659399
GTAA 70 0.0024073 0.0039062 0.6162735
GTAC 52 0.0017883 0.0039062 0.4578032
GTAG 46 0.0015820 0.0039062 0.4049797
GTAT 86 0.0029576 0.0039062 0.7571360
GTCA 103 0.0035422 0.0039062 0.9068024
GTCC 66 0.0022698 0.0039062 0.5810578
GTCG 101 0.0034734 0.0039062 0.8891946
GTCT 56 0.0019259 0.0039062 0.4930188
GTGA 87 0.0029920 0.0039062 0.7659399
GTGC 73 0.0025105 0.0039062 0.6426852
GTGG 71 0.0024417 0.0039062 0.6250774
GTGT 72 0.0024761 0.0039062 0.6338813
GTTA 51 0.0017539 0.0039062 0.4489992
GTTC 126 0.0043332 0.0039062 1.1092922
GTTG 136 0.0046771 0.0039062 1.1973313
GTTT 202 0.0069468 0.0039062 1.7783892
TAAA 118 0.0040581 0.0039062 1.0388610
TAAC 51 0.0017539 0.0039062 0.4489992
TAAG 69 0.0023729 0.0039062 0.6074696
TAAT 85 0.0029232 0.0039062 0.7483321
TACA 100 0.0034390 0.0039062 0.8803907
TACC 46 0.0015820 0.0039062 0.4049797
TACG 61 0.0020978 0.0039062 0.5370383
TACT 47 0.0016163 0.0039062 0.4137836
TAGA 43 0.0014788 0.0039062 0.3785680
TAGC 57 0.0019602 0.0039062 0.5018227
TAGG 49 0.0016851 0.0039062 0.4313914
TAGT 40 0.0013756 0.0039062 0.3521563
TATA 114 0.0039205 0.0039062 1.0036454
TATC 80 0.0027512 0.0039062 0.7043125
TATG 108 0.0037141 0.0039062 0.9508219
TATT 150 0.0051585 0.0039062 1.3205860
TCAA 164 0.0056400 0.0039062 1.4438407
TCAC 87 0.0029920 0.0039062 0.7659399
TCAG 71 0.0024417 0.0039062 0.6250774
TCAT 197 0.0067749 0.0039062 1.7343696
TCCA 131 0.0045051 0.0039062 1.1533118
TCCC 115 0.0039549 0.0039062 1.0124493
TCCG 116 0.0039893 0.0039062 1.0212532
TCCT 118 0.0040581 0.0039062 1.0388610
TCGA 164 0.0056400 0.0039062 1.4438407
TCGC 151 0.0051929 0.0039062 1.3293899
TCGG 110 0.0037829 0.0039062 0.9684297
TCGT 138 0.0047459 0.0039062 1.2149391
TCTA 43 0.0014788 0.0039062 0.3785680
TCTC 93 0.0031983 0.0039062 0.8187633
TCTG 52 0.0017883 0.0039062 0.4578032
TCTT 146 0.0050210 0.0039062 1.2853704
TGAA 205 0.0070500 0.0039062 1.8048009
TGAC 103 0.0035422 0.0039062 0.9068024
TGAG 70 0.0024073 0.0039062 0.6162735
TGAT 141 0.0048490 0.0039062 1.2413508
TGCA 92 0.0031639 0.0039062 0.8099594
TGCC 133 0.0045739 0.0039062 1.1709196
TGCG 94 0.0032327 0.0039062 0.8275672
TGCT 145 0.0049866 0.0039062 1.2765665
TGGA 131 0.0045051 0.0039062 1.1533118
TGGC 146 0.0050210 0.0039062 1.2853704
TGGG 77 0.0026481 0.0039062 0.6779008
TGGT 71 0.0024417 0.0039062 0.6250774
TGTA 100 0.0034390 0.0039062 0.8803907
TGTC 110 0.0037829 0.0039062 0.9684297
TGTG 81 0.0027856 0.0039062 0.7131164
TGTT 178 0.0061215 0.0039062 1.5670954
TTAA 86 0.0029576 0.0039062 0.7571360
TTAC 70 0.0024073 0.0039062 0.6162735
TTAG 47 0.0016163 0.0039062 0.4137836
TTAT 120 0.0041268 0.0039062 1.0564688
TTCA 205 0.0070500 0.0039062 1.8048009
TTCC 188 0.0064654 0.0039062 1.6551345
TTCG 162 0.0055712 0.0039062 1.4262329
TTCT 140 0.0048146 0.0039062 1.2325469
TTGA 164 0.0056400 0.0039062 1.4438407
TTGC 172 0.0059151 0.0039062 1.5142720
TTGG 144 0.0049522 0.0039062 1.2677626
TTGT 183 0.0062934 0.0039062 1.6111149
TTTA 118 0.0040581 0.0039062 1.0388610
TTTC 247 0.0084944 0.0039062 2.1745650
TTTG 234 0.0080473 0.0039062 2.0601142
TTTT 364 0.0125181 0.0039062 3.2046221
Other 0 0.0000000 0.0000000 10000000000.0000000
我想提取第一个块的第一列和第三列(Word 和 Obs Frequency)并将它们转置;每个块从仅包含 # 的行开始,到包含 "Other" 的行结束。对于后面的各个块,我只想转置 Obs Frequency 这一列,并把结果依次追加在第一个块的转置结果下方。输出文件应如下所示:
AAAA AAAC AAAG AAAT AACA AACC AACG AACT AAGA AAGC AAGG AAGT AATA AATC AATG AATT ACAA ACAC ACAG ACAT ACCA ACCC ACCG ACCT ACGA ACGC ACGG ACGT ACTA ACTC ACTG ACTT AGAA AGAC AGAG AGAT AGCA AGCC AGCG AGCT AGGA AGGC AGGG AGGT AGTA AGTC AGTG AGTT ATAA ATAC ATAG ATAT ATCA ATCC ATCG ATCT ATGA ATGC ATGG ATGT ATTA ATTC ATTG ATTT CAAA CAAC CAAG CAAT CACA CACC CACG CACT CAGA CAGC CAGG CAGT CATA CATC CATG CATT CCAA CCAC CCAG CCAT CCCA CCCC CCCG CCCT CCGA CCGC CCGG CCGT CCTA CCTC CCTG CCTT CGAA CGAC CGAG CGAT CGCA CGCC CGCG CGCT CGGA CGGC CGGG CGGT CGTA CGTC CGTG CGTT CTAA CTAC CTAG CTAT CTCA CTCC CTCG CTCT CTGA CTGC CTGG CTGT CTTA CTTC CTTG CTTT GAAA GAAC GAAG GAAT GACA GACC GACG GACT GAGA GAGC GAGG GAGT GATA GATC GATG GATT GCAA GCAC GCAG GCAT GCCA GCCC GCCG GCCT GCGA GCGC GCGG GCGT GCTA GCTC GCTG GCTT GGAA GGAC GGAG GGAT GGCA GGCC GGCG GGCT GGGA GGGC GGGG GGGT GGTA GGTC GGTG GGTT GTAA GTAC GTAG GTAT GTCA GTCC GTCG GTCT GTGA GTGC GTGG GTGT GTTA GTTC GTTG GTTT TAAA TAAC TAAG TAAT TACA TACC TACG TACT TAGA TAGC TAGG TAGT TATA TATC TATG TATT TCAA TCAC TCAG TCAT TCCA TCCC TCCG TCCT TCGA TCGC TCGG TCGT TCTA TCTC TCTG TCTT TGAA TGAC TGAG TGAT TGCA TGCC TGCG TGCT TGGA TGGC TGGG TGGT TGTA TGTC TGTG TGTT TTAA TTAC TTAG TTAT TTCA TTCC TTCG TTCT TTGA TTGC TTGG TTGT TTTA TTTC TTTG TTTT
s21_contig00001 0.0125181 0.0069468 0.0058463 0.0078066 0.0061215 0.0029920 0.0057776 0.0028200 0.0050210 0.0064654 0.0048834 0.0029920 0.0051585 0.0052617 0.0055024 0.0048146 0.0062934 0.0024761 0.0031639 0.0041956 0.0024417 0.0015820 0.0041956 0.0014444 0.0047459 0.0030607 0.0035078 0.0028200 0.0013756 0.0015820 0.0022010 0.0029920 0.0048146 0.0019259 0.0020978 0.0026481 0.0049866 0.0035422 0.0058463 0.0029576 0.0040581 0.0031295 0.0028888 0.0014444 0.0016163 0.0015820 0.0021322 0.0028200 0.0041268 0.0029576 0.0026137 0.0058463 0.0048490 0.0040237 0.0070156 0.0026481 0.0067749 0.0041956 0.0050554 0.0041956 0.0029232 0.0052617 0.0047459 0.0078066 0.0080473 0.0046771 0.0053305 0.0047459 0.0027856 0.0030263 0.0024761 0.0021322 0.0017883 0.0052273 0.0018915 0.0022010 0.0037141 0.0066717 0.0043332 0.0055024 0.0049522 0.0024417 0.0021666 0.0050554 0.0026481 0.0032327 0.0027856 0.0028888 0.0037829 0.0057432 0.0037829 0.0035078 0.0016851 0.0030951 0.0018915 0.0048834 0.0055712 0.0034734 0.0033015 0.0070156 0.0032327 0.0062934 0.0041268 0.0058463 0.0039893 0.0058807 0.0027856 0.0041956 0.0020978 0.0037829 0.0024761 0.0057776 0.0016163 0.0015820 0.0006878 0.0026137 0.0024073 0.0037485 0.0033015 0.0020978 0.0024417 0.0033359 0.0021666 0.0031639 0.0023729 0.0058120 0.0053305 0.0058463 0.0084944 0.0043332 0.0058120 0.0052617 0.0037829 0.0020634 0.0037829 0.0015820 0.0031983 0.0036798 0.0030951 0.0015820 0.0027512 0.0038517 0.0066717 0.0052617 0.0059151 0.0025105 0.0033359 0.0041956 0.0050210 0.0027856 0.0058807 0.0031295 0.0051929 0.0055024 0.0057432 0.0030607 0.0019602 0.0036798 0.0052273 0.0064654 0.0064654 0.0022698 0.0037485 0.0040237 0.0045739 0.0024073 0.0062934 0.0035422 0.0039549 0.0027856 0.0032327 0.0015820 0.0015820 0.0020634 0.0030263 0.0029920 0.0024073 0.0017883 0.0015820 0.0029576 0.0035422 0.0022698 0.0034734 0.0019259 0.0029920 0.0025105 0.0024417 0.0024761 0.0017539 0.0043332 0.0046771 0.0069468 0.0040581 0.0017539 0.0023729 0.0029232 0.0034390 0.0015820 
0.0020978 0.0016163 0.0014788 0.0019602 0.0016851 0.0013756 0.0039205 0.0027512 0.0037141 0.0051585 0.0056400 0.0029920 0.0024417 0.0067749 0.0045051 0.0039549 0.0039893 0.0040581 0.0056400 0.0051929 0.0037829 0.0047459 0.0014788 0.0031983 0.0017883 0.0050210 0.0070500 0.0035422 0.0024073 0.0048490 0.0031639 0.0045739 0.0032327 0.0049866 0.0045051 0.0050210 0.0026481 0.0024417 0.0034390 0.0037829 0.0027856 0.0061215 0.0029576 0.0024073 0.0016163 0.0041268 0.0070500 0.0064654 0.0055712 0.0048146 0.0056400 0.0059151 0.0049522 0.0062934 0.0040581 0.0084944 0.0080473 0.0125181
s21_contig00002 0.0125181 0.0069468 0.0058463 0.0078066 0.0061215 0.0029920 0.0057776 0.0028200 0.0050210 0.0064654 0.0048834 0.0029920 0.0051585 0.0052617 0.0055024 0.0048146 0.0062934 0.0024761 0.0031639 0.0041956 0.0024417 0.0015820 0.0041956 0.0014444 0.0047459 0.0030607 0.0035078 0.0028200 0.0013756 0.0015820 0.0022010 0.0029920 0.0048146 0.0019259 0.0020978 0.0026481 0.0049866 0.0035422 0.0058463 0.0029576 0.0040581 0.0031295 0.0028888 0.0014444 0.0016163 0.0015820 0.0021322 0.0028200 0.0041268 0.0029576 0.0026137 0.0058463 0.0048490 0.0040237 0.0070156 0.0026481 0.0067749 0.0041956 0.0050554 0.0041956 0.0029232 0.0052617 0.0047459 0.0078066 0.0080473 0.0046771 0.0053305 0.0047459 0.0027856 0.0030263 0.0024761 0.0021322 0.0017883 0.0052273 0.0018915 0.0022010 0.0037141 0.0066717 0.0043332 0.0055024 0.0049522 0.0024417 0.0021666 0.0050554 0.0026481 0.0032327 0.0027856 0.0028888 0.0037829 0.0057432 0.0037829 0.0035078 0.0016851 0.0030951 0.0018915 0.0048834 0.0055712 0.0034734 0.0033015 0.0070156 0.0032327 0.0062934 0.0041268 0.0058463 0.0039893 0.0058807 0.0027856 0.0041956 0.0020978 0.0037829 0.0024761 0.0057776 0.0016163 0.0015820 0.0006878 0.0026137 0.0024073 0.0037485 0.0033015 0.0020978 0.0024417 0.0033359 0.0021666 0.0031639 0.0023729 0.0058120 0.0053305 0.0058463 0.0084944 0.0043332 0.0058120 0.0052617 0.0037829 0.0020634 0.0037829 0.0015820 0.0031983 0.0036798 0.0030951 0.0015820 0.0027512 0.0038517 0.0066717 0.0052617 0.0059151 0.0025105 0.0033359 0.0041956 0.0050210 0.0027856 0.0058807 0.0031295 0.0051929 0.0055024 0.0057432 0.0030607 0.0019602 0.0036798 0.0052273 0.0064654 0.0064654 0.0022698 0.0037485 0.0040237 0.0045739 0.0024073 0.0062934 0.0035422 0.0039549 0.0027856 0.0032327 0.0015820 0.0015820 0.0020634 0.0030263 0.0029920 0.0024073 0.0017883 0.0015820 0.0029576 0.0035422 0.0022698 0.0034734 0.0019259 0.0029920 0.0025105 0.0024417 0.0024761 0.0017539 0.0043332 0.0046771 0.0069468 0.0040581 0.0017539 0.0023729 0.0029232 0.0034390 0.0015820 
0.0020978 0.0016163 0.0014788 0.0019602 0.0016851 0.0013756 0.0039205 0.0027512 0.0037141 0.0051585 0.0056400 0.0029920 0.0024417 0.0067749 0.0045051 0.0039549 0.0039893 0.0040581 0.0056400 0.0051929 0.0037829 0.0047459 0.0014788 0.0031983 0.0017883 0.0050210 0.0070500 0.0035422 0.0024073 0.0048490 0.0031639 0.0045739 0.0032327 0.0049866 0.0045051 0.0050210 0.0026481 0.0024417 0.0034390 0.0037829 0.0027856 0.0061215 0.0029576 0.0024073 0.0016163 0.0041268 0.0070500 0.0064654 0.0055712 0.0048146 0.0056400 0.0059151 0.0049522 0.0062934 0.0040581 0.0084944 0.0080473 0.0125181
重要的是,每个块的标识符(带有模式 "s21_contig",位于 "# The input sequences are:" 这一行之后)应该放在第一列,取代 "Obs Frequency" 的位置。
答案 0 :(得分:2)
这个 awk 脚本(保存为 script.awk)应该能满足你的需求:
# script.awk -- pivot concatenated 'compseq' reports into one row per sequence.
#
# Usage: awk -f script.awk file
#
# Input:  each report names its sequence on a "# s21_contig..." comment line,
#         followed by data lines "WORD count obs_freq exp_freq obs/exp".
# Output: a header row listing every word in first-seen order (the leading
#         tab leaves the cell above the sequence names empty), then one row
#         per sequence holding its Obs Frequency (column 3) for each word.
#         Header and rows are both tab-separated so the columns line up.

# Sequence identifier line: remember the name in order of appearance.
/^# +s21_contig/ { sequence[++seqcnt] = $2 }

# Data line whose word consists only of A/C/G/T. This skips comment lines,
# the "Word size"/"Total count" lines and the "Other" row, so none of them
# is stored in map.
/^[ACGT]+ / {
    # Record each distinct word once so the column order stays stable.
    if (!seen[$1]++)
        words[++wordcnt] = $1
    # Keyed by (sequence name, word).
    map[sequence[seqcnt], $1] = $3
}

END {
    for (word = 1; word <= wordcnt; word++)
        printf "\t%s", words[word]
    print ""

    for (seqnum = 1; seqnum <= seqcnt; seqnum++) {
        printf "%s", sequence[seqnum]
        # A word missing from this sequence yields an empty cell.
        for (word = 1; word <= wordcnt; word++)
            printf "\t%s", map[sequence[seqnum], words[word]]
        # Terminate the row here so it ends even when no words were seen.
        print ""
    }
}
# Run the awk program stored in script.awk against the input file named "file".
awk -f script.awk file
AAAA AAAC AAAG AAAT AACA AACC AACG AACT AAGA AAGC AAGG AAGT AATA AATC AATG AATT ACAA ACAC ACAG ACAT ACCA ACCC ACCG ACCT ACGA ACGC ACGG ACGT ACTA ACTC ACTG ACTT AGAA AGAC AGAG AGAT AGCA AGCC AGCG AGCT AGGA AGGC AGGG AGGT AGTA AGTC AGTG AGTT ATAA ATAC ATAG ATAT ATCA ATCC ATCG ATCT ATGA ATGC ATGG ATGT ATTA ATTC ATTG ATTT CAAA CAAC CAAG CAAT CACA CACC CACG CACT CAGA CAGC CAGG CAGT CATA CATC CATG CATT CCAA CCAC CCAG CCAT CCCA CCCC CCCG CCCT CCGA CCGC CCGG CCGT CCTA CCTC CCTG CCTT CGAA CGAC CGAG CGAT CGCA CGCC CGCG CGCT CGGA CGGC CGGG CGGT CGTA CGTC CGTG CGTT CTAA CTAC CTAG CTAT CTCA CTCC CTCG CTCT CTGA CTGC CTGG CTGT CTTA CTTC CTTG CTTT GAAA GAAC GAAG GAAT GACA GACC GACG GACT GAGA GAGC GAGG GAGT GATA GATC GATG GATT GCAA GCAC GCAG GCAT GCCA GCCC GCCG GCCT GCGA GCGC GCGG GCGT GCTA GCTC GCTG GCTT GGAA GGAC GGAG GGAT GGCA GGCC GGCG GGCT GGGA GGGC GGGG GGGT GGTA GGTC GGTG GGTT GTAA GTAC GTAG GTAT GTCA GTCC GTCG GTCT GTGA GTGC GTGG GTGT GTTA GTTC GTTG GTTT TAAA TAAC TAAG TAAT TACA TACC TACG TACT TAGA TAGC TAGG TAGT TATA TATC TATG TATT TCAA TCAC TCAG TCAT TCCA TCCC TCCG TCCT TCGA TCGC TCGG TCGT TCTA TCTC TCTG TCTT TGAA TGAC TGAG TGAT TGCA TGCC TGCG TGCT TGGA TGGC TGGG TGGT TGTA TGTC TGTG TGTT TTAA TTAC TTAG TTAT TTCA TTCC TTCG TTCT TTGA TTGC TTGG TTGT TTTA TTTC TTTG TTTT
s21_contig00001 0.0115837 0.0060850 0.0061659 0.0069745 0.0045890 0.0022844 0.0064893 0.0022035 0.0044879 0.0068532 0.0039623 0.0034165 0.0026079 0.0045688 0.0057817 0.0039623 0.0042656 0.0018396 0.0020822 0.0033761 0.0016173 0.0014555 0.0043869 0.0010512 0.0065095 0.0040634 0.0050944 0.0040836 0.0007076 0.0015162 0.0017588 0.0034165 0.0031941 0.0018396 0.0020216 0.0016981 0.0046497 0.0037399 0.0070756 0.0044071 0.0029111 0.0029920 0.0022035 0.0010512 0.0012938 0.0017790 0.0021227 0.0022035 0.0027494 0.0020216 0.0012938 0.0031132 0.0048922 0.0034771 0.0087131 0.0016981 0.0062871 0.0046497 0.0043060 0.0033761 0.0022237 0.0033558 0.0043666 0.0069745 0.0079246 0.0041645 0.0054987 0.0043666 0.0016375 0.0026483 0.0028100 0.0021227 0.0011523 0.0061254 0.0013545 0.0017588 0.0025674 0.0065904 0.0036793 0.0057817 0.0043464 0.0017588 0.0020216 0.0043060 0.0021429 0.0027291 0.0042858 0.0022035 0.0055796 0.0077225 0.0059435 0.0050944 0.0007278 0.0030930 0.0013545 0.0039623 0.0066308 0.0064489 0.0048720 0.0087131 0.0049933 0.0094004 0.0072373 0.0070756 0.0057211 0.0099462 0.0042858 0.0043869 0.0027494 0.0077023 0.0028100 0.0064893 0.0008895 0.0008491 0.0002426 0.0012938 0.0026483 0.0032345 0.0048720 0.0020216 0.0028909 0.0033963 0.0020216 0.0020822 0.0012332 0.0058222 0.0054987 0.0061659 0.0080661 0.0042656 0.0058222 0.0033558 0.0038006 0.0026685 0.0077023 0.0017790 0.0023653 0.0058020 0.0030930 0.0015162 0.0027696 0.0048518 0.0065904 0.0045688 0.0069543 0.0030526 0.0033963 0.0046497 0.0052561 0.0037602 0.0099462 0.0029920 0.0074192 0.0095015 0.0077225 0.0040634 0.0010917 0.0058020 0.0061254 0.0068532 0.0059637 0.0027898 0.0032345 0.0034771 0.0050540 0.0037602 0.0094004 0.0037399 0.0034165 0.0037602 0.0027291 0.0014555 0.0009097 0.0026685 0.0026483 0.0022844 0.0018801 0.0017386 0.0008491 0.0020216 0.0048720 0.0027898 0.0064489 0.0018396 0.0025674 0.0030526 0.0017588 0.0018396 0.0010512 0.0042656 0.0041645 0.0060850 0.0032345 0.0010512 0.0012332 0.0022237 0.0015364 0.0009097 
0.0027494 0.0012938 0.0007480 0.0010917 0.0007278 0.0007076 0.0012130 0.0027696 0.0025674 0.0026079 0.0063882 0.0025674 0.0028909 0.0062871 0.0034165 0.0034165 0.0057211 0.0029111 0.0071564 0.0074192 0.0055796 0.0065095 0.0007480 0.0023653 0.0011523 0.0044879 0.0057211 0.0048720 0.0026483 0.0048922 0.0033558 0.0050540 0.0049933 0.0046497 0.0034165 0.0052561 0.0021429 0.0016173 0.0015364 0.0038006 0.0016375 0.0045890 0.0022237 0.0018801 0.0008895 0.0027494 0.0057211 0.0059637 0.0066308 0.0031941 0.0063882 0.0069543 0.0043464 0.0042656 0.0032345 0.0080661 0.0079246 0.0115837
s21_contig00002 0.0125181 0.0069468 0.0058463 0.0078066 0.0061215 0.0029920 0.0057776 0.0028200 0.0050210 0.0064654 0.0048834 0.0029920 0.0051585 0.0052617 0.0055024 0.0048146 0.0062934 0.0024761 0.0031639 0.0041956 0.0024417 0.0015820 0.0041956 0.0014444 0.0047459 0.0030607 0.0035078 0.0028200 0.0013756 0.0015820 0.0022010 0.0029920 0.0048146 0.0019259 0.0020978 0.0026481 0.0049866 0.0035422 0.0058463 0.0029576 0.0040581 0.0031295 0.0028888 0.0014444 0.0016163 0.0015820 0.0021322 0.0028200 0.0041268 0.0029576 0.0026137 0.0058463 0.0048490 0.0040237 0.0070156 0.0026481 0.0067749 0.0041956 0.0050554 0.0041956 0.0029232 0.0052617 0.0047459 0.0078066 0.0080473 0.0046771 0.0053305 0.0047459 0.0027856 0.0030263 0.0024761 0.0021322 0.0017883 0.0052273 0.0018915 0.0022010 0.0037141 0.0066717 0.0043332 0.0055024 0.0049522 0.0024417 0.0021666 0.0050554 0.0026481 0.0032327 0.0027856 0.0028888 0.0037829 0.0057432 0.0037829 0.0035078 0.0016851 0.0030951 0.0018915 0.0048834 0.0055712 0.0034734 0.0033015 0.0070156 0.0032327 0.0062934 0.0041268 0.0058463 0.0039893 0.0058807 0.0027856 0.0041956 0.0020978 0.0037829 0.0024761 0.0057776 0.0016163 0.0015820 0.0006878 0.0026137 0.0024073 0.0037485 0.0033015 0.0020978 0.0024417 0.0033359 0.0021666 0.0031639 0.0023729 0.0058120 0.0053305 0.0058463 0.0084944 0.0043332 0.0058120 0.0052617 0.0037829 0.0020634 0.0037829 0.0015820 0.0031983 0.0036798 0.0030951 0.0015820 0.0027512 0.0038517 0.0066717 0.0052617 0.0059151 0.0025105 0.0033359 0.0041956 0.0050210 0.0027856 0.0058807 0.0031295 0.0051929 0.0055024 0.0057432 0.0030607 0.0019602 0.0036798 0.0052273 0.0064654 0.0064654 0.0022698 0.0037485 0.0040237 0.0045739 0.0024073 0.0062934 0.0035422 0.0039549 0.0027856 0.0032327 0.0015820 0.0015820 0.0020634 0.0030263 0.0029920 0.0024073 0.0017883 0.0015820 0.0029576 0.0035422 0.0022698 0.0034734 0.0019259 0.0029920 0.0025105 0.0024417 0.0024761 0.0017539 0.0043332 0.0046771 0.0069468 0.0040581 0.0017539 0.0023729 0.0029232 0.0034390 0.0015820 
0.0020978 0.0016163 0.0014788 0.0019602 0.0016851 0.0013756 0.0039205 0.0027512 0.0037141 0.0051585 0.0056400 0.0029920 0.0024417 0.0067749 0.0045051 0.0039549 0.0039893 0.0040581 0.0056400 0.0051929 0.0037829 0.0047459 0.0014788 0.0031983 0.0017883 0.0050210 0.0070500 0.0035422 0.0024073 0.0048490 0.0031639 0.0045739 0.0032327 0.0049866 0.0045051 0.0050210 0.0026481 0.0024417 0.0034390 0.0037829 0.0027856 0.0061215 0.0029576 0.0024073 0.0016163 0.0041268 0.0070500 0.0064654 0.0055712 0.0048146 0.0056400 0.0059151 0.0049522 0.0062934 0.0040581 0.0084944 0.0080473 0.0125181
答案 1 :(得分:2)
这似乎也可行(将此代码保存为 transpose.awk):
/^# +s21_contig[0-9]+/ {
if (source) print_results()
source = $2
}
/^[ACGT]+ / {
if (!($1 in key))
{
key[$1] = 1
seq[++nkeys] = $1
}
obs[$1] = $3
}
END { print_results() }
# print_results: write one output row for the section named by `source`.
#
# The first call also writes the header row first: every word currently in
# seq[1..nkeys], each preceded by `pad`. The header is never printed again.
# The data row is the section name followed by obs[word] for each word,
# left-justified in 9 columns; a word with no value in this section gives a
# blank cell. obs is emptied afterwards, ready for the next section.
#
# `i` is a local loop index (awk extra-parameter idiom); callers pass nothing.
function print_results(    i)
{
    if (!printed_header) {
        pad = " "
        i = 0
        while (++i <= nkeys) {
            printf "%s%s", pad, seq[i]
            pad = " "
        }
        printf "\n"
        printed_header = 1
    }

    printf "%s ", source
    i = 0
    while (++i <= nkeys)
        printf " %-9s", obs[seq[i]]
    printf "\n"

    delete obs
}
按如下方式运行脚本:
awk -f transpose.awk data
对于给定的数据,输出如下:
AAAA AAAC AAAG AAAT AACA AACC AACG AACT AAGA AAGC AAGG AAGT AATA AATC AATG AATT ACAA ACAC ACAG ACAT ACCA ACCC ACCG ACCT ACGA ACGC ACGG ACGT ACTA ACTC ACTG ACTT AGAA AGAC AGAG AGAT AGCA AGCC AGCG AGCT AGGA AGGC AGGG AGGT AGTA AGTC AGTG AGTT ATAA ATAC ATAG ATAT ATCA ATCC ATCG ATCT ATGA ATGC ATGG ATGT ATTA ATTC ATTG ATTT CAAA CAAC CAAG CAAT CACA CACC CACG CACT CAGA CAGC CAGG CAGT CATA CATC CATG CATT CCAA CCAC CCAG CCAT CCCA CCCC CCCG CCCT CCGA CCGC CCGG CCGT CCTA CCTC CCTG CCTT CGAA CGAC CGAG CGAT CGCA CGCC CGCG CGCT CGGA CGGC CGGG CGGT CGTA CGTC CGTG CGTT CTAA CTAC CTAG CTAT CTCA CTCC CTCG CTCT CTGA CTGC CTGG CTGT CTTA CTTC CTTG CTTT GAAA GAAC GAAG GAAT GACA GACC GACG GACT GAGA GAGC GAGG GAGT GATA GATC GATG GATT GCAA GCAC GCAG GCAT GCCA GCCC GCCG GCCT GCGA GCGC GCGG GCGT GCTA GCTC GCTG GCTT GGAA GGAC GGAG GGAT GGCA GGCC GGCG GGCT GGGA GGGC GGGG GGGT GGTA GGTC GGTG GGTT GTAA GTAC GTAG GTAT GTCA GTCC GTCG GTCT GTGA GTGC GTGG GTGT GTTA GTTC GTTG GTTT TAAA TAAC TAAG TAAT TACA TACC TACG TACT TAGA TAGC TAGG TAGT TATA TATC TATG TATT TCAA TCAC TCAG TCAT TCCA TCCC TCCG TCCT TCGA TCGC TCGG TCGT TCTA TCTC TCTG TCTT TGAA TGAC TGAG TGAT TGCA TGCC TGCG TGCT TGGA TGGC TGGG TGGT TGTA TGTC TGTG TGTT TTAA TTAC TTAG TTAT TTCA TTCC TTCG TTCT TTGA TTGC TTGG TTGT TTTA TTTC TTTG TTTT
s21_contig00001 0.0115837 0.0060850 0.0061659 0.0069745 0.0045890 0.0022844 0.0064893 0.0022035 0.0044879 0.0068532 0.0039623 0.0034165 0.0026079 0.0045688 0.0057817 0.0039623 0.0042656 0.0018396 0.0020822 0.0033761 0.0016173 0.0014555 0.0043869 0.0010512 0.0065095 0.0040634 0.0050944 0.0040836 0.0007076 0.0015162 0.0017588 0.0034165 0.0031941 0.0018396 0.0020216 0.0016981 0.0046497 0.0037399 0.0070756 0.0044071 0.0029111 0.0029920 0.0022035 0.0010512 0.0012938 0.0017790 0.0021227 0.0022035 0.0027494 0.0020216 0.0012938 0.0031132 0.0048922 0.0034771 0.0087131 0.0016981 0.0062871 0.0046497 0.0043060 0.0033761 0.0022237 0.0033558 0.0043666 0.0069745 0.0079246 0.0041645 0.0054987 0.0043666 0.0016375 0.0026483 0.0028100 0.0021227 0.0011523 0.0061254 0.0013545 0.0017588 0.0025674 0.0065904 0.0036793 0.0057817 0.0043464 0.0017588 0.0020216 0.0043060 0.0021429 0.0027291 0.0042858 0.0022035 0.0055796 0.0077225 0.0059435 0.0050944 0.0007278 0.0030930 0.0013545 0.0039623 0.0066308 0.0064489 0.0048720 0.0087131 0.0049933 0.0094004 0.0072373 0.0070756 0.0057211 0.0099462 0.0042858 0.0043869 0.0027494 0.0077023 0.0028100 0.0064893 0.0008895 0.0008491 0.0002426 0.0012938 0.0026483 0.0032345 0.0048720 0.0020216 0.0028909 0.0033963 0.0020216 0.0020822 0.0012332 0.0058222 0.0054987 0.0061659 0.0080661 0.0042656 0.0058222 0.0033558 0.0038006 0.0026685 0.0077023 0.0017790 0.0023653 0.0058020 0.0030930 0.0015162 0.0027696 0.0048518 0.0065904 0.0045688 0.0069543 0.0030526 0.0033963 0.0046497 0.0052561 0.0037602 0.0099462 0.0029920 0.0074192 0.0095015 0.0077225 0.0040634 0.0010917 0.0058020 0.0061254 0.0068532 0.0059637 0.0027898 0.0032345 0.0034771 0.0050540 0.0037602 0.0094004 0.0037399 0.0034165 0.0037602 0.0027291 0.0014555 0.0009097 0.0026685 0.0026483 0.0022844 0.0018801 0.0017386 0.0008491 0.0020216 0.0048720 0.0027898 0.0064489 0.0018396 0.0025674 0.0030526 0.0017588 0.0018396 0.0010512 0.0042656 0.0041645 0.0060850 0.0032345 0.0010512 0.0012332 0.0022237 0.0015364 0.0009097 
0.0027494 0.0012938 0.0007480 0.0010917 0.0007278 0.0007076 0.0012130 0.0027696 0.0025674 0.0026079 0.0063882 0.0025674 0.0028909 0.0062871 0.0034165 0.0034165 0.0057211 0.0029111 0.0071564 0.0074192 0.0055796 0.0065095 0.0007480 0.0023653 0.0011523 0.0044879 0.0057211 0.0048720 0.0026483 0.0048922 0.0033558 0.0050540 0.0049933 0.0046497 0.0034165 0.0052561 0.0021429 0.0016173 0.0015364 0.0038006 0.0016375 0.0045890 0.0022237 0.0018801 0.0008895 0.0027494 0.0057211 0.0059637 0.0066308 0.0031941 0.0063882 0.0069543 0.0043464 0.0042656 0.0032345 0.0080661 0.0079246 0.0115837
s21_contig00002 0.0125181 0.0069468 0.0058463 0.0078066 0.0061215 0.0029920 0.0057776 0.0028200 0.0050210 0.0064654 0.0048834 0.0029920 0.0051585 0.0052617 0.0055024 0.0048146 0.0062934 0.0024761 0.0031639 0.0041956 0.0024417 0.0015820 0.0041956 0.0014444 0.0047459 0.0030607 0.0035078 0.0028200 0.0013756 0.0015820 0.0022010 0.0029920 0.0048146 0.0019259 0.0020978 0.0026481 0.0049866 0.0035422 0.0058463 0.0029576 0.0040581 0.0031295 0.0028888 0.0014444 0.0016163 0.0015820 0.0021322 0.0028200 0.0041268 0.0029576 0.0026137 0.0058463 0.0048490 0.0040237 0.0070156 0.0026481 0.0067749 0.0041956 0.0050554 0.0041956 0.0029232 0.0052617 0.0047459 0.0078066 0.0080473 0.0046771 0.0053305 0.0047459 0.0027856 0.0030263 0.0024761 0.0021322 0.0017883 0.0052273 0.0018915 0.0022010 0.0037141 0.0066717 0.0043332 0.0055024 0.0049522 0.0024417 0.0021666 0.0050554 0.0026481 0.0032327 0.0027856 0.0028888 0.0037829 0.0057432 0.0037829 0.0035078 0.0016851 0.0030951 0.0018915 0.0048834 0.0055712 0.0034734 0.0033015 0.0070156 0.0032327 0.0062934 0.0041268 0.0058463 0.0039893 0.0058807 0.0027856 0.0041956 0.0020978 0.0037829 0.0024761 0.0057776 0.0016163 0.0015820 0.0006878 0.0026137 0.0024073 0.0037485 0.0033015 0.0020978 0.0024417 0.0033359 0.0021666 0.0031639 0.0023729 0.0058120 0.0053305 0.0058463 0.0084944 0.0043332 0.0058120 0.0052617 0.0037829 0.0020634 0.0037829 0.0015820 0.0031983 0.0036798 0.0030951 0.0015820 0.0027512 0.0038517 0.0066717 0.0052617 0.0059151 0.0025105 0.0033359 0.0041956 0.0050210 0.0027856 0.0058807 0.0031295 0.0051929 0.0055024 0.0057432 0.0030607 0.0019602 0.0036798 0.0052273 0.0064654 0.0064654 0.0022698 0.0037485 0.0040237 0.0045739 0.0024073 0.0062934 0.0035422 0.0039549 0.0027856 0.0032327 0.0015820 0.0015820 0.0020634 0.0030263 0.0029920 0.0024073 0.0017883 0.0015820 0.0029576 0.0035422 0.0022698 0.0034734 0.0019259 0.0029920 0.0025105 0.0024417 0.0024761 0.0017539 0.0043332 0.0046771 0.0069468 0.0040581 0.0017539 0.0023729 0.0029232 0.0034390 0.0015820 
0.0020978 0.0016163 0.0014788 0.0019602 0.0016851 0.0013756 0.0039205 0.0027512 0.0037141 0.0051585 0.0056400 0.0029920 0.0024417 0.0067749 0.0045051 0.0039549 0.0039893 0.0040581 0.0056400 0.0051929 0.0037829 0.0047459 0.0014788 0.0031983 0.0017883 0.0050210 0.0070500 0.0035422 0.0024073 0.0048490 0.0031639 0.0045739 0.0032327 0.0049866 0.0045051 0.0050210 0.0026481 0.0024417 0.0034390 0.0037829 0.0027856 0.0061215 0.0029576 0.0024073 0.0016163 0.0041268 0.0070500 0.0064654 0.0055712 0.0048146 0.0056400 0.0059151 0.0049522 0.0062934 0.0040581 0.0084944 0.0080473 0.0125181
脚本按照首次出现的顺序输出各个 [ACGT] 单词对应的列,并在遇到新单词时追加新列。如果某个序列缺少某个单词的值,输出中对应的字段将为空白。标题行对应于第一组数据结束时已收集到的 [ACGT] 单词列表;此后脚本不会再次打印标题。
答案 2 :(得分:1)
只遍历一次文件就完成这项任务颇具挑战性,下面的脚本做到了这一点。输出格式与题目给出的示例一致,但可以很容易地调整。脚本中的注释解释了各步骤的操作。
#!/bin/bash
#
# Transpose 'compseq' output into one row per input sequence, in one pass.
#
# Usage: dna.sh datafile [key]
#   datafile  compseq report containing one or more sequence sections
#   key       prefix identifying a sequence name (default "s21")
#
# Output: a header row listing the words of the first section, then one row
# per sequence holding its name followed by the Obs Frequency of each word.
# This script requires the word list to be the same for every sequence.

if [[ ! -f "${1-}" ]]; then
  # Name the missing data file itself (not the script) and report on stderr.
  printf "\n Error: insufficient input, file '%s' not found.\n\n" "${1-}" >&2
  exit 1
fi

key="${2:-s21}"          # prefix that identifies a sequence name
inputseq=""              # name of the sequence currently being collected
declare -i hdrdone=0     # becomes 1 once the header row is complete
declare -i seqcnt=0      # number of sequence names seen so far
declare -a obsfarray=()  # Obs Frequency values of the current sequence

# Print one output row: a newline, the sequence name ($1), then every
# remaining argument preceded by a single space.
print_row() {
  local name=$1
  shift
  printf '\n%s' "$name"
  if (( $# > 0 )); then
    # Values are passed as data, never as the printf format string.
    printf ' %s' "$@"
  fi
}

## make a single pass through the data file
# Only the first three fields are used; '_' swallows the rest of the line.
# The '|| [[ -n ... ]]' part handles a last line with no trailing newline.
while read -r word obscnt obsfreq _ || [[ -n "$word" ]]; do

  # Sequence name line ("# <key>..."): the name is the second field.
  # A prefix match is used so that keys of any length work.
  if [[ "$obscnt" == "$key"* ]]; then
    if (( seqcnt > 0 )); then
      hdrdone=1                # first section is over: header is complete
      print_row "$inputseq" "${obsfarray[@]}"
      obsfarray=()             # start collecting the next sequence
    fi
    inputseq=$obscnt
    seqcnt=$(( seqcnt + 1 ))
  fi

  # Data line: the word consists only of A, C, G and T. Matching the whole
  # word also rejects 'Total count ...', which merely begins with a T.
  if [[ "$word" =~ ^[ACGT]+$ ]]; then
    (( hdrdone )) || printf ' %s' "$word"   # header cell (first section only)
    obsfarray+=( "$obsfreq" )
  fi
done <"$1"

# Print the final sequence, then terminate the output with a newline.
if (( seqcnt > 0 || ${#obsfarray[@]} > 0 )); then
  print_row "$inputseq" "${obsfarray[@]}"
fi
printf '\n'

exit 0
输出(每行仅显示前 5 个值,其余已省略):
$ ./dna.sh dat/dna.dat
AAAA AAAC AAAG AAAT AACA <snip>
s21_contig00001 0.0115837 0.0060850 0.0061659 0.0069745 0.0045890 <snip>
s21_contig00002 0.0125181 0.0069468 0.0058463 0.0078066 0.0061215 <snip>