我想根据两个文件之间匹配的字符串来合并这两个文件。在File1中,该字符串位于第4列;在File2中,该字符串位于符号":"和"-"之间。
File1中:
chr1 Rfam ncRNA 157784 157887 66.91 - . ID=RF00026.1;Name=RF00026;Alias=U6;Note=AL627309.15/147374-147271
chr1 Rfam ncRNA 564813 564881 36.11 + . ID=RF00005.1;Name=RF00005;Alias=tRNA;Note=AC114498.2/43445-43513
chr1 Rfam ncRNA 564879 564950 32.30 - . ID=RF00005.2;Name=RF00005;Alias=tRNA;Note=AC114498.2/43582-43511
chr1 Rfam ncRNA 564952 565019 28.17 + . ID=RF00005.3;Name=RF00005;Alias=tRNA;Note=AC114498.2/43584-43651
chr1 Rfam ncRNA 566062 566129 31.36 + . ID=RF00005.4;Name=RF00005;Alias=tRNA;Note=AC114498.2/44694-44761
chr1 Rfam ncRNA 566137 566205 30.82 - . ID=RF00005.5;Name=RF00005;Alias=tRNA;Note=AC114498.2/44837-44769
chr1 Rfam ncRNA 566207 566279 35.81 - . ID=RF00005.6;Name=RF00005;Alias=tRNA;Note=AC114498.2/44911-44839
chr1 Rfam ncRNA 566311 566376 26.05 - . ID=RF00005.7;Name=RF00005;Alias=tRNA;Note=AC114498.2/45008-44943
File2中:
chr1:157783-157887 aaaaatatggaatgcttcacaaatttgcatgtcattctttcacagaggccgtgccaatctctctattgttccaacttaagtatgtgtgctactgaggcaagcaT
chr1:564812-564881 AGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGTAAATAATAGGAGTTTAAATCCCCTTATTTcta
chr1:564878-564950 ctaggactatgagaatcgaacccatccctgagaatccaaaattctccgtgccacctatcacaccccatccta
chr1:564951-565019 AGTAAGGTCAGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGGTTATATCCTTCCCGTACTA
chr1:566061-566129 AGAAATTTAGGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTG
chr1:566136-566205 TAAGGACTGCAAAACCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAGCCCTT
chr1:566206-566279 CTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAACAGCTAAGCACCCTAATCAACTGGCTTCAATCTA
chr1:566310-566376 AAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGAAAATCACCTCAGAGCT
chr1:566376-566441 GGTAAAAAGAGGCTTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACC
chr1:568068-568136 AAGATATTAGAAAAACCATTTCATAACTTTGTCAAAGTTAAATTATAGGCTAAATCCTATATATCTTA
chr1:568843-568913 CACTGTAAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAAGAGAACCAACACCTCTTTACAGTGA
输出:
输出应包含两个文件中的所有列。
chr1 Rfam ncRNA 157784 157887 66.91 - . ID=RF00026.1;Name=RF00026;Alias=U6;Note=AL627309.15/147374-147271 chr1:157783-157887 aaaaatatggaatgcttcacaaatttgcatgtcattctttcacagaggccgtgccaatctctctattgttccaacttaagtatgtgtgctactgaggcaagcaT
....
....
我之前为一个类似但不完全相同的问题编写了以下代码:
#!/usr/bin/perl
# Group the second whitespace-separated column of every input line under
# its first column, across all listed files, then print one tab-separated
# line per key (keys sorted as strings): the key followed by every value
# collected for it, in the order the values were read.
use strict;
use warnings;

# Input files, read in this order. Put file paths here, or use @ARGV
# instead to take the files from the command line.
my @files = qw| input.log input1.log |;

# Maps a first-column key to an array ref of second-column values.
my %data;

foreach my $filename (@files) {
    open my $fh, '<', $filename
        or die "Cannot open $filename for reading: $!";

    while (my $line = <$fh>) {
        chomp $line;

        # Blank lines carry no key; without this guard they would yield an
        # undefined key, "uninitialized value" warnings and an empty row.
        next unless $line =~ /\S/;

        # split ' ' ignores leading whitespace, so an indented line is
        # still keyed on its first real field rather than on ''.
        my ($col1, $col2) = split ' ', $line;

        # A line with only one field stores an empty value: the printed
        # output stays the same, but no undef ever reaches join().
        push @{ $data{$col1} }, defined $col2 ? $col2 : '';
    }

    close $fh or warn "Cannot close $filename: $!";
}

foreach my $col1 (sort keys %data) {
    print join("\t", $col1, @{ $data{$col1} }), "\n";
}
答案 0 :(得分:1)
这会根据所请求的列/数据连接两个文件。只有在两个文件中找到匹配时,才会打印相应的行。
编辑:显示包含新样本数据的结果:
$ cat f1
chr1 Rfam ncRNA 157783 157887 66.91 - 0 ID=RF00026.1;Name=RF00026;Alias=U6;Note=AL627309.15/147374-147271
chr1 Rfam ncRNA 564812 564881 36.11 + 0 ID=RF00005.1;Name=RF00005;Alias=tRNA;Note=AC114498.2/43445-43513
chr1 Rfam ncRNA 564878 564950 32.3 - 0 ID=RF00005.2;Name=RF00005;Alias=tRNA;Note=AC114498.2/43582-43511
chr1 Rfam ncRNA 564951 565019 28.17 + 0 ID=RF00005.3;Name=RF00005;Alias=tRNA;Note=AC114498.2/43584-43651
chr1 Rfam ncRNA 566061 566129 31.36 + 0 ID=RF00005.4;Name=RF00005;Alias=tRNA;Note=AC114498.2/44694-44761
chr1 Rfam ncRNA 566136 566205 30.82 - 0 ID=RF00005.5;Name=RF00005;Alias=tRNA;Note=AC114498.2/44837-44769
chr1 Rfam ncRNA 566206 566279 35.81 - 0 ID=RF00005.6;Name=RF00005;Alias=tRNA;Note=AC114498.2/44911-44839
chr1 Rfam ncRNA 566310 566376 26.05 - 0 ID=RF00005.7;Name=RF00005;Alias=tRNA;Note=AC114498.2/45008-44943
chr1 Rfam ncRNA 566376 566441 37.46 - 0 ID=RF00005.8;Name=RF00005;Alias=tRNA;Note=AC114498.2/45073-45009
chr1 Rfam ncRNA 568068 568136 31.45 + 0 ID=RF00005.9;Name=RF00005;Alias=tRNA;Note=AC114498.2/46701-46768
$ cat f2
chr1:157783-157887 aaaaatatggaatgcttcacaaatttgcatgtcattctttcacagaggccgtgccaatctctctattgttccaacttaagtatgtgtgctactgaggcaagcaT
chr1:564812-564881 AGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGTAAATAATAGGAGTTTAAATCCCCTTATTTcta
chr1:564878-564950 ctaggactatgagaatcgaacccatccctgagaatccaaaattctccgtgccacctatcacaccccatccta
chr1:564951-565019 AGTAAGGTCAGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGGTTATATCCTTCCCGTACTA
chr1:566061-566129 AGAAATTTAGGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTG
chr1:566136-566205 TAAGGACTGCAAAACCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAGCCCTT
chr1:566206-566279 CTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAACAGCTAAGCACCCTAATCAACTGGCTTCAATCTA
chr1:566310-566376 AAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGAAAATCACCTCAGAGCT
chr1:566376-566441 GGTAAAAAGAGGCTTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACC
chr1:568068-568136 AAGATATTAGAAAAACCATTTCATAACTTTGTCAAAGTTAAATTATAGGCTAAATCCTATATATCTTA
$ awk '
    # Split on runs of tab, space, ":" and "-", so an f1 line yields
    # chromosome = $1 and start = $4, and an f2 line ("chr:start-end seq")
    # yields chromosome = $1 and start = $2.
    BEGIN { FS = "[\t :-]+" }
    # First file (f1): remember each whole line under its (chromosome, start)
    # key. Keying on the start alone would mix up different chromosomes.
    NR == FNR { a[$1, $4] = $0; next }
    # Second file (f2): print the stored f1 line followed by the f2 line,
    # but only when the same (chromosome, start) key was seen in f1.
    ($1, $2) in a { print a[$1, $2], $0 }
' f1 f2
chr1 Rfam ncRNA 157783 157887 66.91 - 0 ID=RF00026.1;Name=RF00026;Alias=U6;Note=AL627309.15/147374-147271 chr1:157783-157887 aaaaatatggaatgcttcacaaatttgcatgtcattctttcacagaggccgtgccaatctctctattgttccaacttaagtatgtgtgctactgaggcaagcaT
chr1 Rfam ncRNA 564812 564881 36.11 + 0 ID=RF00005.1;Name=RF00005;Alias=tRNA;Note=AC114498.2/43445-43513 chr1:564812-564881 AGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGTAAATAATAGGAGTTTAAATCCCCTTATTTcta
chr1 Rfam ncRNA 564878 564950 32.3 - 0 ID=RF00005.2;Name=RF00005;Alias=tRNA;Note=AC114498.2/43582-43511 chr1:564878-564950 ctaggactatgagaatcgaacccatccctgagaatccaaaattctccgtgccacctatcacaccccatccta
chr1 Rfam ncRNA 564951 565019 28.17 + 0 ID=RF00005.3;Name=RF00005;Alias=tRNA;Note=AC114498.2/43584-43651 chr1:564951-565019 AGTAAGGTCAGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGGTTATATCCTTCCCGTACTA
chr1 Rfam ncRNA 566061 566129 31.36 + 0 ID=RF00005.4;Name=RF00005;Alias=tRNA;Note=AC114498.2/44694-44761 chr1:566061-566129 AGAAATTTAGGTTAAATACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTG
chr1 Rfam ncRNA 566136 566205 30.82 - 0 ID=RF00005.5;Name=RF00005;Alias=tRNA;Note=AC114498.2/44837-44769 chr1:566136-566205 TAAGGACTGCAAAACCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAGCCCTT
chr1 Rfam ncRNA 566206 566279 35.81 - 0 ID=RF00005.6;Name=RF00005;Alias=tRNA;Note=AC114498.2/44911-44839 chr1:566206-566279 CTAGACCAATGGGACTTAAACCCACAAACACTTAGTTAACAGCTAAGCACCCTAATCAACTGGCTTCAATCTA
chr1 Rfam ncRNA 566310 566376 26.05 - 0 ID=RF00005.7;Name=RF00005;Alias=tRNA;Note=AC114498.2/45008-44943 chr1:566310-566376 AAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGAAAATCACCTCAGAGCT
chr1 Rfam ncRNA 566376 566441 37.46 - 0 ID=RF00005.8;Name=RF00005;Alias=tRNA;Note=AC114498.2/45073-45009 chr1:566376-566441 GGTAAAAAGAGGCTTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACC
chr1 Rfam ncRNA 568068 568136 31.45 + 0 ID=RF00005.9;Name=RF00005;Alias=tRNA;Note=AC114498.2/46701-46768 chr1:568068-568136 AAGATATTAGAAAAACCATTTCATAACTTTGTCAAAGTTAAATTATAGGCTAAATCCTATATATCTTA