Perl脚本:组合来自许多文件的数据

时间:2013-12-10 01:47:58

标签: matrix

有许多文本文件包含

2 个答案:

答案 0 :(得分:1)

这是另一种选择。输出将写入STDOUT并按您的要求进行格式化:

#!/usr/bin/perl

use strict;
use warnings;

# Build a class-by-file count matrix from a run of numbered input files
# (<index>.txt in the current directory) and print it to STDOUT.
#
# Each non-blank input line is split on ';' into three name fields and a
# count. Output is a header line "classname: <index> <index> ..." followed
# by one line per class name (sorted as strings), listing that class's
# count in every file, or 0 where the file does not mention the class.

my $nfiles = 4;      # number of input files; change as needed
my $finit  = 701;    # index of the first file

# File indexes, generated in situ. They serve as the column labels and as
# the keys of %count_for, so the read loop and the print loop must both
# use exactly these values.
my @files = map { $finit + $_ } 0 .. $nfiles - 1;

my %names;        # every kpc_name seen in any file
my %count_for;    # $count_for{$file_index}{$kpc_name} = count

for my $file_index (@files) {
    my $path = "$file_index.txt";

    open my $fh, '<', $path or die "Cannot open $path: $!";

    while (my $rec = <$fh>) {
        chomp $rec;

        # Strip surrounding white space, then skip blank lines. The test is
        # on length so that a line consisting of "0" is not mistaken for an
        # empty one.
        $rec =~ s/^\s+//;
        $rec =~ s/\s+$//;
        next unless length $rec;

        my ($k, $p, $c, $seq_count) = split /;/, $rec;

        # A record without a count field cannot contribute a cell value.
        unless (defined $seq_count) {
            warn "Skipping malformed record in $path: $rec\n";
            next;
        }

        my $kpc_name = "$k;$p;$c;";

        $count_for{$file_index}{$kpc_name} = $seq_count;
        $names{$kpc_name}++;    # only used to collect the set of names
    }

    close $fh;
}

# Header: one column label per input file.
print STDOUT 'classname:', (map { " $_" } @files), "\n";

# One output line per class name, counts separated by ", ".
for my $kpc_name (sort keys %names) {
    my @counts = map {
        exists $count_for{$_}{$kpc_name} ? $count_for{$_}{$kpc_name} : 0
    } @files;

    print STDOUT $kpc_name, ' ', join(', ', @counts), "\n";
}

答案 1 :(得分:0)

试试这个。如果数据文件不在当前工作目录中,可以把它们所在的目录作为参数传给脚本;否则脚本默认使用当前目录(./)。

输出默认打印到标准输出(终端),可以重定向到您选择的文件。

例如:perl progname.pl > output.tsv



#!/usr/bin/perl

use strict;
use warnings;

# Combine per-file class counts into a single table printed to STDOUT.
#
# Usage: perl progname.pl [DIR]
#
# DIR must contain "uniqueclasses.txt" (the list of known classes) and any
# number of data files named 70N50N_classes.txt (N = one digit each). The
# first six characters of a data file name are used as its column label.

# Directory holding the input files; defaults to the current directory.
my $dir = shift || "./";

# Paths below are built by plain concatenation, so the directory must end
# with a separator; without this, "data" would yield "datauniqueclasses.txt".
$dir .= '/' unless $dir =~ m{/\z};

opendir(my $dh, $dir) or die "Could not open directory: $!";

# Collect the data files. The pattern is anchored because the column label
# is taken from the first six characters of the name, and the dot is escaped
# so only a literal ".txt" suffix matches. Sorting gives a stable column
# order instead of whatever order readdir happens to return.
my @files = sort grep { /\A70\d50\d_classes\.txt\z/ } readdir $dh;

closedir $dh;

# Read the known classes. Each entry is
#   [ "<field0> <field1>", { class => <field2>, found => 0 } ]
# and later gains one key per data file in which the class was seen.
my @classes;

open(my $class_fh, "<", $dir . 'uniqueclasses.txt') or die "Could not open file: $!";

while (my $line = <$class_fh>) {
    chomp $line;
    next unless $line;
    my @fields = split /;\s?/, $line;
    push @classes, [join(" ", @fields[0, 1]), {'class' => $fields[2], 'found' => 0}];
}

close $class_fh;

# Go through the data files; only the class (third white-space separated
# field) and the count (fourth field) of each line are used.
for my $file (@files) {
    my $key = substr($file, 0, 6);    # column label for this file

    open(my $data_fh, "<", $dir . $file) or die "Could not open file: $!";

    while (my $line = <$data_fh>) {
        chomp $line;
        next unless $line;

        my @fields = split /\s/, $line;

        # Lines with fewer than three fields carry no class to match.
        next unless defined $fields[2];

        for my $entry (@classes) {
            my $info = $entry->[1];
            next unless defined($info->{'class'}) && $info->{'class'} eq $fields[2];

            $info->{$key}    = $fields[3];
            $info->{'found'} = 1;
        }
    }

    close $data_fh;
}

# Header line: four leading tabs, then one tab-prefixed label per file.
print "\t\t\t\t";

my @columns = map { substr($_, 0, 6) } @files;
print "\t$_" for @columns;

print "\n";

# Rows: classes never seen in any data file are omitted. Every row starts
# with its own newline, so the output has a blank line after the header and
# no newline after the last row.
for my $row (@classes) {
    my $info = $row->[1];
    next unless $info->{'found'};

    print "\n", $row->[0], "\t", $info->{'class'};

    # A file that never mentioned this class contributes 0.
    for my $column (@columns) {
        print "\t", (defined $info->{$column} ? $info->{$column} : 0);
    }
}