Question

我的代码循环遍历目录中的多个文件，解析每个文件并将每个文件的已解析内容追加到FinalVariantfile.txt。

代码可以运行，但每个文件的内容会在输出中重复出现。

当我用两个文件运行代码时，输出中却包含了相当于4个文件的内容。有人可以解释为什么会发生这种情况，以及如何解决这个问题吗？

    #!/usr/bin/perl -w

    use strict;

    # Setup for the run given in $ARGV[0]: builds the paths used below, writes
    # the list of FOCUS*.tsv files in the run directory to tsv_files.txt, and
    # reads that list back into @tsvfiles.

    # directory structure; every path is derived from $home and $ARGV[0]
    # NOTE(review): $ARGV[0] is used without checking that it was supplied.

    my $home         = "/data/";
    my $tsvdirectory = $home . "test_all_runs/" . $ARGV[0];
    my $tsvfiles     = $home . "test_all_runs/" . $ARGV[0] . "/tsv_files.txt";

    my $FinalVariants = $home . "test_all_runs/" . $ARGV[0] . "/FinalVariantfile.txt";

    # File-scope working variables.  They are declared once here, outside the
    # per-file loop, so they keep their contents from one file to the next.
    # $currentCNVline, @currentCNVlines and @CNVLines are not used in the code
    # shown.
    my @tsvfiles        = ();
    my @currentlines    = ();
    my $currentline     = '';
    my $currentCNVline  = '';
    my @currentCNVlines = ();
    my @HotSpotLines    = ();
    my @CNVLines        = ();

    # Runs ls through the shell and redirects the list of FOCUS*.tsv paths in
    # the run directory into tsv_files.txt.  Because stdout is redirected, the
    # backticks have nothing to capture into $cmd.
    my $cmd = `ls $tsvdirectory/FOCUS*\.tsv > $tsvfiles`;

    # print "$cmd";
    # The same command as a plain string; it is not executed anywhere in the
    # code shown.
    my $cmda = "ls $tsvdirectory/FOCUS*\.tsv > $tsvfiles";

    # print "$cmda";

    # Reads tsv_files.txt line by line into @tsvfiles (one path per element).
    # NOTE(review): the result of open is not checked, and TXT2 is a bareword
    # filehandle.
    open( TXT2, "$tsvfiles" );
    while ( <TXT2> ) {
        push( @tsvfiles, $_ );
    }
    close(TXT2);

    # chop removes the last character of every entry (the trailing newline of
    # each line read above).
    foreach ( @tsvfiles ) {
        chop($_);
    }

    # Processes each file named in @tsvfiles: derives the intermediate file
    # names, filters the lines with fgrep/grep and a genotype check, builds one
    # tab-separated output line per remaining input line, and appends the lines
    # to $FinalVariants.
    foreach ( @tsvfiles ) {

        my $currenttsvfile = "$_";    # path of the file handled in this iteration

        # $MDLfinaltsvfile is computed but not used again in the code shown.
        my $MDLfinaltsvfile = $currenttsvfile;
        $MDLfinaltsvfile =~ s/\.tsv/_prepared\.txt/g;

        # The following variables name the intermediate and final output
        # files; each is the input path with ".tsv" replaced by a suffix.

        my $MDLlinestsvfile = $currenttsvfile;
        $MDLlinestsvfile =~ s/\.tsv/_withCNV\.txt/g;

        my $Variantlinestsvfile = $currenttsvfile;
        $Variantlinestsvfile =~ s/\.tsv/_HotSpot\.txt/g;

        my $MDLtsvfile = $currenttsvfile;
        $MDLtsvfile =~ s/\.tsv/_FilteredAllcolumns\.txt/g;

        # Sample ID: strip "-oncogene.tsv" from the path, split the rest on
        # "/", and take element 4 (with $home = "/data/" that is the file name
        # component below the run directory).
        my $MDLsampleid = $currenttsvfile;
        $MDLsampleid =~ s/\-oncogene.tsv//g;
        print "The currentVCFis############# " . $currenttsvfile . "\n";

        my @SampleID = ();
        @SampleID = split /\//, $MDLsampleid;
        print "The sampleIDis##############" . $SampleID[4] . "\n";

        # $CNVdata and $FinalCNVdata are computed but not used again in the
        # code shown.
        my $CNVdata = $currenttsvfile;
        $CNVdata =~ s/\.tsv/_cnv\.txt/g;

        my $FinalCNVdata = $currenttsvfile;
        $FinalCNVdata =~ s/\.tsv/_finalcnv\.txt/g;

        # Copies every line of the current file that does not contain "#" into
        # the _withCNV file.  The output is redirected, so $cmd2 has nothing to
        # print.
        my $cmd2 = `fgrep -v "#" $currenttsvfile > $MDLlinestsvfile`;
        print "$cmd2";

        # NOTE(review): the text between the backticks below spans four lines,
        # including a line that starts with "#".  Everything between the
        # backticks is passed to the shell as written; that "#" line is part of
        # the command string, not a Perl comment.
        my $cmd5 = `grep -vwE "(CNV|intronic|synonymous|utr_3|utr_5)" 
#removes lines that contain CNV/intronic/synonymous/utr_3/utr_5"

$MDLlinestsvfile > $Variantlinestsvfile`;
        print "$cmd5";

        open( my $fh_in, '<', $Variantlinestsvfile )
                or die "cannot open $Variantlinestsvfile: $!\n"; 
# The loop below copies lines from the _HotSpot file to the
# _FilteredAllcolumns file, skipping lines whose field 70 contains 0/0 or ./.

        open( my $fh_out, '>', $MDLtsvfile )
                or die "cannot open $MDLtsvfile: $!\n";

        while ( my $line = <$fh_in> ) {

            # The line is split on runs of whitespace (not only on tabs); the
            # pattern matches the same character, "0" or ".", on both sides of
            # a "/".
            my @fields = split( /\s+/, $line );
            print $fh_out $line unless ( $fields[70] =~ m|([0.])/\1| );
        }
        close($fh_in);
        close($fh_out);

        # Reads the filtered file and pushes every line onto @HotSpotLines.
        # NOTE(review): @HotSpotLines is declared before this loop and is not
        # emptied here, so for the second and later files it still holds the
        # lines pushed for the earlier files.
        open( TXT2, "$MDLtsvfile" );
        while (<TXT2>) {
            push( @HotSpotLines, $_ );
        }
        close(TXT2);

        # Builds one output line per element of @HotSpotLines.  Elements are
        # modified in place: chop removes the last character of $_ each time
        # this loop visits it.
        foreach (@HotSpotLines) {
            chop($_);

            my @HotSpotEntries = ();
            my $currentMDLline = $_;
            @HotSpotEntries = split( /\t/, $currentMDLline );

            # Columns selected by zero-based, tab-delimited index.
            my $chr        = $HotSpotEntries[9];
            my $position   = $HotSpotEntries[10];
            my $cosmicids  = $HotSpotEntries[21];
            my $refforward = $HotSpotEntries[67];
            my $genotype   = $HotSpotEntries[70];
            my $altforward = $HotSpotEntries[77];
            my $altreverse = $HotSpotEntries[78];
            my $cDNA       = $HotSpotEntries[81];
            my $exon       = $HotSpotEntries[83];
            my $conseq     = $HotSpotEntries[84];
            my $location   = $HotSpotEntries[88];
            my $geneclass  = $HotSpotEntries[92];
            my $aachange   = $HotSpotEntries[98];
            my $transcript = $HotSpotEntries[100];

            # NOTE(review): $refreverse is not declared anywhere in the code
            # shown; under "use strict" that is a compile-time error.
            $currentline
                    = $SampleID[4] . "\t"
                    . $chr . "\t"
                    . $position . "\t"
                    . $cosmicids . "\t"
                    . $refforward . "\t"
                    . $refreverse . "\t"
                    . $genotype . "\t"
                    . $altforward . "\t"
                    . $altreverse . "\t"
                    . $cDNA . "\t"
                    . $exon . "\t"
                    . $conseq . "\t"
                    . $location . "\t"
                    . $geneclass . "\t"
                    . $aachange . "\t"
                    . $transcript;

            # print "The currentVCFlineis ".$currentline."\n";
            # NOTE(review): @currentlines is also declared before the outer
            # loop and never emptied, so it accumulates across files.
            push( @currentlines, $currentline );

        }

        my $i;

        # Appends every element of @currentlines to $FinalVariants, one shell
        # "echo ... >>" per line.  This runs once per input file and writes the
        # whole array each time, including entries built in earlier iterations
        # of the outer loop.
        for ( $i = 0; $i < @currentlines; $i += 1 ) {

            my $currentguiline = $currentlines[$i];

            # This $cmd5 is a new lexical in the inner block; the output is
            # redirected, so there is nothing to print.
            my $cmd5 = `echo "$currentguiline" >> $FinalVariants`;
            print "$cmd5";

            #my $cmd9 = `sed -i '1i$SampleID[4]' $FinalVariants`; print $cmd9;
        }
    }

Answer 1

没有必要为了这么基本的操作启动这么多新的 shell 子进程。ls、fgrep、grep 和 echo 在 Perl 中都有等价的写法；尤其是对每一行文本都调用一次 echo，是把一个文件的内容复制到另一个文件的非常糟糕的方式。

我怀疑你的问题是因为行

my $cmd5 = `echo "$currentguiline" >> $FinalVariants`;

会把 @currentlines 的每个元素追加到文件的末尾。因此，第一次运行程序时，文件中只有一份结果；但之后每次运行都会在文件末尾继续追加数据，文件会不断增长。

我不喜欢只给出一个“能用就行”的权宜之计（hack），但要弄清楚你的程序在这一团混乱背后究竟做了什么、并写出一个简洁的版本，需要花我很多时间。你可以在 foreach ( @tsvfiles ) { ... } 循环之前添加下面这一行来临时修复它：

unlink $FinalVariants or die $!;

这会删除该文件，并确保程序每次执行时都会创建一个新的文件。

好的，我仔细研究了你的代码，我认为下面的程序能实现你想要的功能。由于没有任何数据样本，甚至没有文件名样本，我除了确认它能通过编译之外无法测试它；所以如果它第一次就能正常工作，那将是个奇迹。不过我相信，这是你得到一个条理清晰的解决方案的最好机会。

请注意，你的代码有一个问题：其中使用了 $refreverse，却从未声明或定义它。因此你贴出的代码不可能产生你所描述的问题，因为它在编译阶段就会终止，并给出如下错误消息：

Global symbol "$refreverse" requires explicit package name

我猜它对应的是紧跟在 $refforward（索引 67）之后的索引 68。

请告诉我这段程序的运行效果如何。

#!/usr/bin/perl

# Builds final_variant_file.txt for one run directory.
#
# Usage: pass the run directory name as the only argument.
#
# Every FOCUS*.tsv file in /data/test_all_runs/<run> is read directly in Perl,
# without ls/fgrep/grep/echo subprocesses.  A line is skipped when it starts
# with "#", when it contains one of the words CNV, intronic, synonymous, utr_3
# or utr_5, or when its column 70 is exactly 0/0 or ./.  For every remaining
# line the sample ID followed by the selected columns is written, tab
# separated, to the output file.
#
# The output file is opened once with '>', so each run of the program starts
# from an empty file instead of appending to the previous run's results.

use strict;
use warnings 'all';

@ARGV or die "Usage: $0 <run-directory-name>\n";

my $home          = "/data";
my $tsv_directory = "$home/test_all_runs/$ARGV[0]";

my $final_variants = "$tsv_directory/final_variant_file.txt";

# Zero-based, tab-delimited column indices copied to the output.  Index 68 is
# a guess for the column the question's undeclared $refreverse was meant to
# hold.  Built once here because it does not change from line to line.
my @wanted = ( 9, 10, 21, 67, 68, 70, 77, 78, 81, 83, 84, 88, 92, 98, 100 );

open my $out_fh, '>', $final_variants
        or die qq{Unable to open "$final_variants" for output: $!};

my @tsv_files = glob "$tsv_directory/FOCUS*.tsv";

for my $tsv_file ( @tsv_files ) {

    print "The current VCF is ############# $tsv_file\n";

    # The sample ID is the file name without its "-oncogene.tsv" suffix.
    $tsv_file =~ m{([^/]+)-oncogene\.tsv\z} or die "Cant extract Sample ID";
    my $sample_id = $1;
    print "The sample ID is ############## $sample_id\n";

    open my $in_fh, '<', $tsv_file
            or die qq{Unable to open "$tsv_file" for input: $!};

    while ( my $line = <$in_fh> ) {

        next if $line =~ /^#/;
        next if $line =~ /\b(?:CNV|intronic|synonymous|utr_3|utr_5)\b/;

        # Split on tabs only and keep empty fields (limit -1): the column
        # indices refer to tab-delimited columns, so splitting on arbitrary
        # whitespace would shift them whenever a field is empty or contains
        # a space.
        $line =~ s/\r?\n\z//;
        my @fields = split /\t/, $line, -1;

        my $genotype = defined $fields[70] ? $fields[70] : '';
        next if $genotype eq '0/0' or $genotype eq './.';

        # Columns missing from a short line are written as empty strings.
        my @columns = map { defined $_ ? $_ : '' } @fields[@wanted];

        # The sample ID is the first output column, as in the question's
        # $currentline.
        print $out_fh join( "\t", $sample_id, @columns ), "\n";
    }

    close $in_fh;
}

# Buffered write errors only surface when the handle is closed.
close $out_fh
        or die qq{Unable to close "$final_variants": $!};

每个循环的重复输出

1 个答案: