
时间:2016-06-22 21:36:00

标签: perl




    #!/usr/bin/perl -w

    use strict;

    #directory structure

    my $home         = "/data/";
    my $tsvdirectory = $home . "test_all_runs/" . $ARGV[0];
    my $tsvfiles     = $home . "test_all_runs/" . $ARGV[0] . "/tsv_files.txt";

    my $FinalVariants = $home . "test_all_runs/" . $ARGV[0] . "/FinalVariantfile.txt";

    my @tsvfiles        = ();
    my @currentlines    = ();
    my $currentline     = '';
    my $currentCNVline  = '';
    my @currentCNVlines = ();
    my @HotSpotLines    = ();
    my @CNVLines        = ();

    # command to produce the tsv_files.txt file stored in each individual run
    # directory; the file list includes solely tsv files which have not been
    # previously prepared and/or annotated
    my $cmd = `ls $tsvdirectory/FOCUS*\.tsv > $tsvfiles`;

    # print "$cmd";
    my $cmda = "ls $tsvdirectory/FOCUS*\.tsv > $tsvfiles";

    # print "$cmda";

    # this code opens the tsv_files.txt file and passes each line into an array for
    # individual manipulation
    open( TXT2, "$tsvfiles" );
    while ( <TXT2> ) {
        push( @tsvfiles, $_ );

    foreach ( @tsvfiles ) {

    # this code then parses each of the files listed by name in the tsvfiles array
    foreach ( @tsvfiles ) {

        my $currenttsvfile = "$_";    # establishes the current file being manipulated

        my $MDLfinaltsvfile = $currenttsvfile;
        $MDLfinaltsvfile =~ s/\.tsv/_prepared\.txt/g;

        # this series of variable calls names the various intermediate or
        # final output files

        my $MDLlinestsvfile = $currenttsvfile;
        $MDLlinestsvfile =~ s/\.tsv/_withCNV\.txt/g;

        my $Variantlinestsvfile = $currenttsvfile;
        $Variantlinestsvfile =~ s/\.tsv/_HotSpot\.txt/g;

        my $MDLtsvfile = $currenttsvfile;
        $MDLtsvfile =~ s/\.tsv/_FilteredAllcolumns\.txt/g;

        my $MDLsampleid = $currenttsvfile;
        $MDLsampleid =~ s/\-oncogene.tsv//g;
        print "The currentVCFis############# " . $currenttsvfile . "\n";

        my @SampleID = ();
        @SampleID = split /\//, $MDLsampleid;
        print "The sampleIDis##############" . $SampleID[4] . "\n";

        my $CNVdata = $currenttsvfile;
        $CNVdata =~ s/\.tsv/_cnv\.txt/g;

        my $FinalCNVdata = $currenttsvfile;
        $FinalCNVdata =~ s/\.tsv/_finalcnv\.txt/g;

        my $cmd2 = `fgrep -v "#" $currenttsvfile > $MDLlinestsvfile`;
        print "$cmd2";    # this code extracts from the current vcf file all of the
                          # lines of data and outputs them into a separate file

        my $cmd5 = `grep -vwE "(CNV|intronic|synonymous|utr_3|utr_5)" 
#removes lines that contain CNV/intronic/synonymous/utr_3/utr_5"

$MDLlinestsvfile > $Variantlinestsvfile`;
        print "$cmd5";

        open( my $fh_in, '<', $Variantlinestsvfile )
                or die "cannot open $Variantlinestsvfile: $!\n"; 
#removes lines that contain 0/0 and ./. genotypes from field 70.

        open( my $fh_out, '>', $MDLtsvfile )
                or die "cannot open $MDLtsvfile: $!\n";

        while ( my $line = <$fh_in> ) {

            # tab/field-based:
            my @fields = split( /\s+/, $line );
            print $fh_out $line unless ( $fields[70] =~ m|([0.])/\1| );

        #open each filtered file with all columns and pushes it into array.
        open( TXT2, "$MDLtsvfile" );
        while (<TXT2>) {
            push( @HotSpotLines, $_ );

        foreach (@HotSpotLines) {

            my @HotSpotEntries = ();
            my $currentMDLline = $_;
            @HotSpotEntries = split( /\t/, $currentMDLline );

            my $chr        = $HotSpotEntries[9];
            my $position   = $HotSpotEntries[10];
            my $cosmicids  = $HotSpotEntries[21];
            my $refforward = $HotSpotEntries[67];
            my $genotype   = $HotSpotEntries[70];
            my $altforward = $HotSpotEntries[77];
            my $altreverse = $HotSpotEntries[78];
            my $cDNA       = $HotSpotEntries[81];
            my $exon       = $HotSpotEntries[83];
            my $conseq     = $HotSpotEntries[84];
            my $location   = $HotSpotEntries[88];
            my $geneclass  = $HotSpotEntries[92];
            my $aachange   = $HotSpotEntries[98];
            my $transcript = $HotSpotEntries[100];

                    = $SampleID[4] . "\t"
                    . $chr . "\t"
                    . $position . "\t"
                    . $cosmicids . "\t"
                    . $refforward . "\t"
                    . $refreverse . "\t"
                    . $genotype . "\t"
                    . $altforward . "\t"
                    . $altreverse . "\t"
                    . $cDNA . "\t"
                    . $exon . "\t"
                    . $conseq . "\t"
                    . $location . "\t"
                    . $geneclass . "\t"
                    . $aachange . "\t"
                    . $transcript;

            # print "The currentVCFlineis ".$currentline."\n";
            push( @currentlines, $currentline );


        my $i;

        for ( $i = 0; $i < @currentlines; $i += 1 ) {

            my $currentguiline = $currentlines[$i];

            my $cmd5 = `echo "$currentguiline" >> $FinalVariants`;
            print "$cmd5";

            #my $cmd9 = `sed -i '1i$SampleID[4]' $FinalVariants`; print $cmd9;

1 个答案:

答案 0 :(得分:3)

没有必要启动这么多新的 shell 子进程来执行这样的基本操作。`ls`、`fgrep`、`grep` 和 `echo` 在 Perl 中都有等价的写法;尤其是对每一行文本都调用一次 `echo`,是把一个文件的内容复制到另一个文件的非常糟糕的方式。


my $cmd5 = `echo "$currentguiline" >> $FinalVariants`;

这条语句会把 `@currentlines` 的每个元素追加到文件的末尾。因此,第一次运行程序时,文件中只包含一份结果;但之后每次运行都只会在文件末尾继续添加数据,文件会不断增长。


unlink $FinalVariants or die $!;
最简单的解决办法是把上面这一行加在 `foreach ( @tsvfiles ) { ... }` 循环之前,这样每次运行时都会先删除旧的输出文件。


好的,我仔细研究了你的代码,我认为下面的程序能实现你想要的功能。由于没有任何数据样本,甚至连文件名示例都没有,我除了确认它能够通过编译之外无法对它进行测试,所以如果它第一次运行就能正常工作,那将是个奇迹;但我相信这是你得到一个连贯解决方案的最好机会。


注意:你的原始代码使用了 `$refreverse`,却从未声明或赋值,因此在 `use strict` 下根本无法通过编译(下面的代码中以第 68 个字段代替它),报错信息为:Global symbol "$refreverse" requires explicit package name




use strict;
use warnings 'all';

my $home          = "/data";
my $tsv_directory = "$home/test_all_runs/$ARGV[0]";

my $final_variants = "$tsv_directory/final_variant_file.txt";

open my $out_fh, '>', $final_variants
        or die qq{Unable to open "$final_variants" for output: $!};

my @tsv_files = glob "$tsv_directory/FOCUS*.tsv";

for my $tsv_file ( @tsv_files ) {

    print "The current VCF is ############# $tsv_file\n";

    $tsv_file =~ m|([^/]+)-oncogene.tsv$| or die "Cant extract Sample ID";
    my $sample_id = $1;
    print "The sample ID is ############## $sample_id\n";

    open my $in_fh, '<', $tsv_file
            or die qq{Unable to open "$tsv_file" for input: $!};

    while ( <$in_fh> ) {

        next if /^#/;
        next if /\b(?:CNV|intronic|synonymous|utr_3|utr_5)\b/;

        my @fields = split;
        next if $fields[70] eq '0/0' or $fields[70] eq './.';

        my @wanted = ( 9, 10, 21, 67, 68, 70, 77, 78, 81, 83, 84, 88, 92, 98, 100 );
        my $current_line = join "\t", @fields[@wanted];

        print $out_fh $current_line, "\n";