大家好,谢谢你的帮助!!!
我有一个分类分配列表(qiime列表),我想使用txt文件中的列表提取细菌列表;我有这个代码,提出了2个问题,一个是分类文件有两行以#开头(qiime列表),第一个(#bions文件中的#constructed)是我要避免的那个,第二个一个(#OTU ID ..... samples)是我用作每列名称的那个,我尝试了类似的东西
if ($_=~ m/Constructed from biom file/){ next; }
在代码的不同位置,但它只是不起作用(但是当我从文件中删除该行时,它运行良好),但重点是我不想操纵文件,即为什么我要避免那条线。
第二个问题(代码的一部分,我不知道如何制作),脚本有-s选项,这意味着如果我想要一个特定的列,脚本只提取那些我指出(例如:-s sample1,sample2,sample..n),问题是有些时候所选样本在任何给定行的所有列中都有cero(0)的值(xRow 0.0 0.0 0.0),其中在必须避免的情况下,在脚本中我使用$ val [1],但它仅用于一个样本,但如果它的2个或更多个样本它必须避免如果在所有行中存在cero(在这种情况下为3个样本)。
脚本:
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper qw(Dumper);
use List::MoreUtils qw(uniq);
use List::Util qw(sum);
my ($search_label, $infile_taxon, $infile_list, $output_file);
GetOptions (
'i=s' =>\$infile_taxon,
'l=s' =>\$infile_list,
's=s' =>\$search_label,
'o=s' =>\$output_file,
);
sub match_genera {
my ($List_File, $Taxon_File) = @_;
my @lista_genera = @{ $List_File }; # dereferencing and copying each array
my @taxon_qiime = @{ $Taxon_File };
my (@extract);
foreach (@lista_genera){
my $unit = $_;
chomp $unit;
my @match = grep (/$unit/, @taxon_qiime);
push (@extract, @match);
}
return @extract;
}
open INFILE_TAXONOMY, '<', "$infile_taxon" or die $!;
open LIST_BACTERIA, '<', "$infile_list" or die $!;
open OUTPUT, '>', "xfile2.txt" or die $!;
chomp ( my @sample_names = split '\t', <INFILE_TAXONOMY> );
shift @sample_names;
unshift @sample_names, '#Genera';
my (@ToExtract, @no_match, @filter, @filter_columns);
# si ingresamos un -s
if ($search_label){
my @wanted= split (/\,/, $search_label);
unshift @wanted, '#Genera';
@wanted = uniq (@wanted);
foreach my $wanted_in (@wanted){
if (my @match = grep (/$wanted_in/, @sample_names)) {
push (@ToExtract, @match);
}
else {
push (@no_match, $wanted_in);
}
}
if (grep {defined($_)} @no_match){
print "\nSamples No Found: @no_match\n\n";
}
print OUTPUT join "\t", @ToExtract, "\n";
}
# si queremos toda la tabla !!
else{
@ToExtract = @sample_names;
print OUTPUT join "\t", @ToExtract, "\n";
}
# Extraer los varoles de cada linea para cada variable !!!
#my %row;
while ( <INFILE_TAXONOMY> ){
my %row;
@row{@sample_names} = split "\t";
@filter= join "\t", @row{@ToExtract};
push (@filter_columns, @filter); # ahora ya se puede usar con la
}
my @list;
foreach (<LIST_BACTERIA>){
if ($_=~ m/^#|^$/) { next; }
else { push @list, $_ }
}
my @filter_list = uniq (@list);
my @last = match_genera (\@filter_list, \@filter_columns);
my (@genera_taxon, @genera_final);
foreach (@last){
@genera_taxon = ($_ =~ m/;D_5__(\w.*)/g);
foreach (@genera_taxon){
if ($_ =~ m/^$/g) { next; }
else { push @genera_final, $_; }
}
}
foreach (@genera_final){ #split the line in a multiples var !!!
chomp;
my @val = split (/\t/, $_);
foreach ($val[1]){
if ($_ == 0){ next; }
else{print join "\t", @val,"\n";}
}
}
close INFILE_TAXONOMY;
close LIST_BACTERIA;
close OUTPUT;
exit;
taxonomic文件(qiime列表)是制表符分隔文本!!!:
#Constructed from biom file
#OTU ID sample1 sample2 sample3
D_0__Bacteria;D_1__Acidobacteria;D_2__Holophagae;D_3__Subgroup 10;D_4__ABS-19;D_5__uncultured bacterium 0.002804 0.0073441109 0.0
D_0__Bacteria;D_1__Acidobacteria;D_2__Holophagae;D_3__Subgroup 10;D_4__CA002;D_5__uncultured bacterium 0.0 0.001109 0.0
D_0__Bacteria;D_1__Acidobacteria;D_2__Holophagae;D_3__Subgroup 10;D_4__Sva0725;D_5__uncultured bacterium 0.0 0.00882217 0.0014038202
D_0__Bacteria;D_1__Acidobacteria;D_2__Holophagae;D_3__Subgroup 7;D_4__uncultured bacterium;D_5__ 0.0 0.0 0.00898876404
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 13;Ambiguous_taxa;D_4__;D_5__ 0.0 0.0 0.00140449438202
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 2;D_3__uncultured bacterium;D_4__;D_5__ 0.0 0.0 0.00280898876404
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 21;D_3__uncultured bacterium;D_4__;D_5__ 0.0 0.0 0.00421348314607
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 22;D_3__uncultured bacterium;D_4__;D_5__ 0.0 0.0 0.00421348314607
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 22;D_3__uncultured prokaryote;D_4__;D_5__ 0.0 0.0 0.0014038202
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 25;D_3__uncultured Acidobacteria bacterium;D_4__;D_5__ 0.0012041933 0.0 0.0
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 5;D_3__uncultured bacterium;D_4__;D_5__ 0.00120401933 0.0 0.0
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 6;D_3__uncultured Acidobacteria bacterium;D_4__;D_5__ 0.0 0.00115473441109 0.0
D_0__Bacteria;D_1__Acidobacteria;D_2__Subgroup 6;D_3__uncultured bacterium;D_4__;D_5__ 0.00180614087899 0.0 0.00280898876404
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__OM1 clade;D_5__uncultured actinobacterium 0.0 0.0 0.00140449438202
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__OM1 clade;D_5__uncultured bacterium 0.0 0.0 0.00561797752809
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__Sva0996 marine group;D_5__uncultured bacterium 0.0 0.0 0.00280898876404
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__uncultured;D_5__uncultured actinobacterium 0.00301023479831 0.00115473441109 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Acidimicrobiia;D_3__Acidimicrobiales;D_4__uncultured;D_5__uncultured bacterium 0.000602059663 0.001173441109 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Dietziaceae;D_5__Dietzia 0.0150511739916 0.0311778290993 0.00140449438202
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Mycobacteriaceae;D_5__Mycobacterium 0.00240818865 0.002309882217 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Nocardiaceae;D_5__Gordonia 0.0 0.0 0.00140449438202
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Nocardiaceae;D_5__Rhodococcus 0.00240865 0.0013441109 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Nocardiaceae;D_5__Williamsia 0.0 0.0 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__Tsukamurellaceae;D_5__Tsukamurella 0.000020463 0.0 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Corynebacteriales;D_4__nbr16a11;D_5__uncultured bacterium 0.0014093 0.001134411 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Kineosporiales;D_4__Kineosporiaceae;D_5__Quadrisphaera 0.0 0.0014734 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Demequinaceae;D_5__Lysinimicrobium 0.00120409391933 0.0 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Intrasporangiaceae;D_5__Ornithinimicrobium 0.0006959663 0.0 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Intrasporangiaceae;D_5__Tetrasphaera 0.0 0.00441109 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Micrococcaceae;D_5__Glutamicibacter 0.0 0.0031408776 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micrococcales;D_4__Micrococcaceae;D_5__Pseudarthrobacter 0.0 0.002882217 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Micromonosporales;D_4__Micromonosporaceae;D_5__Actinoplanes 0.0 0.0011441109 0.0
D_0__Bacteria;D_1__Actinobacteria;D_2__Actinobacteria;D_3__Propionibacteriales;D_4__Propionibacteriaceae;D_5__Propionibacterium 0.3479831 0.0882217 0.00280898876404
列表:
#list
Gordonia
Mycobacterium
Ornithinimicrobium
Marinobacter
Pseudoalteromonas
Pseudomonas
Halomonas
Alcanivorax
Acinetobacter
Shewanella
Pseudidiomarina
Microbulbifer
Bacillus
Microbacterium
Achrornobacter
Actinomyces
Alcaligenes
打印出预期(制表符分隔):
#genera sample1 sample3
Gordonia 0.00301023479831 0.00140449438202
Mycobacterium 0.00240818783865 0.0
Ornithinimicrobium 0.000602046959663 0.0
Pseudomonas 0.367850692354 0.254213483146
Halomonas 0.000602046959663 0.00140449438202
Acinetobacter 0.00301023479831 0.00561797752809
Bacillus 0.0626128838049 0.00280898876404
Klebsiella 0.0138470800722 0.00280898876404
Lactobacillus 0.000602046959663 0.0
Acinetobacter 0.00301023479831 0.00561797752809
Gordonia 0.00301023479831 0.00140449438202
Rhodococcus 0.00240818783865 0.0
Williamsia 0.000602046959663 0.0
Streptomyces 0.000602046959663 0.0
Dietzia 0.0150511739916 0.00140449438202
Aquabacterium 0.000602046959663 0.0
Janthinobacterium 0.0180614087899 0.0294943820225
Massilia 0.000602046959663 0.00140449438202
Noviherbaspirillum 0.000602046959663 0.0
Rhodococcus 0.00240818783865 0.0
Staphylococcus 0.166164960867 0.0688202247191
Haemophilus 0.00120409391933 0.00280898876404
Stenotrophomonas 0.000602046959663 0.00140449438202
Candidatus Endomicrobium 0.00662251655629 0.0
Candidatus Hepatincola 0.000602046959663 0.0
最后一个,我尝试使代码更简单,在新脚本中我只是尝试编辑部分代码,尝试使用@newarray而不是&lt; INFILE_TAXONOMY&gt;为了选择列,在新数组中,列表几乎完成了我想要的(如预期的输出),它只是选择想要的列,如何使用数组生成下一段代码。
while ( <INFILE_TAXONOMY> ){
my %row;
@row{@sample_names} = split "\t";
@filter= join "\t", @row{@wanted};
push (@filter_columns, @filter);
}
新脚本没有选择特定列的选项!!!
use strict;
use warnings;
use List::MoreUtils qw(uniq);
use Data::Dumper qw(Dumper);
#---------------------------Subrutina de extraccion para 2 arrays de lista de bacterias y taxones!!-------------------------------------------------------------
sub match_genera {
my ($List_File, $Taxon_File) = @_;
my @taxon_qiime = @{ $Taxon_File };
my @lista_genera = @{ $List_File }; # dereferencing and copying each array
my (@match, @extract, @genera_clean);
foreach (@taxon_qiime){
my @generas_taxon = ($_ =~ m/;D_5__(\w.*)/g);
foreach (@generas_taxon){
if ($_ =~ m/^$/g) { next; }
else { push @genera_clean, $_; }
}
}
foreach (@lista_genera){
my $list_unit = $_;
chomp $list_unit;
@match = grep (/$list_unit/, @genera_clean);
push (@extract, @match);
}
return @extract;
}
#------------------------------------------------------FILES-----------------------------------------------------------------------
open INFILE_TAXONOMY, '<', "otu_table_L6_copy.txt" or die $!;
open LIST_BACTERIA, '<', "lista_degradadoras.txt" or die $!;
my (@lista_bacteria, @taxon, @sample_names);
# -------------------------------------------------------------TAXON -------------------------------------------------------------
foreach (<INFILE_TAXONOMY>){
chomp;
if ($_=~ m/^$|Constructed from biom file/g) { next; }
elsif ($_ =~ s/OTU ID/Genera/g) { push @sample_names, $_; }
else { push (@taxon, $_); }
}
# -------------------------------------------------------------LIST -------------------------------------------------------------
foreach (<LIST_BACTERIA>){
if ($_=~ m/^$|^#/g) { next; }
else { push @lista_bacteria, $_ }
}
my @filter_list = uniq (@lista_bacteria);
# -------------------------------------------------------------------------------------------------------------------------------
my @match_all = match_genera (\@filter_list, \@taxon);
unshift @match_all, @sample_names;
@wanted= qw(sample1 sample3);
foreach (@match_all){
print "$_\n";
}
close INFILE_TAXONOMY;
close LIST_BACTERIA;
exit;
答案 0 :(得分:0)
替换
chomp ( my @sample_names = split '\t', <INFILE_TAXONOMY> );
与
<INFILE_TAXONOMY>; # Skip first line.
chomp ( my @sample_names = split '\t', <INFILE_TAXONOMY> );
或
my $sample_names;
while ($sample_names = <INFILE_TAXONOMY>) {
chomp($sample_names);
last if $sample_names !~ /Constructed from biom file/;
}
defined($sample_names)
or die("Premature EOF");
my @sample_names = split /\t/, $sample_names;