从一个目录中的多个pdb文件中解析和提取信息

时间:2013-12-19 07:44:31

标签: perl parsing bioinformatics

我的任务是从pdb文件中提取数据以进行分析。需要提取的数据包括HEADER(头部)记录、TITLE(标题)记录、原子的x、y、z坐标以及氨基酸的相关信息。

我已经编写了一个程序,可以读取单个pdb文件并把所需的信息写入一个新的输出文件。我的主要问题是:我需要一个脚本,能够读取目录中的500多个pdb文件并收集所需的信息。

这是我原始代码的一部分。它从单个用户输入的pdb文件中提取信息。我希望为目录中的多个文件实现相同的功能。

# Extract HEADER/TITLE records, backbone-atom coordinates and the amino-acid
# composition from PDB files.
#
# Usage:
#   perl SCRIPT                    prompt for a single input file
#   perl SCRIPT a.pdb b.pdb ...    process every file named on the command line
#
# For every input file a report called "coordinates_<input file name>" is
# written to the current directory.  Inputs from different directories that
# share a file name therefore overwrite each other's report.

use strict;
use warnings;
use File::Basename qw(basename);

# The 20 standard amino acids in report order: [ residue code, report label ].
my @AMINO_ACIDS = (
    [ ALA => 'Alanine' ],
    [ ARG => 'Arginine' ],
    [ ASN => 'Asparagine' ],
    [ ASP => 'Aspartic Acid' ],
    [ CYS => 'Cysteine' ],
    [ GLU => 'Glutamic Acid' ],
    [ GLN => 'Glutamine' ],
    [ GLY => 'Glycine' ],
    [ HIS => 'Histidine' ],
    [ ILE => 'Isoleucine' ],
    [ LEU => 'Leucine' ],
    [ LYS => 'Lysine' ],
    [ MET => 'Methionine' ],
    [ PHE => 'Phenylalanine' ],
    [ PRO => 'Proline' ],
    [ SER => 'Serine' ],
    [ THR => 'Threonine' ],
    [ TRP => 'Tryptophan' ],
    [ TYR => 'Tyrosine' ],
    [ VAL => 'Valine' ],
);

# Backbone atoms whose coordinates are written to the report.
my %IS_BACKBONE_ATOM = map { $_ => 1 } qw(N CA C);

# Whitespace-separated ATOM record.  Anchored and case-sensitive so that
# HETATM records and free text that merely contains "atom" are not counted.
# NOTE(review): strictly fixed-column files whose numeric fields touch each
# other will not match; that limitation is inherited from the original pattern.
my $ATOM_RE = qr/
    ^ATOM \s+ \d+ \s+
    (\w+)       \s+     # atom name
    (\w{3})     \s+     # residue name
    .+          \s+     # chain id, residue number
    (\S+\.\S+)  \s+     # x
    (\S+\.\S+)  \s+     # y
    (\S+\.\S+)  \s+     # z
    .+\..+\..+          # occupancy and temperature factor
/x;

# Horizontal rule used around the composition table.
my $RULE = "--------------------------------------------------\n";

# Read a file and return a reference to its lines with line endings removed.
# Dies with a message containing the system error if the file cannot be opened.
sub read_pdb_lines {
    my ($input_file) = @_;

    # Three-argument open: the user-supplied name is never parsed for a mode
    # or a pipe character.
    open my $in, '<', $input_file
        or die "Cannot read from '$input_file': $!\n";
    my @lines = <$in>;
    close $in;

    s/\r?\n\z// for @lines;    # tolerate CRLF files as well
    return \@lines;
}

# Parse PDB lines (array reference) and return a hash reference with:
#   header   - text of the last HEADER record, undef if there is none
#   titles   - array ref of TITLE record texts, in file order
#   atoms    - array ref of "RES\t\tX\t\tY\t\tZ" strings, one per N, CA or C atom
#   count_of - hash ref: residue code => number of its backbone atoms
#              (all 20 standard codes are present, starting at 0)
sub parse_pdb_lines {
    my ($lines) = @_;

    my $header;
    my @titles;
    my @atoms;
    my %count_of = map { $_->[0] => 0 } @AMINO_ACIDS;

    for my $line (@{$lines}) {
        if ($line =~ /^HEADER\s+(.*?)$/) {
            $header = $1;
            next;
        }
        if ($line =~ /^TITLE\s+(.*?)$/) {
            push @titles, $1;
            next;
        }

        my ($atom, $residue, $x, $y, $z) = $line =~ $ATOM_RE
            or next;
        next unless $IS_BACKBONE_ATOM{$atom};

        push @atoms, join "\t\t", $residue, $x, $y, $z;

        # Non-standard residues stay in the atom list (and the total) but
        # have no row of their own, as before.
        $count_of{$residue}++ if exists $count_of{$residue};
    }

    return {
        header   => $header,
        titles   => \@titles,
        atoms    => \@atoms,
        count_of => \%count_of,
    };
}

# Build the complete report text from a record returned by parse_pdb_lines().
# Percentages are relative to the number of reported backbone atoms and are 0
# when there are none (previously a division by zero).
sub format_report {
    my ($record) = @_;

    my $total = scalar @{ $record->{atoms} };
    my @out;

    push @out, (defined $record->{header} ? $record->{header} : ''), "\n";
    push @out, map { "$_\n" } @{ $record->{titles} };

    # The original printed a never-assigned $title here, i.e. an empty line;
    # the empty line is kept so the report layout does not change.
    push @out, "\n";
    push @out, "\n";

    push @out, "Amino acid\tX coordinate\tY Coordinate\tZ Coordinate\n";
    push @out, map { "$_\n" } @{ $record->{atoms} };
    push @out, "\n";

    push @out, "Amino acid\tTotal Number(N)\tPercentage(%)\n";
    push @out, $RULE;
    for my $amino_acid (@AMINO_ACIDS) {
        my ($code, $label) = @{$amino_acid};
        my $count      = $record->{count_of}{$code};
        my $percentage = $total ? $count / $total * 100 : 0;

        # Labels shorter than one tab stop need a second tab to line up.
        my $padding = length($label) < 8 ? "\t\t" : "\t";
        push @out, $label, $padding, $count, "\t\t", $percentage, "\n";
    }
    push @out, $RULE;
    push @out, "Total\t\t", $total, "\t\t", "100\n";
    push @out, "\n", "\n";

    return join '', @out;
}

# Write the report for $record to $output_file; dies if the file cannot be
# created or written (buffered write errors surface at close).
sub write_report {
    my ($output_file, $record) = @_;

    open my $out, '>', $output_file
        or die "Cannot write to '$output_file': $!\n";
    print {$out} format_report($record)
        or die "Cannot write to '$output_file': $!\n";
    close $out
        or die "Cannot write to '$output_file': $!\n";
    return;
}

# Parse the lines of one input file, echo its HEADER and TITLE texts to
# STDOUT, write the report and return the report's file name.
sub report_pdb_file {
    my ($input_file, $lines) = @_;

    my $record = parse_pdb_lines($lines);
    print "$record->{header}\n" if defined $record->{header};
    print "$_\n" for @{ $record->{titles} };

    # Only the file-name part is used, so an input given with a directory
    # still yields a valid output name.
    my $output_file = 'coordinates_' . basename($input_file);
    write_report($output_file, $record);
    return $output_file;
}

# Entry point.  With file arguments, process each of them and keep going
# after a failure; without arguments, prompt for one file as before.
sub main {
    my @input_files = @_;

    if (!@input_files) {
        # input file query
        print "\nEnter the input file: ";
        my $input_file = <STDIN>;
        $input_file = '' unless defined $input_file;
        chomp $input_file;

        my $lines = eval { read_pdb_lines($input_file) };
        if (!$lines) {
            print "Cannot read from '$input_file'.\nProgram closing.\n";
            <STDIN>;    # wait for Enter so a console window stays open
            return;
        }

        my $output_file = report_pdb_file($input_file, $lines);
        print "The coordinates of '$input_file' were saved into '$output_file'.\n";
        return;
    }

    for my $input_file (@input_files) {
        my $output_file = eval {
            report_pdb_file($input_file, read_pdb_lines($input_file));
        };
        if (!defined $output_file) {
            warn $@;
            next;
        }
        print "The coordinates of '$input_file' were saved into '$output_file'.\n";
    }
    return;
}

# Run only when executed as a script, so the subs can also be loaded from
# another program (require/do) without triggering the prompt.
unless (caller) {
    main(@ARGV);
    exit;
}

1;

以下是数据样本

ATOM 1 N HIS A 14 -18.662 -3.949 15.643 1.00 0.00 A N
ATOM 2 CA HIS A 14 -17.604 -3.118 14.975 1.00 0.00 A C
ATOM 3 C HIS A 14 -16.660 -2.473 15.984 1.00 0.00 A C
ATOM 4 O HIS A 14 -15.693 -1.830 15.625 1.00 0.00 A O
ATOM 5 CB HIS A 14 -18.252 -1.994 14.105 1.00 0.00 A C

2 个答案:

答案 0 :(得分:-1)

只需添加:

use File::Find;

# Directories to scan; list as many as needed.
my @directories_to_search = ('/path/a');

# find() walks every listed directory recursively and calls process_files
# once for each entry it meets.
find(\&process_files, @directories_to_search);

# Callback for File::Find::find().  It is called for directories as well as
# for plain files, so anything that is not a regular file is skipped first.
sub process_files {
    # $_ is the current filename within that directory
    return unless -f $_;

    # To restrict processing to one extension, add for example:
    #   return unless /\.pdb\z/i;

    # ... Place your single file code here ...
    # The pasted code must take its file name from $_ (no STDIN prompt), must
    # start from fresh counters and hashes for every file, and must not call
    # exit, otherwise only the first file is processed.
}

答案 1 :(得分:-1)

这是我想到的第一个解决方案,不需要使用额外的模块。

opendir(my $dh, $some_dir) or die "Cannot open directory '$some_dir': $!";

# Read the directory entries one by one.  readdir returns bare names without
# the directory part, so the path to open is built explicitly; opening
# $input_file alone only works when $some_dir is the current directory.
while (defined(my $input_file = readdir $dh)) {
   my $input_path = "$some_dir/$input_file";

   # Skips ".", ".." and sub-directories: only regular files are processed.
   next unless -f $input_path;

   # HERE you have one file: open $input_path; $input_file is its bare name
   # PASTE your above code here
   # The pasted code must not prompt on STDIN or call exit, and must start
   # from fresh counters and hashes for every file.
}
closedir $dh;