我的任务是从 pdb 文件中提取数据以进行分析。数据包括 HEADER 记录、TITLE 记录、x、y、z 坐标以及有关氨基酸的信息。
我已经编写了一个程序,可以读取 pdb 文件并将所需的信息写入新的输出文件。我的主要问题是:我需要一个脚本,能够读取目录中的 500 多个 pdb 文件并收集所需的信息。
这是我原始代码的一部分。它从单个用户输入的pdb文件中提取信息。我希望为目录中的多个文件实现相同的功能。
# Extract the HEADER/TITLE records and the backbone (N, CA, C) atom coordinates
# from one or more PDB files, and write a per-file report named
# "coordinates_<input file name>" next to each input file.
#
# Usage:
#   perl script.pl                  # prompts for a single input file (or directory)
#   perl script.pl a.pdb b.pdb      # processes every file given
#   perl script.pl /dir/with/pdbs   # processes every *.pdb file in the directory
use strict;
use warnings;
use File::Basename qw(basename dirname);
use File::Spec;

# Three-letter residue codes in report order, each paired with the exact row
# label (including its tab padding) used in the summary table.
my @AMINO_ACIDS = (
    [ 'ALA', "Alanine\t\t" ],
    [ 'ARG', "Arginine\t" ],
    [ 'ASN', "Asparagine\t" ],
    [ 'ASP', "Aspartic Acid\t" ],
    [ 'CYS', "Cysteine\t" ],
    [ 'GLU', "Glutamic Acid\t" ],
    [ 'GLN', "Glutamine\t" ],
    [ 'GLY', "Glycine\t\t" ],
    [ 'HIS', "Histidine\t" ],
    [ 'ILE', "Isoleucine\t" ],
    [ 'LEU', "Leucine\t\t" ],
    [ 'LYS', "Lysine\t\t" ],
    [ 'MET', "Methionine\t" ],
    [ 'PHE', "Phenylalanine\t" ],
    [ 'PRO', "Proline\t\t" ],
    [ 'SER', "Serine\t\t" ],
    [ 'THR', "Threonine\t" ],
    [ 'TRP', "Tryptophan\t" ],
    [ 'TYR', "Tyrosine\t" ],
    [ 'VAL', "Valine\t\t" ],
);

# ATOM record matcher. Captures: atom name, residue code, x, y, z.
# Anchored at the start of the line so that text which merely mentions ATOM
# is not mistaken for a coordinate record.
# NOTE(review): PDB is a fixed-column format; this whitespace-based pattern is
# kept from the original script and may miss records whose columns run together.
my $ATOM_RECORD = qr{
    ^ATOM \s+ \d+ \s+
    (\w+)    \s+          # atom name
    (\w{3})  \s+          # residue code
    .+       \s+          # chain id and residue number
    (\S+\.\S+) \s+        # x coordinate
    (\S+\.\S+) \s+        # y coordinate
    (\S+\.\S+) \s+        # z coordinate
    .+ \. .+ \. .+        # two more decimal fields must follow
}xi;

# Build the report path: same directory as the input, name prefixed with
# "coordinates_". A bare file name yields exactly "coordinates_<name>".
sub output_name_for {
    my ($input_file) = @_;

    my $name = "coordinates_" . basename($input_file);
    my $dir  = dirname($input_file);
    return $dir eq '.' ? $name : File::Spec->catfile($dir, $name);
}

# Expand one user-supplied path into the list of files to process.
# A directory expands to its *.pdb files (reports from earlier runs, which
# start with "coordinates_", are skipped); anything else is returned as is.
sub expand_input {
    my ($path) = @_;

    return ($path) unless -d $path;

    my $dh;
    unless (opendir($dh, $path)) {
        warn "Cannot open directory '$path': $!\n";
        return;
    }
    my @files = sort
                grep { -f $_ }
                map  { File::Spec->catfile($path, $_) }
                grep { /\.pdb\z/i && !/\Acoordinates_/ } readdir $dh;
    closedir $dh;

    warn "No .pdb files found in '$path'.\n" unless @files;
    return @files;
}

# Parse one PDB file and write its report.
# Returns ($output_file, undef) on success or (undef, $error_message) on failure.
# HEADER and TITLE texts are echoed to STDOUT as they are read.
sub process_pdb_file {
    my ($input_file) = @_;

    open(my $in, '<', $input_file)
        or return (undef, "Cannot read from '$input_file'.");

    my $header = '';
    my @titles;
    my @backbone_rows;
    my %count_of = map { $_->[0] => 0 } @AMINO_ACIDS;

    while (my $record = <$in>) {
        chomp $record;

        if ($record =~ /^HEADER\s+(.*)$/) {
            $header = $1;
            print "$header\n";
        }
        if ($record =~ /^TITLE\s+(.*)$/) {
            push @titles, $1;
            print "$titles[-1]\n";
        }

        next unless $record =~ $ATOM_RECORD;
        my ($atom, $residue, $x, $y, $z) = ($1, $2, $3, $4, $5);

        # Only backbone atoms are reported.
        next unless $atom eq 'N' || $atom eq 'CA' || $atom eq 'C';

        push @backbone_rows, join("\t\t", $residue, $x, $y, $z);

        # NOTE(review): counts are per backbone atom, so a complete residue
        # contributes three; percentages are relative to the same atom total.
        $count_of{$residue}++ if exists $count_of{$residue};
    }
    close $in;

    my $output_file = output_name_for($input_file);
    open(my $out, '>', $output_file)
        or return (undef, "Cannot write to '$output_file': $!");

    print {$out} $header, "\n";
    print {$out} $_, "\n" for @titles;
    # The original printed a never-assigned $title here, i.e. an empty line;
    # the blank line is kept so the report layout does not change.
    print {$out} "\n";
    print {$out} "\n";
    print {$out} "Amino acid\tX coordinate\tY Coordinate\tZ Coordinate\n";
    print {$out} $_, "\n" for @backbone_rows;
    print {$out} "\n";
    print {$out} "Amino acid\tTotal Number(N)\tPercentage(%)\n";
    print {$out} "--------------------------------------------------\n";

    my $total = scalar @backbone_rows;
    for my $entry (@AMINO_ACIDS) {
        my ($code, $label) = @$entry;
        my $number = $count_of{$code};
        # Guard against files with no backbone atoms (division by zero).
        my $percentage = $total ? $number / $total * 100 : 0;
        print {$out} $label, $number, "\t\t", $percentage, "\n";
    }

    print {$out} "--------------------------------------------------\n";
    print {$out} "Total\t\t", $total, "\t\t", ($total ? "100" : "0"), "\n";
    print {$out} "\n";
    print {$out} "\n";

    # Buffered write errors only surface when the handle is closed.
    close($out)
        or return (undef, "Cannot write to '$output_file': $!");

    return ($output_file, undef);
}

# Inputs come from the command line; with no arguments fall back to the
# original interactive prompt.
my @requested   = @ARGV;
my $interactive = !@requested;
if ($interactive) {
    print "\nEnter the input file: ";
    my $answer = <STDIN>;
    $answer = '' unless defined $answer;
    chomp $answer;
    @requested = ($answer);
}

my $failures = 0;
for my $path (@requested) {
    my @input_files = expand_input($path);
    $failures++ unless @input_files;

    for my $input_file (@input_files) {
        my ($output_file, $error) = process_pdb_file($input_file);
        if (defined $error) {
            # Interactive runs keep the original on-screen message; batch
            # runs report on STDERR and carry on with the next file.
            if ($interactive) {
                print "$error\n";
            }
            else {
                warn "$error\n";
            }
            $failures++;
            next;
        }
        print "The coordinates of '$input_file' were saved into '$output_file'.\n";
    }
}

if ($interactive && $failures) {
    # Keep the console window open until the user acknowledges the error.
    print "Program closing.\n";
    <STDIN>;
}

# end the program
exit($failures ? 1 : 0);
以下是数据样本
ATOM 1 N HIS A 14 -18.662 -3.949 15.643 1.00 0.00 A N
ATOM 2 CA HIS A 14 -17.604 -3.118 14.975 1.00 0.00 A C
ATOM 3 C HIS A 14 -16.660 -2.473 15.984 1.00 0.00 A C
ATOM 4 O HIS A 14 -15.693 -1.830 15.625 1.00 0.00 A O
ATOM 5 CB HIS A 14 -18.252 -1.994 14.105 1.00 0.00 A C
答案 0 :(得分:-1)
只需添加:
# Template: list of directories to scan; replace the placeholder entries
# with real paths.
my @directories_to_search = ('/path/a', ...);
# File::Find provides find(), used below.
use File::Find;
# Invoke process_files via find() for the listed directories.
find(\&process_files, @directories_to_search);
# Callback passed to find(); the single-file processing code belongs in its body.
# NOTE(review): per the File::Find documentation the callback is presumably also
# invoked for directories, and the full path is in $File::Find::name -- confirm
# and filter with -f if only plain files should be processed.
sub process_files {
# $_ is the current filename within that directory
... Place your single file code here ...
}
答案 1 :(得分:-1)
这是我想到的第一个解决方案。没有额外的包使用
# Visit every regular file directly inside $some_dir and run the per-file
# code on it. $some_dir must be set before this point.
opendir(my $dh, $some_dir) or die "Cannot open directory '$some_dir': $!";
# this while loop will read all filenames one by one
while (defined(my $entry = readdir $dh)) {
    # readdir returns bare entry names, so prefix the directory; otherwise the
    # per-file code would look for the file in the current working directory.
    my $input_file = "$some_dir/$entry";
    # Skip ".", ".." and any other non-regular entry such as subdirectories.
    next unless -f $input_file;
    # HERE you have one file
    # PASTE your above code here
}
closedir $dh;