这段 Perl 代码处理文件夹中的 txt 文件,其输出子目录中的文件分为头部(header)、正文和 xml 三种。
#!perl -w
use strict;
use utf8;
use File::Copy;
use File::Basename;
# ---------------------------------------------------------------
# Package-level state shared between the subs of this script.
# ---------------------------------------------------------------
# Directories collected recursively by get_files(); iterated by main().
our @folders=();
# Not referenced anywhere in the visible part of this file.
our %errors=();
# Reset per input folder in main(); the code that would fill it in
# go_go_gadget() is currently commented out.
our $page_errors='';
# Output / input folder of the folder currently processed; both are set
# in main() and read by go_go_gadget().
our $folder_out='';
our $folder_in='';
# Derived from the file name in go_go_gadget(); used as an extra output
# sub-directory when $dev is 0.
our $sub_folder="";
# Developer-mode switch (main() sets it to 1): when it is not 0 the output
# is written directly into $folder_out instead of $folder_out\$sub_folder.
our $dev=0;
# Not referenced anywhere in the visible part of this file.
our $anker='#a_';
# Set to 1 by check_linebreak() when the line it just handled contained a
# "/"-coded word split, so the next line gets no line-break marker of its own.
our $coded_lb=0;
# Running line number on the current page; incremented by check_linebreak()
# and reset by go_go_gadget() at every page break.
our $line_cnt=0;
# Looks up the complete file name that belongs to an abbreviated one.
#
# The abbreviated name is split into a non-digit prefix and the following
# run of digits.  ..\..\complete_filenames.txt is then searched for the
# first line of the form  <prefix><letters, "_" or "-"><digits>  and that
# line is returned.  Empty lines and lines containing "Datei" are skipped.
#
# Parameters: $_[0]  abbreviated file name
# Returns:    the matching line from the list, or the argument unchanged
#             when it has no prefix+digits shape, the list cannot be
#             opened, or no line matches.
sub get_complete_filename
{
    my $return = $_[0];

    # Test the match itself: checking the truth of $1/$2 would reject the
    # number "0" and could read captures left over from an earlier match.
    return $return unless ($return =~ m/([^\d]+)(\d+)/);
    my $name   = $1;
    my $number = $2;

    my $list = "..\\..\\complete_filenames.txt";
    my $in;
    if (!open($in, "<:encoding(utf-8)", $list))
    {
        # Best effort: without the list the name is simply left as it is.
        warn "konnte \"$list\" nicht oeffnen: $!\n";
        return $return;
    }

    # A lexical line variable keeps the caller's $_ intact.
    while (my $line = <$in>)
    {
        chomp($line);
        next if ($line eq '' || $line =~ m/Datei/);
        # \Q...\E: the prefix may contain regex metacharacters.
        if ($line =~ m/\Q$name\E[a-zA-Z_-]+\Q$number\E/)
        {
            $return = $line;
            last;
        }
    }
    close $in;
    return $return;
}
# Recursively collects every sub-directory below the given directory.
#
# Each directory found is appended to the package array @folders (parents
# before their children); plain files are ignored.  Paths are built with a
# backslash separator.
#
# Parameters: $_[0]  directory to scan
# Dies when the directory cannot be opened.
sub get_files
{
    my ($dir) = @_;

    opendir(my $dh, $dir) || die "Unable to open $dir: $!";
    my @entries = grep { $_ !~ /^\.\.?$/ } readdir($dh);
    closedir($dh);

    for my $entry (@entries)
    {
        my $path = "$dir\\$entry";
        next unless (-d $path);
        push(@folders, $path);
        get_files($path);
    }
}
# Writes the TEI header file (<name>.teih) that belongs to one converted text.
#
# If <folder_in>\<name>.teih exists, everything from the line containing
# <teiHeader> up to and including the line containing </teiHeader> is copied
# to the output file.  Otherwise an empty header skeleton with a reminder
# comment is written.
#
# The output goes to <folder_out>\<name>.teih, or to
# <folder_out>\$sub_folder\<name>.teih when the global $dev is 0.
#
# Parameters: $_[0]  base file name without extension
#             $_[1]  input folder
#             $_[2]  output folder
# Returns:    the XML declaration line, which the caller puts in front of
#             the text part.
# Dies when the input header cannot be read or the output cannot be written
# (previously such failures were silently ignored and the header was lost).
sub header
{
    my ($fn, $folder_in, $folder_out) = @_;

    my $source = $folder_in."\\".$fn.".teih";
    my $target = $folder_out.($dev eq 0 ? "\\".$sub_folder : "")."\\".$fn.".teih";
    my $output = "";

    if (-e $source)
    {
        open(my $in, "<:encoding(utf-8)", $source)
            or die "konnte die datei \"$source\" nicht oeffnen: $!\n";
        my $input = do { local $/; <$in> };
        close $in;

        my $read = 0;
        foreach my $line (split(/[\n\r]/, $input))
        {
            if ($line =~ m/<\/teiHeader>/)
            {
                # The closing line is copied even if no opening tag was seen.
                $output .= $line."\n";
                last;
            }
            if ($read eq 1 || $line =~ m/<teiHeader>/)
            {
                $read = 1;
                $output .= $line."\n";
            }
        }
    }
    else
    {
        $output = "\n\t<!--\n\t copy of the main_header or empty header!!!\n\t please update its content\n\t-->\n".
            join("\n",
                '<teiHeader>',
                '<fileDesc>',
                '<titleStmt>',
                '<title/>',
                '<respStmt>',
                '<resp/>',
                '<name/>',
                '</respStmt>',
                '</titleStmt>',
                '<publicationStmt>',
                '<distributor/>',
                '</publicationStmt>',
                '<sourceDesc>',
                '<bibl/>',
                '</sourceDesc>',
                '</fileDesc>',
                '</teiHeader>');
    }

    open(my $out, ">:encoding(utf-8)", $target)
        or die "konnte die ausgabedatei \"$target\" nicht oeffnen: $!\n";
    print {$out} $output;
    # Buffered write errors only show up when the handle is closed.
    close($out)
        or die "konnte die ausgabedatei \"$target\" nicht schreiben: $!\n";

    return '<?xml version="1.0" encoding="utf-8"?>';
}
# Adds the line-break placeholders for one text line.
#
# Increments the global $line_cnt.  A "/" between two runs of Georgian
# letters (or hyphens) is replaced by a TRENNENDERZB placeholder carrying the
# number of the following line, and a blank is appended to the line.  Unless
# the previous call saw such a split ($coded_lb), the line is prefixed with a
# NORMALERZB placeholder carrying its own number.  $coded_lb is updated for
# the next call.
#
# Parameters: $_[0]  text of the line
#             $_[1]  pending page-break placeholder(s), put in front
# Returns:    the prefixed line
sub check_linebreak
{
    my ($line, $pg) = @_;

    $line_cnt++;
    my $previous_was_split = $coded_lb;
    my $is_split = 0;

    if ($line=~m/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ\-]+)\/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ\-]+[[:punct:]]*)/)
    {
        my ($head, $tail) = ($1, $2);
        my $marker = "TRENNENDERZBTRENNENDERZB".($line_cnt+1)."TRENNENDERZBTRENNENDERZB";
        $line =~ s/\Q$head\E\/\Q$tail\E/$head$marker$tail/;
        $line .= " ";
        $is_split = 1;
    }
    $coded_lb = $is_split;

    # The break belonging to this line was already coded inside the previous one.
    return $pg.$line unless ($previous_was_split eq 0);

    return $pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB ".$line;
}
# First pass over the complete raw text of one input file.
#
# Normalises the hand-typed markup before the text is processed line by line:
# structural markers (#p page, #c chapter, "|" shorthands), quotation marks,
# the single-letter name tags <a> <t> <h> <z> <e>, editorial brackets and
# footnotes, and the "$" codes for old Georgian letters.  The order of the
# substitutions matters - later ones rely on earlier ones.
#
# Parameters: $_[0]  whole file content as one string
# Returns:    the normalised text
sub anfangs_verarbeitung
{
    my $tmp = $_[0];
    # Georgian letters recognised by the script.
    my $ka = 'აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ';

    $tmp =~ s/^\x{FEFF}//; # removes BOM

    # --- structural markers ---
    # pull a "----" separator up onto the "#" that precedes it
    $tmp =~ s/#(?:(?:\r\n)|\n|\r)+(\-{2,})/#$1\n/mg;
    # NOTE(review): the next two patterns are anchored with ^/$ but have no /m,
    # so they only apply at the very start/end of the whole text - confirm
    # whether per-line matching was intended.
    $tmp =~ s/^p\s*$//g;
    $tmp =~ s/^\s*([pP]\d+)\s*([cC]\s*[0-9IVX]+)/$1\n$2/g;
    # "#|p" -> "#p", "|p" -> "#p"
    $tmp =~ s/(?<=#)\|(?=[pctPCT])//g;
    $tmp =~ s/\|(?=[pctPCT])/#/g;
    # page/chapter markers without a leading "#" or "|" get a "#"
    $tmp =~ s/(?<![\|#])([pcPC]\s*[\dIVXMC]+)+/#$1/g;
    # "|12" is shorthand for "#p12"
    $tmp =~ s/\|(\d+)/#p$1/g;

    # --- quotation marks: » in front of a Georgian letter, « elsewhere ---
    $tmp =~ s/«(?=[$ka])/»/g;
    $tmp =~ s/»(?![$ka])/«/g;
    $tmp =~ s/<<(?=[$ka])/»/g;
    $tmp =~ s/>>(?![$ka])/«/g;
    $tmp =~ s/(?:„|,,|")([$ka]+)«/»$1«/g;
    $tmp =~ s/»([$ka]+)[“"']/»$1«/g;

    # year-only line becomes a dateline marker
    # NOTE(review): no /m here either, so this only fires when the whole text
    # is such a line - confirm.
    $tmp =~ s/^(\d{4}\s*წ\.)\s*$/#d $1/g;

    # --- repair mistyped single-letter tags (a, t, h, z, e) ---
    # NOTE(review): as written this replaces "<pol>" with itself; the pattern
    # presumably once contained look-alike characters - confirm.
    $tmp =~ s/<pol>/<pol>/g;
    $tmp =~ s/<ა>/<a>/g;                                 # Georgian letter typed instead of Latin a
    $tmp =~ s/<\?([athze])>/<\/$1>/g;                    # <?a>
    $tmp =~ s/[<>]\/([athze])(?![<>])/<\/$1>/g;          # </a  >/a
    $tmp =~ s/<([athze])\/>/<\/$1>/g;                    # <a/>
    $tmp =~ s/[<>]\/([athze])[<>]/<\/$1>/g;              # >/a> etc.
    $tmp =~ s/[<>]([athze])[<>]/<$1>/g;                  # >a< etc.
    $tmp =~ s/<([athze])(?![<>])/<$1>/g;                 # <a  >a
    $tmp =~ s/(?<=[^><\/#])([athze])[<>]/<$1>/g;         # a<  a>
    $tmp =~ s/(?<=[^><])\/([athze])[<>]/<\/$1>/g;        # /a<  /a>

    # Merge two political terms joined by a hyphen into one element.
    # The tag name must be matched as the word "pol"; the former character
    # class [pol] matched <p>, <o> or <l> and therefore never fired.
    $tmp =~ s/<(pol)>([^<]+)<\/\1>-<\1>([^<]+)<\/\1>/<$1>$2-$3<\/$1>/g;

    $tmp =~ s/<\/</</g;
    $tmp =~ s/#{2,}/#/g;

    # --- editorial brackets ---
    $tmp =~ s/\(\/\/\?([^\)]*)\)/<unclear>$1<\/unclear>/gm;   # (//?text)
    $tmp =~ s/<unclear><\/unclear>/<unclear\/>/g;
    $tmp =~ s/\(\/\/([^\)]+)\)/<corr>$1<\/corr>/gm;           # (//text)

    # --- footnotes: <s1>..</s1> is the anchor, (s1 text) the note ---
    # The page number is added to the ids later, line by line.
    $tmp =~ s/<[sS](\d+)>/<ref target="a$1" type="noteAnchor">/g;
    $tmp =~ s/<\/[sS]\d+>/<\/ref>/g;
    $tmp =~ s/\([sS](\d+)=?\s*([^\)]+)\)/<note xml:id="a$1" type="footnote">$2<\/note>\n/gm;
    $tmp =~ s/\n{1,}<\/note>/<\/note>/gm;

    # --- "$" codes for old letters; "ვი$" must be handled before "ი$"/"ვ$" ---
    $tmp=~s/ვი\$/ჳ/g;
    $tmp=~s/ხ\$/ჴ/g;
    $tmp=~s/ე\$/ჱ/g;
    $tmp=~s/ი\$/ჲ/g;
    $tmp=~s/ფ\$/ჶ/g;
    $tmp=~s/ვ\$/უ/g;
    $tmp=~s/ო\$/ჵ/g;

    # "#..." is an alternative spelling of the "#-----" separator
    $tmp=~s/#\.{2,}/#\-\-\-\-\-\-\-\-\-\-\-\-\-\-/g;
    return $tmp;
}
# Final pass over the assembled body of one text.
#
# Turns the working tags into TEI elements (<pol> -> <term>, <t>/<z>/<h>/<e>/
# <u> -> <name type="...">), tidies whitespace and punctuation, expands the
# line-break and page-break placeholders into <lb/> and <pb/>, and marks
# letter-spaced words ("ა ბ გ ") as <hi rend="letter-spacing">.  The order of
# the substitutions matters.
#
# Parameters: $_[0]  body text
#             $_[1]  base file name; names matching 04/07/11.1857 or
#                    04/08.1858 are excluded from letter-spacing detection
#             $_[2]  folder in which 000_sperrschrift.txt (a log of all
#                    detected letter-spaced words) is appended to
# Returns:    the finished body text
sub end_verarbeitung
{
    my ($tmp, $fn, $log_folder) = @_;
    # Georgian letters recognised by the script.
    my $ka = 'აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ';

    # --- empty elements and working tags ---
    $tmp =~ s/[\n\r]{2,}/\n/g;
    $tmp =~ s/<p>\s+/<p>/g;
    $tmp =~ s/<\/p>\s+/<\/p>/g;
    $tmp =~ s/<p><\/p>//g;
    $tmp =~ s/<div><p><div type="dateline">/<div type="dateline">/g;
    $tmp =~ s/<p><div type="dateline">/<div type="dateline">/g;
    $tmp =~ s/<pol>([^<]+)<\/pol>/<term type="political">$1<\/term>/g;
    $tmp =~ s/<term type="political"> ([^<]+)<\/name>/ <term type="political">$1<\/term>/g;
    $tmp =~ s/<a><name/<name/g;
    $tmp =~ s/<t>([^<]+)<\/t>/<name type="toponym">$1<\/name>/g;
    $tmp =~ s/<z>([^<]+)<\/z>/<name type="zoonym">$1<\/name>/g;
    $tmp =~ s/<h>([^<]+)<\/h>/<name type="hydronym">$1<\/name>/g;
    $tmp =~ s/<e>([^<]+)<\/e>/<name type="ethnonym">$1<\/name>/g;
    $tmp =~ s/<u>([^<]+)<?\/u>/<name type="unknown">$1<\/name>/g;

    # --- punctuation and whitespace ---
    $tmp =~ s/\s+([\.:,!\?\)])/$1/g;
    $tmp =~ s/(\()\s+/$1/g;
    $tmp=~s/<p>#<\/p>//g;
    $tmp=~s/<div><\/div>//g;
    $tmp=~s/\.\s+\./\.\./g;
    # NOTE(review): (?!<\.) is a look-ahead for the literal "<."; a look-behind
    # (?<!\.) may have been intended.  The following steps still reduce any
    # run of dots to a single ellipsis.
    $tmp=~s/\.\.(?!<\.)/\.\.\./g;
    $tmp=~s/\.\.\./…/g;
    $tmp=~s/…\s*\./…/g;
    $tmp=~s/ +([,\.…;:!\?])/$1/g;
    $tmp=~s/-/–/g;
    $tmp=~s/,–/, –/g;
    $tmp=~s/([\.:,!\?\)])–/$1 -/g;
    $tmp=~s/\. </\.</g;
    $tmp=~s/xml: id/xml:id/g;
    $tmp=~s/#-{2,}//g;
    $tmp=~s/<p><\/p>//g;
    $tmp=~s/\s*<\/p><p>/<\/p>\n\t\t\t\t<p>/g;
    $tmp=~s/ +/ /g;
    $tmp=~s/<\/p><lg>/<\/p>\n\t\t\t\t<lg>/g;
    $tmp=~s/<\/p><\/div>/<\/p>\n\t\t\t\<\/div>/g;
    # blanks belong outside of <name> elements
    $tmp=~s/(<name[^>]*>) +/ $1/g;
    $tmp=~s/([^$ka]*) +<\/name>/<\/name>$1 /g;
    $tmp=~s/…\s*<\/name>/<\/name>…/g;
    $tmp=~s/,\s*\./\./g;
    $tmp=~s/ +/ /g;

    # --- placeholders written by go_go_gadget / check_linebreak ---
    $tmp=~s/NORMALERZBNORMALERZB(\d+)NORMALERZBNORMALERZB/\n\t\t\t\t\t<lb n="$1"\/> /g;
    $tmp=~s/TRENNENDERZBTRENNENDERZB(\d+)TRENNENDERZBTRENNENDERZB/<lb n="$1"\/>/g;
    $tmp=~s/PAGE PAGE PAGE PAGE PAGE(\d+)PAGE PAGE PAGE PAGE PAGE/<pb n="$1"\/>/g;
    $tmp=~s/<\/p>(<pb n="\d+"\/>)/<\/p>\n\t\t\t\t$1/g;
    $tmp=~s/ (<pb n="\d+"\/>)/$1/g;
    $tmp=~s/<\/p>[\r\n]+\s+<p>(<pb n="\d+"\/>)<\/p>/$1<\/p>/g;
    $tmp=~s/<\/l>(<pb n="\d+"\/>)/$1<\/l>/g;
    $tmp=~s/ +/ /g;
    $tmp=~s/<a><name/<name/g;
    $tmp=~s/<head><\/head>//;

    # --- letter-spaced words ---
    if ($fn!~m/(?:04|07|11).1857/ && $fn!~m/(?:04|08).1858/)
    {
        # Three or more "letter + one non-letter" pairs, not preceded by a letter.
        my $spaced = qr/(?<![$ka])((?:[$ka][^$ka~\–\-]){3,})/;
        my $sperr = "";

        # Builds the replacement for one run and records it for the log.
        my $mark = sub
        {
            my $edit = $_[0];
            $edit =~ s/ //g;
            $sperr .= $edit."\n";
            my $rest = " ";
            # Trailing non-letters move behind the element; a following "<"
            # (start of a tag) gets no extra blank.
            if ($edit =~ s/([^$ka]+)$//)
            {
                $rest = ($1 ne "<" ? " " : "").$1;
            }
            return '<hi rend="letter-spacing">'.$edit.'</hi>'.$rest;
        };

        # One left-to-right pass.  The former loop searched the whole text
        # again after every replacement; that was quadratic, and it never
        # ended when a run was separated by characters other than blanks
        # (e.g. "ა;ბ;გ;"), because the text it had just inserted matched again.
        $tmp =~ s/$spaced/$mark->($1)/ge;

        if ($sperr ne "")
        {
            my $log_name = $log_folder."\\000_sperrschrift.txt";
            if (open(my $log, ">>:encoding(utf-8)", $log_name))
            {
                print {$log} $fn."\n\t".$sperr;
                close $log;
            }
            else
            {
                # The log is only a by-product; do not abort the conversion.
                warn "konnte \"$log_name\" nicht oeffnen: $!\n";
            }
        }
    }

    # a full stop right after a one-character name belongs to the name
    $tmp=~s/>([^<])<\/name>\./>$1\.<\/name>/g;
    return $tmp;
}
# Converts one marked-up .txt file into its TEI files.
#
# The file is read completely, normalised by anfangs_verarbeitung() and then
# processed line by line.  Recognised line types:
#   #c N title   chapter (only if the text contains chapter markers at all)
#   #p N         page break
#   #t title     section title
#   #v text      verse line
#   #--          paragraph separator
#   dateline     a year ("1857 წ."), a year range or "day month year"
#   other        running text of the current paragraph
# The assembled body is finished by end_verarbeitung() and written to
# <name>.txml; header() writes <name>.teih and a small <name>.xml wrapper
# includes both as entities.  Files whose name contains "instruqcia" are
# skipped.
#
# Reads the globals $folder_in, $folder_out and $dev; sets $sub_folder,
# $line_cnt and $coded_lb.
#
# Parameters: $_[0]  path of the input file
# Dies when the input file cannot be read or an output file cannot be written.
sub go_go_gadget
{
    my $file_xml = $_[0];
    # Georgian letters recognised by the script.
    my $ka = 'აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ';

    my ($fn) = fileparse($file_xml);
    return if ($fn=~m/instruqcia/);
    print "\tkonvertiere $fn\n";

    # --- output name and sub-folder are derived from the input name ---
    $fn=~s/\.txt//g;
    $fn=~s/(\d+)_/$1+/;
    $fn=~s/_/-/g; $fn=~s/^([a-z]+)-/$1_/g;
    $sub_folder="";
    if ($fn=~m/^([^_]+_[a-zA-Z]+)/)
    {
        $sub_folder=$1;
    }

    # --- slurp the whole input file ---
    $file_xml=~s/(?:\/|\\+)/\\/g;
    open(my $in, "<:encoding(utf-8)", $file_xml)
        or die "konnte die datei nicht oeffnen: $!\n";
    my $input_xml = do { local $/; <$in> };
    close $in;

    # -----------------------------------------
    $input_xml=anfangs_verarbeitung($input_xml);
    # -----------------------------------------

    # --- state of the line-by-line conversion ---
    my $chapter=0;          # a chapter <div> is open
    my $div=0;              # a section <div> is open
    my $p=0;                # a <p> is open
    my $v=0;                # a verse group <lg> is open
    my $ut=0;               # a comment <note> ("|ut") is open
    my $written=0;          # text was written since the last separator
    my $started=0;          # the current div already has content
    my $page=0;             # number of the current page
    my $pg='';              # page-break placeholders waiting for the next text
    my $group_cnt=0;
    my $verse_cnt=0;
    my $section_cnt=0;
    my $chapter_cnt=0;
    my $chapter_type="Section";
    my $has_chapters=0;
    $line_cnt=0;
    $coded_lb=0;

    my @lines=split(/\n/,$input_xml);
    $has_chapters=1 if ($input_xml=~m/#\s*[cC]\s*[\dIVXMC]+[\–\-]?[\dIVXMC]*/);
    my $output='<div type="Content" n="1">'."\n";

    foreach my $line (@lines)
    {
        $line=~s/^L\s*//;
        chomp($line); $line=~s/\n//g; $line=~s/\r//g; $line=~s/(\s){2,}/$1/g; $line =~ s/^\s+//g; $line =~ s/\s+$//g;

        if ($has_chapters eq 1 && $line =~ m/^\s*#?\s*\|?[cC]\s*(\d+)\s*(.*)/) # chapter
        {
            my ($number, $title) = ($1, $2);
            $output.="</note>" if ($ut == 1);
            if ($p eq 1) { $output.="</p>"; }
            elsif ($v eq 1) { $output.="\n\t\t\t\t</lg>"; }
            if ($started eq 1 && ($div eq 1 || $chapter eq 1))
            {
                $output.="\n\t\t\t</div>";
            }
            if ($title)
            {
                # Same rewrite as for body lines.  The pattern used to look
                # for target="#a..", which anfangs_verarbeitung never emits,
                # so footnote anchors in chapter titles were left unqualified.
                $title=~s/<ref target="a(\d+)" type="noteAnchor">/<ref target="#a_$page\_$1" type="noteAnchor">/g;
            }
            $chapter_cnt=$number;
            $output.="\n\t\t\t".'<div type="Chapter" n="'.$chapter_cnt.'">'."\n\t\t\t\t<head>".($title?check_linebreak($title,$pg):$pg)."</head>";
            $chapter=1; $chapter_type="Chapter";
            $p=0; $written=0; $v=0; $ut=0;
            $started=0;
            $pg='';
        }
        elsif ($line =~ m/^\s*#\s*[pP]\s*(\d+)/) # page break
        {
            my $current_page=$1;
            if ($v ne 1 && $p eq 0)
            {
                if ($div == 0)
                {
                    if ($chapter_type eq "Section" || $has_chapters == 0) { $section_cnt++; $output.='<div type="Section" n="'.$section_cnt.'">'; }
                    else { $output.='<div type="Chapter" n="'.$chapter_cnt.'">'; }
                    $div=1;
                }
                $output.="<p>"; $p=1;
            }
            # The placeholder is emitted together with the next piece of text.
            $pg.="PAGE PAGE PAGE PAGE PAGE".$current_page."PAGE PAGE PAGE PAGE PAGE";
            $page=$current_page;
            $line_cnt=0;
        }
        elsif ($line =~ m/\s*#[tT]\s*(.+)/) # title
        {
            my $head=$1;
            $output.="</note>" if ($ut == 1);
            if ($p eq 1) { $output.="</p>"; }
            elsif ($v eq 1) { $output.="\n\t\t\t\t</lg>"; }
            # NOTE(review): $chapter_type only ever holds "Section" or
            # "Chapter", so the comparison with 'chapter' is always true.
            if (($chapter eq 1 || $div eq 1) && $chapter_type ne 'chapter')
            {
                $section_cnt++;
                $output.=($started eq 1 ? "\n\t\t\t</div>" : "")."\n\t\t\t".'<div type="Section" n="'.$section_cnt.'">';
            }
            $output.="\n\t\t\t\t<head>".$pg.$head."</head>";
            $pg='';
            $p=0; $written=0; $v=0; $ut=0;
            $started=1;
        }
        elsif ($line =~ m/#v\s*(.+)/) # verse
        {
            my $verse=$1;
            $output.="</note>" if ($ut == 1);
            if ($p eq 1) { $output.="</p>" ;}
            if ($v eq 0) { $group_cnt++; $verse_cnt=0; $output.="\n\t\t\t\t".'<lg n="'.$group_cnt.'">'; }
            $verse_cnt++;
            $line_cnt++;
            $output.="\n\t\t\t\t\t".'<l n="'.$verse_cnt.'">'.$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB".$verse."</l>";
            $p=0;
            $written=1; $v=1;
            $started=1; $pg=''; $ut=0;
        }
        elsif ($line=~m/#\s*\-{2,}/) # paragraph separator
        {
            if ($written eq 1)
            {
                $output.="</note>" if ($ut == 1);
                if ($p eq 1) { $output.="</p>"; $p=0; }
                # NOTE(review): the original had a further branch here that
                # closed an open <lg>, but its condition ($written eq 0) could
                # never hold inside this block, so verse groups are not closed
                # by a separator.
            }
            $written=0; $ut=0;
        }
        elsif ($line =~ m/^(?:#d)?\s*(\d{4}\s*წ\.)$/ || $line=~m/^\s*(\d{4}(?: – \d+\s*წ*\.)?\s*)$/ || $line=~m/^\s*([0-9]+\s*[$ka]+\s*[0-9]+\s*[$ka]*)$/) # dateline
        {
            my $date=$1;
            if ($div eq 1 || $chapter eq 1)
            {
                $output.="</note>" if ($ut == 1);
                if ($p eq 1) { $output.="</p>"; }
                elsif ($v eq 1) { $output.="\n\t\t\t\t</lg>"; }
                $output.="\n\t\t\t</div>";
                $chapter=0; $div=0; $ut=0;
            }
            $line_cnt++;
            $output.="\n\t\t\t\t<div type=\"dateline\"><p>".$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB".$date."</p></div>";
            $p=0; $written=0; $v=0; $pg='';
        }
        elsif ($line!~m/^\s*$/) # running text
        {
            $output.="\n\t\t\t\t</lg>" if ($v eq 1);
            if ($div eq 0 && $chapter eq 0) { $div=1; $section_cnt++; $output.="\n\t\t\t".'<div type="Section" n="'.$section_cnt.'">';}
            if ($p eq 0) { $output.="\n\t\t\t\t<p>"; }
            $line=~s/\s*#\s*//g;
            # --- quotes
            $line=~s/([\.,;\?!:])„/$1“/g;
            if ($line=~m/^\s*„/ && $line!~m/“/ && $line=~m/[\.\?!:]+\s*$/)
            {
                $line.="“";
            }
            else
            {
                $line=~s/„//g;
            }
            $line=~s/“//g if ($line=~m/“/ && $line!~m/„/);
            # --- footnote ids are made unique by the page number
            $line=~s/<ref target="a(\d+)" type="noteAnchor">/<ref target="#a_$page\_$1" type="noteAnchor">/g;
            $line=~s/<note xml:id="a(\d+)" type="footnote">/<note xml:id="a_$page\_$1" type="footnote">/g;
            # --- "|ut" opens a comment note that is closed by the next structural line
            if ($line=~m/\|ut/)
            {
                $line=~s/\|ut/<note type="comment">/;
                $ut=1;
            }
            $output.=check_linebreak($line,$pg);
            $p=1; $written=1; $v=0; $started=1; $pg='';
        }
    }

    # --- close whatever is still open ---
    if ($p eq 1) { $output.="</p>";}
    elsif ($v eq 1) { $output.="\n\t\t\t\t</lg>"; }
    if ($div eq 1 || $chapter eq 1) { $output.="\n\t\t\t</div>"; }

    # -----------------------------------------
    $output=end_verarbeitung($output,$fn,$folder_out)."</div>";
    # -----------------------------------------

    mkdir($folder_out."\\".$sub_folder,0777) if ($dev eq 0 && !(-d $folder_out."\\".$sub_folder));
    my $txt='<text rend="Section" xml:lang="kat">';
    $txt='<text rend="'.($section_cnt?"Section ":"").'Chapter" xml:lang="kat">' if ($has_chapters eq 1);
    $output=header($fn,$folder_in,$folder_out)."\n\t".$txt."\n<body>\n".$output."\n</body>\n</text>";
    $output=~s/(<body>(?:\r\n)*\s*<pb n="\d+"\/>)(?:\r\n)*\s*<\/div>/$1/g;

    # Both names are built from the same base so that they always agree with
    # the entity declarations below, even if the name itself contains ".xml".
    my $output_base=$folder_out.($dev eq 0 ? "\\".$sub_folder : "")."\\".$fn;

    # --- wrapper file that pulls in header and text as entities ---
    my $output_filename=$output_base.".xml";
    open(my $wrapper, ">:encoding(utf-8)", $output_filename)
        or die "konnte die ausgabedatei \"$output_filename\" nicht oeffnen: $!\n";
    print {$wrapper} join("\n",
        '<?xml version="1.0" encoding="utf-8"?>',
        '<!DOCTYPE TEI [',
        '<!ENTITY header SYSTEM "'.$fn.'.teih">',
        '<!ENTITY text SYSTEM "'.$fn.'.txml">',
        ']>',
        '<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:gnc="http://iness.uib.no/ns/1.0">',
        '&header;',
        '&text;',
        '</TEI>');
    close($wrapper);

    # --- the text itself ---
    $output_filename=$output_base.".txml";
    open(my $text_out, ">:encoding(utf-8)", $output_filename)
        or die "konnte die ausgabedatei \"$output_filename\" nicht oeffnen: $!\n";
    print {$text_out} $output;
    close $text_out;
}
# Entry point: converts every *.txt file below the start folder.
#
# All sub-directories of the start folder are collected with get_files()
# (or the start folder itself if it has none).  For each of them the output
# folder is derived by replacing "0_Eingabe" with "1_Ausgabe", and every
# *.txt file in it is passed to go_go_gadget().
#
# Parameters: $_[0]  optional time limit in seconds for a single file.
#                    When given and greater than 0, a file whose conversion
#                    takes longer is skipped and the run continues with the
#                    next file (output already written for that file is left
#                    as it is).  Without it, or with 0, there is no limit.
#                    The limit relies on alarm(); how reliably a running
#                    conversion is interrupted depends on the platform.
sub main
{
    my $timeout = (@_ && defined $_[0]) ? $_[0] : 0;

    print "\nBeginne...\n";
    my $root="D:\\bla";
    my @startfolders=($root."\\");
    $folder_out="D:\\bla";
    $dev = 1; # developer mode is on when this is 1

    # bsd_glob does not split its pattern at blanks, so folders whose names
    # contain spaces are handled correctly.
    require File::Glob;

    foreach my $startfolder (@startfolders)
    {
        @folders=();
        get_files($startfolder);
        if (scalar(@folders)<1) { push(@folders,$startfolder); }
        $root=~s/0_Eingabe/1_Ausgabe\\1/;

        # $folder_in is the package variable on purpose: go_go_gadget reads it.
        foreach $folder_in (@folders)
        {
            $page_errors='';
            $folder_out=$folder_in;
            $folder_out=~s/0_Eingabe/1_Ausgabe/;
            $folder_out=~s/\\+/\\/g;
            $folder_out=~s/1_Ausgabe.*/1_Ausgabe/ if ($dev eq 1);
            print "Ordner ".$folder_in."\n";

            foreach my $file_xml (File::Glob::bsd_glob("${folder_in}/*.txt"))
            {
                if ($timeout > 0)
                {
                    my $finished = eval
                    {
                        local $SIG{ALRM} = sub { die "timeout\n" };
                        alarm($timeout);
                        go_go_gadget($file_xml);
                        alarm(0);
                        1;
                    };
                    alarm(0);   # also cancel the timer if the conversion died
                    if (!$finished)
                    {
                        # Anything other than our own timeout is a real error.
                        die $@ if ($@ ne "timeout\n");
                        print "\tuebersprungen (laenger als $timeout s): $file_xml\n";
                    }
                }
                else
                {
                    go_go_gadget($file_xml);
                }
            }

            # Report of page-number errors.  Nothing fills $page_errors at the
            # moment, so no file is written; an unconditional "next" used to
            # make this block unreachable.
            if ($page_errors ne '' && $folder_in=~m/0_Eingabe\\(.+)/)
            {
                my $tmp=$1;
                $tmp=~s/\\+/__/g;
                open(my $out, ">:encoding(utf-8)", $root."\\".$tmp.".txt")
                    or die "\n\tPage errors to file ".$tmp.": ".$!."\n";
                print {$out} $page_errors;
                close $out;
            }
        }
    }
    print "Fertig!\n\n";
}
# Pass a number of seconds, e.g. main(6), to skip files that take longer.
main();
但是有些文件的处理时间太长。如果某个文件的转换超过 6 秒,我想跳过它,让脚本直接处理下一个文件。有什么办法可以通过超时机制来实现这一点吗?
答案 0 :(得分:2)
我没有仔细阅读你的代码,所以无法告诉你超时代码具体应该放在哪里,但使用 Time::Out 应该可以轻松实现你想要的效果。只需这样写:
use Time::Out 'timeout';
timeout 6 => sub {
    # code that you want to time out after 6 seconds goes here
};   # the call is a statement and needs its terminating semicolon
if ($@) {
    # $@ is set when the code was interrupted by the timeout;
    # handle the skipped step here (e.g. go on with the next file)
}
这样就可以了。