Question

这段 Perl 代码处理某个文件夹及其子文件夹中的 txt 文件，并为每个文件生成头文件（.teih）、正文（.txml）和 xml 文件。

    #!perl -w

use strict;
use utf8;
use File::Copy;
use File::Basename;

# --- Package-level state shared by the subs below ---------------------------
# Sub-directories collected by get_files(); emptied per start folder in main().
our @folders=();
# Not referenced anywhere else in the visible file.
our %errors=();
# Page-numbering report for the current input folder; the code in
# go_go_gadget() that would append to it is commented out.
our $page_errors='';
# Output folder for the input folder currently being processed (set in main()).
our $folder_out='';
# Input folder currently being processed (loop variable of main()).
our $folder_in='';
# Per-file output sub-folder, derived from the file name in go_go_gadget().
our $sub_folder="";
# Developer-mode flag: when 0, files are written to $folder_out\$sub_folder,
# otherwise directly to $folder_out.
our $dev=0;
# Not referenced anywhere else in the visible file.
our $anker='#a_';
# Set to 1 by check_linebreak() when the line it just handled contained a
# "word/word" split, 0 otherwise; controls whether the next line gets its own
# line-break marker.
our $coded_lb=0;
# Running line counter; reset at the start of each file and at each page marker.
our $line_cnt=0;

# Resolves an abbreviated "<name><number>" identifier to the full file name
# listed in ..\..\complete_filenames.txt.
#
# Parameters:
#   $_[0]  identifier consisting of a non-digit part followed by digits.
# Returns:
#   the first line of the list that contains <name>, one or more of
#   [a-zA-Z_-], then <number>; the unchanged input when the identifier has no
#   such shape, the list cannot be opened, or no line matches.
sub get_complete_filename
{
    my $return = $_[0];

    # Test the match itself instead of the truth of $1/$2: a number of "0" is
    # a valid (but false) capture, and a failed match leaves stale captures.
    return $return unless ($return=~m/([^\d]+)(\d+)/);
    my $name=$1;
    my $number=$2;

    # Best effort: without the list there is nothing to look up.
    open (my $in,"<:encoding(utf-8)","..\\..\\complete_filenames.txt") or return $return;
    while (my $line = <$in>)
    {
        chomp($line);
        # skip blank lines and the heading line containing "Datei"
        next if ($line eq '' || $line=~m/Datei/);
        # \Q..\E keeps metacharacters in the identifier literal and removes
        # the "$name[" array-subscript ambiguity.
        if ($line=~m/\Q$name\E[a-zA-Z_-]+\Q$number\E/)
        {
            $return = $line;
            last;
        }
    }
    close $in;
    return $return;
}

# Recursively collects every sub-directory below a directory into the global
# @folders list (parent before its children).
#
# Parameters:
#   $_[0]  directory to scan; entries are joined with "\" as path separator.
# Dies with "Unable to open <dir>: <reason>" when a directory cannot be read.
sub get_files
{
    my $dir = $_[0];

    # Lexical handle: a bareword DIR would be one package-global handle shared
    # by every level of the recursion.
    opendir(my $dh, $dir) or die "Unable to open $dir: $!";
    my @fl = grep {!/^\.\.?$/ } readdir($dh);   # drop "." and ".."
    closedir($dh);

    foreach my $entry (@fl)
    {
        my $file = "$dir\\$entry";
        if (-d $file)
        {
            push(@folders,$file);
            get_files($file);
        }
    }
}

# Writes the TEI header file "<fn>.teih" for one document.
#
# If "<folder_in>\<fn>.teih" exists, the lines from the one containing
# <teiHeader> up to and including the one containing </teiHeader> are copied;
# otherwise an empty header template is written.  The target is
# "<folder_out>\<fn>.teih", with the global $sub_folder inserted as an extra
# directory level when the global $dev is 0.
#
# Parameters:
#   $_[0]  file name without extension
#   $_[1]  input folder
#   $_[2]  output folder
# Returns:
#   the XML declaration string.
# Dies when the existing header cannot be read or the target cannot be written
# (previously these failures were silently ignored and the output was lost).
sub header
{
    my $fn=$_[0];
    my $folder_in=$_[1];
    my $folder_out=$_[2];

    my $source=$folder_in."\\".$fn.".teih";
    my $target=$folder_out.($dev eq 0 ? "\\".$sub_folder : "")."\\".$fn.".teih";
    my $content="";

    if (-e $source)
    {
        open (my $in,"<:encoding(utf-8)", $source) or die "konnte die datei \"$source\" nicht oeffnen: $!\n";
        my $input = do { local $/; <$in> };
        close $in;

        my $read=0;
        foreach my $line (split(/[\n\r]/,$input))
        {
            if($line=~m/<\/teiHeader>/)
            {
                # closing line is copied too, then we are done
                $content.=$line."\n";
                last;
            }
            elsif ($read == 1 || $line=~m/<teiHeader>/)
            {
                $read=1;
                $content.=$line."\n";
            }
        }
        #copy($folder_in."\\".$fn.".teih", $folder_out."\\".$fn."\\".$fn.".teih"); 
    }
    else
    {
        $content="\n\t<!--\n\t copy of the main_header or empty header!!!\n\t please update its content\n\t-->\n".
'<teiHeader>
    <fileDesc>
        <titleStmt>
            <title/>
            <respStmt>
                <resp/>
                <name/>
            </respStmt>
        </titleStmt>
        <publicationStmt>
            <distributor/>
        </publicationStmt>
        <sourceDesc>
            <bibl/>
        </sourceDesc>
    </fileDesc>
</teiHeader>';
    }

    open (my $out,">:encoding(utf-8)", $target) or die "konnte die ausgabedatei \"$target\" nicht oeffnen: $!\n";
    print $out $content;
    close $out or die "konnte die ausgabedatei \"$target\" nicht schreiben: $!\n";

    return '<?xml version="1.0" encoding="utf-8"?>'; 
}

# Prepares one text line for output by encoding its line breaks as markers.
#
# Parameters:
#   $_[0]  the text line
#   $_[1]  pending page-break marker text that is put in front of the line
# Returns:
#   marker + line; unless the previous line contained a "word/word" split
#   (global $coded_lb), a NORMALERZB marker with the current line number is
#   inserted between them.  The first "word/word" split in the line has its
#   "/" replaced by a TRENNENDERZB marker carrying the next line number.
# Side effects: increments the global $line_cnt and updates $coded_lb.
sub check_linebreak
{
    my ($text, $page_marker) = @_;

    $line_cnt++;

    # The line only gets its own break marker when the previous line did not
    # already announce it through a split word.
    my $prefix = $page_marker;
    $prefix .= "NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB " if ($coded_lb eq 0);

    my $has_split = 0;
    if ($text=~m/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ\-]+)\/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ\-]+[[:punct:]]*)/)
    {
        my ($head, $tail) = ($1, $2);
        my $split_marker = "TRENNENDERZBTRENNENDERZB".($line_cnt+1)."TRENNENDERZBTRENNENDERZB";
        $text=~s/\Q$head\E\/\Q$tail\E/$head$split_marker$tail/;
        $text.=" ";
        $has_split = 1;
    }

    $coded_lb = $has_split;
    return $prefix.$text;
}

# First pass over the complete text of one input file, before it is split
# into lines: repairs markers, quotes and inline tags and turns footnote
# notation into <ref>/<note> elements.
#
# Parameters:
#   $_[0]  the whole file content as one string
# Returns:
#   the normalized string.
# The substitutions are applied in the order written and later ones work on
# the result of earlier ones.
sub anfangs_verarbeitung
{
    my $tmp=$_[0];

    $tmp =~ s/^\x{FEFF}//;  # removes BOM


    # "#" followed by line break(s) and a run of dashes is joined to "#---" plus newline
    $tmp =~ s/#(?:(?:\r\n)|\n|\r)+(\-{2,})/#$1\n/mg;
    # NOTE(review): no /m here, so ^ and $ refer to the whole string, not to
    # single lines - confirm whether per-line matching was intended.
    $tmp =~ s/^p\s*$//g;

    # NOTE(review): also without /m - only a page+chapter marker at the very
    # start of the file is split onto two lines.
    $tmp =~ s/^\s*([pP]\d+)\s*([cC]\s*[0-9IVX]+)/$1\n$2/g;

    # Marker prefixes: "#|x" loses the "|", "|x" becomes "#x", page/chapter
    # markers without "|" or "#" in front get a "#", "|<digits>" becomes "#p<digits>".
    # (The repeated letters inside the character classes are redundant.)
    $tmp =~ s/(?<=#)\|(?=[pppctPCT])//g;
    $tmp =~ s/\|(?=[pppctPCT])/#/g;
    $tmp =~ s/(?<![\|#])([pppcPC]\s*[\dIVXMC]+)+/#$1/g;
    $tmp =~ s/\|(\d+)/#p$1/g;

    # Quote marks: the guillemet directly before a Georgian letter becomes "»",
    # the one not followed by a Georgian letter becomes "«"; "<<" / ">>" and
    # mixed quote characters around a single Georgian word are mapped the same way.
    $tmp =~ s/«(?=[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/»/g;
    $tmp =~ s/»(?![აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/«/g;
    $tmp =~ s/<<(?=[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/»/g;
    $tmp =~ s/>>(?![აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/«/g;
    $tmp =~ s/(?:„|,,|")([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+)«/»$1«/g;
    $tmp =~ s/»([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+)[“"']/»$1«/g;



    # A four-digit year followed by "წ." gets the "#d " prefix.
    # NOTE(review): no /m, so this only fires when the whole text is that one line.
    $tmp =~ s/^(\d{4}\s*წ\.)\s*$/#d $1/g;

    # NOTE(review): pattern and replacement are identical as written, so this
    # is a no-op - possibly a non-Latin look-alike character was lost.
    $tmp =~ s/<pol>/<pol>/g;    

    # Repair of the single-letter inline tags a, t, h, z, e: the Georgian
    # letter in "<ა>" is replaced by Latin "a", then malformed opening and
    # closing forms are rewritten to <x> / </x> (example inputs in the
    # trailing comments).
    $tmp =~ s/<ა>/<a>/g;    
    $tmp =~ s/<\?([athzee])>/<\/$1>/g;                  #<?a>           
    $tmp =~ s/[<>]\/([athzee])(?![<>])/<\/$1>/g;            #</a >/a            
    $tmp =~ s/<([athzee])\/>/<\/$1>/g;                  #<a/>
    $tmp =~ s/[<>]\/([athzee])[<>]/<\/$1>/g;                #>/a> etc.          
    $tmp =~ s/[<>]([athzee])[<>]/<$1>/g;                    #>a< etc.
    $tmp =~ s/<([athzee])(?![<>])/<$1>/g;                   #<a >a          
    $tmp =~ s/(?<=[^><\/#])([athzee])[<>]/<$1>/g;           #a< a>          
    $tmp =~ s/(?<=[^><])\/([athzee])[<>]/<\/$1>/g;      #/a< /a>    

    #$tmp =~ s/<\/([athze])>([^<]+)<\/[^\1]>/<$1>$2<\/$1>/gm;

    ##$tmp =~s/<([athz])>([^<]+)<\/[^(?:$1)]>/<$1>$2<\/$1>/g;   
    ##$tmp =~s/<([athz])>([^<]+)<[^(?:\1)]>/<$1>$2<\/$1>/g;

    # Merges two equal elements joined by "-" into one element.
    # NOTE(review): [pol] is a character class (one of p, o, l), so this acts
    # on <p>, <o>, <l> and not on <pol> - confirm the intent.
    $tmp =~ s/<([pol])>([^<]+)<\/\1>-<\1>([^<]+)<\/\1>/<$1>$2-$3<\/$1>/g;
    #$tmp =~ s/<([athze])>([^<]+)<\/\1>[\-\-]<([athze])>([^<]+)<\/\3>/<$1>$2-$4<\/$1>/g;

    ##$tmp =~ s/([^\s]+)\-<([athz])>([^<]+)<\/\2>/<$2>$1\-$3<\/$2>/g;
    ##$tmp =~ s/<name([^>]+)>([^<]+)<\/name>//g;

    # "</<" is reduced to "<"
    $tmp =~ s/<\/</</g;

    ####$tmp =~ s/<\/(?![athzee])//g;

    # runs of "#" collapse to one
    $tmp =~ s/#{2,}/#/g;

    # "(//?text)" becomes <unclear>text</unclear> (empty: <unclear/>),
    # "(//text)" becomes <corr>text</corr>
    $tmp =~ s/\(\/\/\?([^\)]*)\)/<unclear>$1<\/unclear>/gm;
    $tmp =~ s/<unclear><\/unclear>/<unclear\/>/g;
    $tmp =~ s/\(\/\/([^\)]+)\)/<corr>$1<\/corr>/gm;

    # Footnotes: "<sN>" / "</sN>" become a <ref> pointing at "aN", and
    # "(sN= text)" becomes the matching <note xml:id="aN"> followed by a newline.
    #$tmp =~ s/<s(\d+)>([^<]+)<\/s\1>/$2<ref target="#a$1" type="noteAnchor">$1<\/ref>/gm;
    $tmp =~ s/<[sS](\d+)>/<ref target="a$1" type="noteAnchor">/g;
    $tmp =~ s/<\/[sS]\d+>/<\/ref>/g;
    $tmp =~ s/\([sS](\d+)=?\s*([^\)]+)\)/<note xml:id="a$1" type="footnote">$2<\/note>\n/gm;

    #$tmp =~ s/#f(\d+)\s*(.*)([^#\|]+)/<note xml:id="a$1" type="footnote">$2<\/note>\n\n/gm;
    #$tmp =~ s/\(s\s*(\d+)\s*([^\)]+)\)/<note xml:id="a$1" type="footnote">$2<\/note>\n\n/gm;
    # newlines directly in front of </note> are removed
    $tmp =~ s/\n{1,}<\/note>/<\/note>/gm;

    #$tmp =~ s/\s*#\-{2,}//gm;  

    # "<letter(s)>$" input codes are replaced by a single Georgian letter.
    # The two-letter code comes first so that the one-letter rules below do
    # not consume its second letter.
    $tmp=~s/ვი\$/ჳ/g;
    $tmp=~s/ხ\$/ჴ/g;
    $tmp=~s/ე\$/ჱ/g;
    $tmp=~s/ი\$/ჲ/g;
    $tmp=~s/ფ\$/ჶ/g;
    $tmp=~s/ვ\$/უ/g;
    $tmp=~s/ო\$/ჵ/g;

    # "#" followed by two or more dots becomes "#" plus a fixed run of dashes
    $tmp=~s/#\.{2,}/#\-\-\-\-\-\-\-\-\-\-\-\-\-\-/g;

    return $tmp;
}

# Final pass over the generated markup of one file: tidies whitespace and
# punctuation, maps the short inline tags to <term>/<name> elements, expands
# the NORMALERZB / TRENNENDERZB / PAGE placeholders into <lb/> and <pb/>, and
# wraps letter-spaced runs in <hi rend="letter-spacing">.
#
# Parameters:
#   $_[0]  markup produced by the line loop
#   $_[1]  name of the source file; the letter-spacing step is skipped when it
#          matches (04|07|11).1857 or (04|08).1858; also used as log heading
#   $_[2]  folder in which "000_sperrschrift.txt" (log of all wrapped runs) is
#          appended to
# Returns:
#   the cleaned-up markup.
#
# The letter-spacing step used to be a loop that searched the whole string
# again after each replacement.  When a run contained a separator other than
# a blank (e.g. "x;y;z;"), the text just placed inside <hi>...</hi> matched
# again together with the following "<", so the loop never ended.  It is now a
# single left-to-right pass that never rescans its own output; for every
# input on which the old loop terminated the result is the same.
sub end_verarbeitung
{
    my ($tmp, $source_name, $log_dir) = @_;

    $tmp =~ s/[\n\r]{2,}/\n/g;
    $tmp =~ s/<p>\s+/<p>/g;
    $tmp =~ s/<\/p>\s+/<\/p>/g;
    $tmp =~ s/<p><\/p>//g;
    $tmp =~ s/<div><p><div type="dateline">/<div type="dateline">/g;
    $tmp =~ s/<p><div type="dateline">/<div type="dateline">/g;

    # short inline tags -> TEI elements
    $tmp =~ s/<pol>([^<]+)<\/pol>/<term type="political">$1<\/term>/g;
    $tmp =~ s/<term type="political"> ([^<]+)<\/name>/ <term type="political">$1<\/term>/g;
    $tmp =~ s/<a><name/<name/g;

    $tmp =~ s/<t>([^<]+)<\/t>/<name type="toponym">$1<\/name>/g;

    $tmp =~ s/<z>([^<]+)<\/z>/<name type="zoonym">$1<\/name>/g;         
    $tmp =~ s/<h>([^<]+)<\/h>/<name type="hydronym">$1<\/name>/g;           
    $tmp =~ s/<e>([^<]+)<\/e>/<name type="ethnonym">$1<\/name>/g;           

    #$tmp =~ s/<a>([^<]+)/<name type="anthroponym">$1<\/name>/g;
    #$tmp =~ s/([^>]+)<\/a>/<name type="anthroponym">$1<\/name>/g;

    $tmp =~ s/<u>([^<]+)<?\/u>/<name type="unknown">$1<\/name>/g;

    # no whitespace before closing punctuation / after "("
    $tmp =~ s/\s+([\.:,!\?\)])/$1/g;
    $tmp =~ s/(\()\s+/$1/g;

    $tmp=~s/<p>#<\/p>//g;
    $tmp=~s/<div><\/div>//g;

    # dot runs end up as a single ellipsis character
    # NOTE(review): (?!<\.) is a look-ahead for the text "<."; a look-behind
    # (?<!\.) may have been intended - the following rules clean up the result.
    $tmp=~s/\.\s+\./\.\./g;
    $tmp=~s/\.\.(?!<\.)/\.\.\./g;
    $tmp=~s/\.\.\./…/g;
    $tmp=~s/…\s*\./…/g; 
    $tmp=~s/ +([,\.…;:!\?])/$1/g;
    #$tmp=~s/([,\.…;:!\?])(?!< )/$1 /g;
    # every hyphen becomes an en dash; after punctuation it is turned back
    # into " -"
    $tmp=~s/-/–/g;
    $tmp=~s/,–/, –/g;
    $tmp=~s/([\.:,!\?\)])–/$1 -/g;
    $tmp=~s/\. </\.</g;

    $tmp=~s/xml: id/xml:id/g;

    $tmp=~s/#-{2,}//g;
    $tmp=~s/<p><\/p>//g;
    $tmp=~s/\s*<\/p><p>/<\/p>\n\t\t\t\t<p>/g;
    $tmp=~s/ +/ /g;

    #$tmp =~ s/„([^„“]+)„/„$1“/g;

    #$tmp=~s/<pb n="(\d+)"\/>(?:\r\n)*n*\s*<\/div>/<pb n="$1"\/>/gm;
    #$tmp=~s/<div type="Section">(?:\r\n)*\n*\s*<head>([^<]+)<\/head>/<div type="Section">\n\t\t\t\t<head>$1<\/head>\n\t\t\t\t<\/div>/gm;
    #$tmp=~s/\s*<pb n="(\d+)"\/>(?:\r\n)*\n*\s*<div type="Section">/<div type="Section">\n\t\t\t\t<pb n="$1"\/>/gm;

    $tmp=~s/<\/p><lg>/<\/p>\n\t\t\t\t<lg>/g;
    $tmp=~s/<\/p><\/div>/<\/p>\n\t\t\t\<\/div>/g;

    # blanks are moved out of <name> elements
    $tmp=~s/(<name[^>]*>) +/ $1/g;
    $tmp=~s/([^აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]*) +<\/name>/<\/name>$1 /g;
    $tmp=~s/…\s*<\/name>/<\/name>…/g;
    $tmp=~s/,\s*\./\./g;
    $tmp=~s/ +/ /g;

    # placeholders written by the line loop -> real elements
    $tmp=~s/NORMALERZBNORMALERZB(\d+)NORMALERZBNORMALERZB/\n\t\t\t\t\t<lb n="$1"\/> /g;
    $tmp=~s/TRENNENDERZBTRENNENDERZB(\d+)TRENNENDERZBTRENNENDERZB/<lb n="$1"\/>/g;  
    $tmp=~s/PAGE PAGE PAGE PAGE PAGE(\d+)PAGE PAGE PAGE PAGE PAGE/<pb n="$1"\/>/g;  #//<pb n=\"".$current_page."\"/>";  
    $tmp=~s/<\/p>(<pb n="\d+"\/>)/<\/p>\n\t\t\t\t$1/g;
    $tmp=~s/ (<pb n="\d+"\/>)/$1/g;
    $tmp=~s/<\/p>[\r\n]+\s+<p>(<pb n="\d+"\/>)<\/p>/$1<\/p>/g;

    $tmp=~s/<\/l>(<pb n="\d+"\/>)/$1<\/l>/g;

    $tmp=~s/ +/ /g;

    $tmp=~s/<a><name/<name/g;

    $tmp=~s/<head><\/head>//;

    my $sperr="";   
    if ($source_name!~m/(?:04|07|11).1857/ && $source_name!~m/(?:04|08).1858/)
    {
        # A letter-spaced run is three or more "Georgian letter + one other
        # character" pairs that do not follow a Georgian letter.  Blanks are
        # dropped from the run, trailing non-letters are moved behind </hi>.
        $tmp=~s{(?<![აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])((?:[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ][^აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ~\–\-]){3,})}{
            # copy $1 first: the inner matches below overwrite it
            my $sperr_edit=$1;
            my $rest=" ";
            $sperr_edit=~s/ //g;
            $sperr.=$sperr_edit."\n";
            if ($sperr_edit=~m/([^აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+)$/)
            {
                $rest=($1 ne "<"?" ":"").$1;
                $sperr_edit=~s/[^აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+$//;
            }
            '<hi rend="letter-spacing">'.$sperr_edit.'</hi>'.$rest;
        }ge;

        if ($sperr ne "")
        {
            # The log is a by-product: a failure is reported but does not
            # abort the conversion.
            if (open (my $log,">>:encoding(utf-8)", $log_dir."\\000_sperrschrift.txt"))
            {
                print $log $source_name."\n\t".$sperr;
                close $log;
            }
            else
            {
                warn "konnte 000_sperrschrift.txt nicht oeffnen: $!\n";
            }
        }
    }

    # a full stop after a one-character <name> is pulled inside the element
    $tmp=~s/>([^<])<\/name>\./>$1\.<\/name>/g;

    return $tmp;
}

# Converts one input .txt file into three output files: "<fn>.teih" (header,
# written by header()), "<fn>.txml" (the <text> element) and "<fn>.xml" (a
# small TEI wrapper that pulls both in through entities).
#
# Parameters:
#   $_[0]  path of the .txt file
# Uses the globals $folder_in, $folder_out, $dev, $sub_folder, $line_cnt and
# $coded_lb.  Files whose name contains "instruqcia" are skipped.
#
# The body of the file is processed line by line; the flags below record what
# is currently open in the output:
#   $p        a <p> is open            $v       an <lg> (verse group) is open
#   $div      a section <div> is open  $chapter a chapter <div> is open
#   $ut       a <note type="comment"> is open
#   $pg       page-break placeholder waiting to be put in front of the next text
#   $written  text has been written since the last paragraph separator
#   $started  content has been written since the last chapter heading
sub go_go_gadget
{
    my $file_xml=$_[0];
    my $file_html='';   my $output='';  my $output_filename=''; 
    my $chapter=0;  my $div=0; my $p=0; my $last_p=0;  my $v=0;
    my $input_xml=''; my $chapter_type=''; my $written=0;
    my $page=0; my $started=0;

    (my $fn,my $pn)=fileparse $file_xml;
    return if ($fn=~m/instruqcia/);

    print "\tkonvertiere $fn\n";
    # Output base name: drop ".txt", turn the first "<digits>_" into
    # "<digits>+", every other "_" into "-", and the "-" after a leading
    # lowercase prefix back into "_".
    $fn=~s/\.txt//g;
    $fn=~s/(\d+)_/$1+/;
    $fn=~s/_/-/g;   $fn=~s/^([a-z]+)-/$1_/g;
    # Sub-folder name = leading "<prefix>_<letters>" part of that name.
    $sub_folder="";
    if ($fn=~m/^([^_]+_[a-zA-Z]+)/)
    {   
        $sub_folder=$1;     
    }

    # use single backslashes as path separator
    $file_xml=~s/(?:\/|\\+)/\\/g;

    open (IN,"<:encoding(utf-8)", $file_xml) || die "konnte die datei nicht oeffnen: $!\n"; 
    $input_xml = do { local $/; <IN> } ;                        # slurp the whole input file into one string
    close IN;

    # -----------------------------------------
    $input_xml=anfangs_verarbeitung($input_xml);
    # ----------------------------------------- 

    $div=0;
    my $last_line='';   
    my @lines=split(/\n/,$input_xml);   
    $line_cnt=0;    
    my $group_cnt=0;
    my $verse_cnt=0;
    my $pg='';
    my $first_page=0;
    my $last_page=0;
    my $has_chapters=0;
    my $ut=0;
    my $quote_open=0;
    my $section_cnt=0;
    my $chapter_cnt=0;

    $coded_lb=0;
    $chapter_type="Section";
    # the file has chapters when it contains a "#c<number>" marker
    $has_chapters=1 if ($input_xml=~m/#\s*[cC]\s*[\dIVXMC]+[\–\-]?[\dIVXMC]*/);
    if ($has_chapters eq 0)
    {
        # NOTE(review): this value is overwritten by the assignment to
        # $output right after this block.
        $output='<div type="Section">' ;
        #$chapter=1; 
    }

    $output='<div type="Content" n="1">'."\n";
    foreach my $line (@lines)
    {   
        # a leading "L" (plus following blanks) is dropped from every line
        $line=~s/^L\s*//;

        # strip line ends, squeeze whitespace runs, trim both ends
        chomp($line); $line=~s/\n//g; $line=~s/\r//g;   $line=~s/(\s){2,}/$1/g; $line =~ s/^\s+//g; $line =~ s/\s+$//g;
        #$line=~s/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])([,;\.])([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/$1$2 $3/g;

        # --- "#c <n> <title>": close whatever is open and start a new chapter div
        if ($has_chapters eq 1 && $line =~ m/^\s*#?\s*\|?[cC]\s*(\d+)\s*(.*)/)      # chapter
        {
            $output.="</note>" if ($ut == 1);
            if ($p eq 1) { $output.="</p>";   } 
            elsif ($v eq 1) { $output.="\n\t\t\t\t</lg>"; }
            if ($started eq 1)
            {
                if ($div eq 1) { $output.="\n\t\t\t</div>"; }
                elsif ($chapter eq 1) { $output.="\n\t\t\t</div>"; }
            }

            my $title=$2;
            # NOTE(review): this pattern expects target="#a<n>", while
            # anfangs_verarbeitung() writes target="a<n>" and the text branch
            # below matches it without "#" - confirm whether refs in chapter
            # titles are meant to be rewritten here.
            if ($title) { $title=~s/<ref target="#a(\d+)" type="noteAnchor">/<ref target="#a_$page\_$1" type="noteAnchor">/g; }
            $chapter_cnt=$1;
            $output.="\n\t\t\t".'<div type="Chapter" n="'.$chapter_cnt.'">'."\n\t\t\t\t<head>".($title?check_linebreak($title,$pg):$pg)."</head>";

            $chapter=1; $chapter_type="Chapter"; #$div=0;
            $p=0;  $written=0;  $v=0; $ut=0;
            $last_line=""; $started=0;
            $pg='';
        }
        # --- "#p <n>": remember a page-break placeholder for the next text
        elsif ($line =~ m/^\s*#\s*[pP]\s*(\d+)/)                # page break
        {
            # outside of verse and paragraph a <p> (and, if needed, a div) is opened first
            if ($v ne 1 && $p eq 0) 
            { 
                if ($div == 0)
                {
                    if ($chapter_type eq "Section" || $has_chapters == 0) { $section_cnt++; $output.='<div type="Section" n="'.$section_cnt.'">'; }
                    else { $output.='<div type="Chapter" n="'.$chapter_cnt.'">'; }
                    $div=1;
                }
                $output.="<p>"; $p=1;  
            }
            #$output.="<pb n=\"".$1."\"/>"; 

            # --- detecting page errors
            my $current_page=$1; #0;            
            #if ($first_page > 0)
            #{
            #   $current_page=$1;
            #   if ($current_page-$last_page<1)
            #   {
            #       $current_page=$last_page+1;
            #       $page_errors.=$fn."\t".$last_page."\n";
            #   }               
            #   elsif ($current_page-$last_page>1)
            #   {
            #       $page_errors.=$fn."\t".$last_page."\n";
            #   }               
            #}
            #else
            #{
            #   $first_page=$1;
            #   $current_page=$1;
            #}  
            #$last_page=$current_page;
            # ----

            $pg.="PAGE PAGE PAGE PAGE PAGE".$current_page."PAGE PAGE PAGE PAGE PAGE";   
            #$p=0;
            $page=$1;
            #$written=0; 
            $last_line="";
            # line numbering restarts on every page
            $line_cnt=0;
        }
        # --- "#t <text>": heading, starts a new section div
        elsif ($line =~ m/\s*#[tT]\s*(.+)/)             # title
        {
            $output.="</note>" if ($ut == 1);
            if ($p eq 1) { $output.="</p>"; }
            elsif ($v eq 1) { $output.="\n\t\t\t\t</lg>"; }

            # NOTE(review): $chapter_type is only ever set to "Section" or
            # "Chapter" in this sub, so the comparison with lowercase
            # 'chapter' is always true.
            if (($chapter eq 1 || $div eq 1) && $chapter_type ne 'chapter')
            {
            #   if($chapter_type eq 'chapter') 
            #   {
            #       if ($started eq 1)
            #       {   $output.="\n\t\t\t</div>\n\t\t\t".'<div type="Chapter" n="'.$1.'">';}
            #       else { $output.='<div type="Chapter" n="'.$1.'">';}
            #   }
            #   else
            #   { 
                    if ($started eq 1) { $section_cnt++; $output.="\n\t\t\t</div>\n\t\t\t".'<div type="Section" n="'.$section_cnt.'">'; }       
                    else { $section_cnt++; $output.="\n\t\t\t".'<div type="Section" n="'.$section_cnt.'">'; }
            #   }               
            }
            else
            {
                #$section_cnt++;
                #$output.='<div type="Section" n="'.$section_cnt.'">';
                #$div=1;
            }
            #$line_cnt++;
            $output.="\n\t\t\t\t<head>".$pg.$1."</head>"; 
            $pg='';
            $p=0; $written=0; $v=0; $ut=0;
            $last_line=''; $started=1;
        }
        # --- "#v <text>": one verse line; consecutive verse lines share an <lg>
        elsif ($line =~ m/#v\s*(.+)/)                           # verse
        {
            $output.="</note>" if ($ut == 1);
            if ($p eq 1) { $output.="</p>" ;}
            if ($v eq 0) { $group_cnt++; $verse_cnt=0; $output.="\n\t\t\t\t".'<lg n="'.$group_cnt.'">';  }
            $verse_cnt++;
            $last_line=$1;
            $line_cnt++;            
            $output.="\n\t\t\t\t\t".'<l n="'.$verse_cnt.'">'.$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB".$1."</l>";         
            $p=0; 
            $written=1; $v=1;   
            $started=1; $pg=''; $ut=0;
        }
        # --- "#--": paragraph separator, closes the open paragraph
        elsif ($line=~m/#\s*\-{2,}/) #elsif ($line eq '' && $last_line ne '') # && $last_line!~m/[\.!\?]\s*$/)          # paragraph
        {           
            if ($written eq 1)
            {
                $output.="</note>" if ($ut == 1); 
                if ($p eq 1) { $output.="</p>"; $p=0;  }
                # NOTE(review): inside "if ($written eq 1)" the test
                # "$written eq 0" cannot succeed, so this branch never runs.
                elsif ($v eq 1 && $written eq 0) { $output.="\n\t\t\t</lg>\n\t\t\t\t"; $v=0; }
            }               
            #if ($p eq 0 && $v eq 0) { $output.="\n\t\t\t\t<p>"; $p=1; }

            $written=0; $last_line=''; $ut=0;
        }
        # --- a line consisting only of a date: closes the current div and
        #     becomes a dateline div of its own
        elsif ($line =~ m/^(?:#d)?\s*(\d{4}\s*წ\.)$/ || $line=~m/^\s*(\d{4}(?: – \d+\s*წ*\.)?\s*)$/ || $line=~m/^\s*([0-9]+\s*[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+\s*[0-9]+\s*[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]*)$/)          # dateline
        {
            if ($div eq 1 || $chapter eq 1)
            { 
                $output.="</note>" if ($ut == 1);
                if ($p eq 1) { $output.="</p>"; }
                elsif ($v eq 1) { $output.="\n\t\t\t\t</lg>"; }
                $output.="\n\t\t\t</div>"; 
                $chapter=0; $div=0; $ut=0;
            }
            $line_cnt++;
            $output.="\n\t\t\t\t<div type=\"dateline\"><p>".$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB".$1."</p></div>"; 
            $p=0;  $written=0; $v=0; $pg=''; 
            $last_line="";
        }
        # --- everything else: ordinary text (blank lines are ignored)
        elsif ($div eq 1 || $chapter eq 1 || ($div eq 0 && $chapter eq 0))
        {
            if ($line!~m/^\s*$/)
            {
                $output.="\n\t\t\t\t</lg>" if ($v eq 1);
                if ($div eq 0 && $chapter eq 0) { $div=1; $section_cnt++; $output.="\n\t\t\t".'<div type="Section" n="'.$section_cnt.'">';}
                if ($p eq 0) { $output.="\n\t\t\t\t<p>";  }
                $line=~s/\s*#\s*//g;

                # --- quotes
                $line=~s/([\.,;\?!:])„/$1“/g;       #„ “
                if ($line=~m/^\s*„/ && $line!~m/“/ && $line=~m/[\.\?!:]+\s*$/)
                {   
                    $line.="“"; 
                }               
                else 
                {
                    $line=~s/„//g;
                }
                $line=~s/“//g if ($line=~m/“/ && $line!~m/„/);
                # ---

                # --- ref
                # footnote ids are made unique by adding the current page number
                $line=~s/<ref target="a(\d+)" type="noteAnchor">/<ref target="#a_$page\_$1" type="noteAnchor">/g;
                $line=~s/<note xml:id="a(\d+)" type="footnote">/<note xml:id="a_$page\_$1" type="footnote">/g;          
                # ---

                # "|ut" opens a comment note that is closed by the next structural marker
                if ($line=~m/\|ut/)
                {
                    $line=~s/\|ut/<note type="comment">/;
                    $ut=1;
                }

                $output.=check_linebreak($line,$pg);
                #$line_cnt++;
                #if ($line=~m/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ\-]+)\/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ\-]+[[:punct:]]*)/)
                #{                  
                #   my $tmp1=$1; my $tmp2=$2;
                #   my $z="TRENNENDERZBTRENNENDERZB".($line_cnt+1)."TRENNENDERZBTRENNENDERZB";
                #   $line=~s/\Q$tmp1\E\/\Q$tmp2\E/$tmp1$z$tmp2/; $line.=" ";
                #   if ($coded_lb eq 0)
                #   {
                #       $output.=$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB ".$line;
                #   }
                #   else
                #   {
                #       $output.=$pg.$line; 
                #   }
                #   $coded_lb=1;
                #}
                #else
                #{
                #   if ($coded_lb eq 0)
                #   {
                #       $output.=$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB ".$line;
                #   }
                #   else
                #   {
                #       $output.=$pg.$line;                     
                #   }
                #   $coded_lb=0;
                #}              
                #$output.=$pg." "."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB".$line;
                $last_line=$line;
                $p=1; $written=1; $v=0; $started=1;$pg='';
            }
        }
    }

    # close whatever is still open after the last line
    if ($p eq 1) { $output.="</p>";}
    elsif ($v eq 1) { $output.="\n\t\t\t\t</lg>"; }
    if ($div eq 1) { $output.="\n\t\t\t</div>"; }
    elsif ($chapter eq 1) { $output.="\n\t\t\t</div>"; }

    # -----------------------------------------
    # the trailing </div> closes the "Content" div opened before the loop
    $output=end_verarbeitung($output,$fn,$folder_out)."</div>";
    # -----------------------------------------

    #$fn=get_complete_filename($fn);    
    mkdir($folder_out."\\".$sub_folder,0777) if ($dev eq 0 && !(-d $folder_out."\\".$sub_folder));  
    my $txt='<text rend="Section" xml:lang="kat">';
    $txt='<text rend="'.($section_cnt?"Section ":"").'Chapter" xml:lang="kat">' if ($has_chapters eq 1);
    # header() writes the .teih file and returns the XML declaration, which
    # becomes the first line of the .txml content
    $output=header($fn,$folder_in,$folder_out)."\n\t".$txt.'    
        <body>
            '.$output.' 
        </body>
    </text>';

    $output=~s/(<body>(?:\r\n)*\s*<pb n="\d+"\/>)(?:\r\n)*\s*<\/div>/$1/g;

    # wrapper document that includes header and text through entities
    $output_filename=$folder_out.($dev eq 0 ? "\\".$sub_folder : "")."\\".$fn.".xml";           
    # NOTE(review): the result of this open is not checked; a failure only
    # surfaces at the checked open of the .txml file below.
    open (OUT, ">:encoding(utf-8)", $output_filename);
    print OUT '<?xml version="1.0" encoding="utf-8"?>'."\n".'<!DOCTYPE TEI [
    <!ENTITY header SYSTEM "'.$fn.'.teih">
    <!ENTITY text SYSTEM "'.$fn.'.txml">
]>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:gnc="http://iness.uib.no/ns/1.0">
    &header;
    &text;
</TEI>';
    close (OUT);
    # the converted text itself goes into the .txml file
    $output_filename=~s/\.xml/\.txml/gi;
    open (OUT, ">:encoding(utf-8)",$output_filename) || die "konnte die ausgabedatei \"$output_filename\" nicht oeffnen: $!\n";
    print OUT $output;
    close OUT;          
}

# Entry point: walks the start folder(s), and converts every *.txt file found
# directly inside each collected folder with go_go_gadget().
#
# Works through the globals @folders, $folder_in, $folder_out, $dev and
# $page_errors.  $folder_in is deliberately the loop variable itself:
# go_go_gadget() reads it as a global while the loop is running.
sub main
{
    print "\nBeginne...\n";

    my $base_dir="D:\\bla";
    $folder_out="D:\\bla";
    my @start_dirs=($base_dir."\\");

    foreach my $start_dir (@start_dirs)
    {
        # collect all sub-folders; fall back to the start folder itself
        @folders=();
        get_files($start_dir);
        push(@folders,$start_dir) unless (scalar(@folders)>=1);
        $base_dir=~s/0_Eingabe/1_Ausgabe\\1/;

        foreach $folder_in(@folders)
        {
            $page_errors='';

            # output folder mirrors the input folder below "1_Ausgabe"
            ($folder_out=$folder_in)=~s/0_Eingabe/1_Ausgabe/;
            $folder_out=~s/\\+/\\/g;

            # developer mode is on when 1: everything goes straight into
            # the "1_Ausgabe" folder
            $dev = 1;
            if ($dev eq 1)
            {
                $folder_out=~s/1_Ausgabe.*/1_Ausgabe/;
            }

            print "Ordner ".$folder_in."\n";
            foreach my $txt_path (glob("${folder_in}/*.txt"))
            {
                go_go_gadget($txt_path);
            }

            next;

            # Not reached because of the "next" above: would write the
            # collected page errors of this folder to a report file.
            if ($page_errors ne '')
            {
                $folder_in=~m/0_Eingabe\\(.+)/;
                my $report_name=$1;
                $report_name=~s/\\+/__/g;           
                open (my $report, ">:encoding(utf-8)", $base_dir."\\".$report_name.".txt") || die "\n\tPage errors to file ".$report_name.": ".$!."\n";
                print $report $page_errors;
                close $report;
            }
        }
    }

    print "Fertig!\n\n";
}

main();

但是有些文件的处理时间太长。如果某个文件的处理超过 6 秒，我希望跳过它，直接处理下一个文件。有没有办法通过超时机制来实现这一点？

Answer 1

我没有仔细看你的代码，所以无法告诉你超时代码具体应该放在哪里，但使用 Time::Out 模块应该可以轻松实现你想要的功能。只需这样写：

use Time::Out 'timeout';

timeout 6 => sub {
  # code that you want to time out after 6 seconds goes here
};
if ($@) {
  # the code above was aborted because it ran longer than 6 seconds
}

这样就可以了。

Perl：如果任务耗时太长，如何跳过循环

1 个答案: