如何使用perl(正则表达式)比较两个文件(.xml和.html)的数据?

时间:2014-09-23 05:22:43

标签: regex perl

我尝试使用正则表达式(regex),并对.xml文件使用foreach循环,因为有许多.xml文件,而只有一个.html文件。我打开、读取并关闭了目录。但是在两个文件中搜索特定模式时,代码不会进入while / if循环。

xml数据:xml格式的#pattern

<gname>abc</gname>
<pname>xyz</pname>

html数据:html格式的#pattern

<p>ABC</p>
<p><i>xyz</i></p>

在这里,我需要在xml和html文件中匹配abc和xyz(不区分大小写,例如xml中的abc应与html中的ABC匹配)。

# NOTE(review): the string "<F2>" is parsed by 2-arg open as mode "<" plus
# the file name "F2>", so this does not open the HTML file; the result of
# open is also not checked.
open( F2, "<F2>" );
# NOTE(review): this is a plain string, not a pattern or glob; inside double
# quotes "\." is just ".", so the value is the literal text "(.*).html".
my $xml_list1 = "(.*)\.html";

这里括号及其中的内容在打印时也会原样输出。假设文件名是abc.html,我希望“abc”这一部分是可变的,这样即使出现abc.html以外的文件名,我也不需要编写/修改代码。

# Closes the F2 handle from the open call above; nothing was read from it.
close F2;
# Disabled debug print of the file-name string.
#print $xml_list1."\n";

# Loops over the file names in @filenames, slurps a file into $data and then
# runs a global match in a while loop.
foreach my $f (@filenames) {
    #print $f."\n";
    # NOTE(review): "<F1>" opens a file literally named "F1>", not the file
    # named by $f, and the result of open is not checked.
    open( F1, "<F1>" );
    my $data = join( "", <F1> );
    close F1;
    # Base name = everything before the first '.' in $f.
    my $filename = substr( $f, 0, index( $f, '.' ) );
    my $xml_list = $filename . ".xml";

    # NOTE(review): the match runs against $xml_list, which holds the file
    # NAME built above, not the file contents in $data; the pattern itself is
    # empty here (presumably elided in the post).
    while ( $xml_list =~ m//ig ) {
        ...;
    }
}

代码没有进入while / if循环,似乎在读取文件名$xml_list时发现了一些错误。

我想在不使用解析器的情况下匹配这两个数据。

有人可以帮我解决这个问题吗?

更新: 代码:

#!/usr/bin/perl
# Question script: copies .xml/.html files from ./Input into the current
# directory, then scans each XML file for <personName> entries and tries to
# scan HTML content for an italic <FONT> paragraph, logging to Ref.txt.
# The NOTE(review) comments mark the defects visible in this listing.
use strict;
use Cwd;
use File::Copy;
use File::Basename;

my $path1=getcwd;

# NOTE(review): opendir is not checked for failure.
opendir(INP, "$path1\/Input");
# NOTE(review): the leading "." in these patterns is unescaped, so it matches
# any character rather than a literal dot.
my @out = grep(/.(xml)$/,readdir(INP));
# NOTE(review): readdir in list context returns all remaining entries, so this
# second call on the same handle has nothing left to return and @out1 is empty.
my @out1 = grep(/.(html)$/,readdir(INP));
# NOTE(review): INP is a directory handle; closedir is the matching call.
close INP;

# Copy each listed XML file from Input/ into the current directory.
foreach my $final(@out)
{
 my $filetobecopied = "Input\/".$final;
 my $newfile = $final;
 copy($filetobecopied, $newfile) or die "File cannot be copied.";
}

# Copy each listed HTML file from Input/ into the current directory.
foreach my $final1(@out1)
{
 my $filetobecopied1 = "Input\/".$final1;
 my $newfile1 = $final1;
 copy($filetobecopied1, $newfile1) or die "File cannot be copied.";
}

# List the XML and HTML files now present in the current directory.
opendir DIR, $path1 or die "cant open dir";
my @files = grep /(.*?)\.(xml)$/,(readdir DIR);
# NOTE(review): same exhausted-handle problem as above; @files1 is empty.
my @files1 = grep /(.*?)\.(html)$/,(readdir DIR);
closedir DIR;

# Report file; NOTE(review): open is not checked for failure.
open(F6, ">Ref.txt");
print F6 "FileName\tError Instance\tOutput\n";

# open(F2,"<F2>");
# my $xml_list1="abc.html";
# my $data1=join("",<F1>);
# my $xml_list2=$xml_list1;

foreach my $f(@files)
 {
# NOTE(review): the die message reports $files[0], not the file being opened ($f).
open(F1, "<$f") or die "Cannot open file: $files[0]";
my $data=join("", <F1>);
close F1;
my $xml_list=$data;
#print "$f\n";

# NOTE(review): F2 opens $f again (the XML file), so $data1 holds the XML
# content rather than the HTML; $xml_listt is assigned but never used.
open(F2, "<$f") or die "Cannot open file: $files[0]";
my $xml_listt="abc.html";
my $data1=join("", <F2>);
my $xml_list1=$data1;

print $xml_list1."\n";

# Walk every <personName>...</personName> element in the XML text.
while($xml_list=~m/(<personName>(.*?)<\/personName>)/isg)
{
        my $full=$1;
        my $name=$2;
        #print F6 $f."\t".$full."\n";
# Pull the given and family names out of the matched element; the captured
# values are stored but not used while the print below stays commented out.
if($full=~m/(<givenNames>(\w+)<\/givenNames>(\n)?<familyName>(\w+)<\/familyName>(\n)?(.*?))/isg)
        {
        my $fg=$1;
        my $gname=$2;
        my $fname=$4;
        #print F6 $f."\t".$gname."\t".$fname."\n";
        }
     }
# NOTE(review): "While" is capitalised; Perl keywords are case-sensitive, so
# this is not parsed as a while loop.
While($xml_list1=~m/(<p><FONT FACE="(.*?)" SIZE="(\d+)"><I>(.*?)<\/I><\/FONT><\/p>)/igs)        
    {
    my $hfull=$1;
    print F6 $f."\n";   #.$hfull."\n";
    }
close F2;
# NOTE(review): F1 was already closed right after it was read.
close F1;
}
# Remove every XML file that was listed from the current directory.
foreach my $del(@files)
{
unlink $del;
}

3 个答案:

答案 0 :(得分:0)

@flora:我已经修改了你的程序,并且我已经为你的程序提供了优化的解决方案。

修改后的代码:(修改程序)

 #!/usr/bin/perl
    # Copies every .xml/.html file from ./Input into the current directory,
    # then, for each XML file found there, scans it for <personName> entries
    # and scans Input/abc.html for the italic <FONT> paragraph.  One line
    # (the XML file name) is written to Ref.txt per HTML match, and the
    # copied XML files are removed at the end.
    use strict;
    use Cwd;
    use File::Copy;
    use File::Basename;

    my $path1=getcwd;

    # readdir in list context returns ALL remaining entries, so calling it a
    # second time on the same handle yields nothing.  Read the directory
    # once and filter the cached list twice.
    opendir(my $inp, "$path1/Input") or die "Cannot open directory $path1/Input: $!";
    my @input_entries = readdir($inp);
    closedir $inp;    # directory handles are released with closedir, not close
    my @out  = grep { /\.xml$/ }  @input_entries;
    my @out1 = grep { /\.html$/ } @input_entries;

    # Bring the XML and HTML inputs into the working directory.
    foreach my $final (@out, @out1)
    {
     copy("Input/".$final, $final) or die "File cannot be copied: $final - $!";
    }

    # XML files now present in the working directory.
    opendir(my $dir, $path1) or die "cant open dir: $!";
    my @files = grep { /\.xml$/ } readdir($dir);
    closedir $dir;

    open(my $report, '>', "Ref.txt") or die "Cannot open file: Ref.txt - $!";
    print {$report} "FileName\tError Instance\tOutput\n";

    # The HTML reference file is the same for every XML file, so read it once.
    open(my $html_fh, '<', "$path1/Input/abc.html") or die "Cannot open file: abc.html - $!";
    my $xml_list1 = join("", <$html_fh>);
    close $html_fh;

    foreach my $f (@files)
    {
        open(my $xml_fh, '<', $f) or die "Cannot open file: $f - $!";
        my $xml_list = join("", <$xml_fh>);
        close $xml_fh;

        print $xml_list1."\n";

        # Walk every <personName> element; /s lets .*? span line breaks.
        while ($xml_list =~ m/(<personName>(.*?)<\/personName>)/isg)
        {
            my $full = $1;
            #print {$report} $f."\t".$full."\n";
            # No /g here: a single match is wanted, and /g in a condition
            # would leave pos() set on the string.
            if ($full =~ m/<givenNames>\s*(\w+)\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is)
            {
                my $gname = $1;
                my $fname = $2;
                #print {$report} $f."\t".$gname."\t".$fname."\n";
            }
        }

        # One report line per italic <FONT> paragraph found in the HTML.
        while ($xml_list1 =~ m/(<p><FONT FACE="(.*?)" SIZE="(\d+)"><I>(.*?)<\/I><\/FONT><\/p>)/isg)
        {
            my $hfull = $1;
            print {$report} $f."\n";   #.$hfull."\n";
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close($report) or die "Cannot close file: Ref.txt - $!";

    # Remove the XML working copies.
    foreach my $del (@files)
    {
     unlink $del;
    }

优化解决方案:

#!/usr/bin/perl
# For every *.xml file in the current directory, scans the XML for
# <personName> entries and scans abc.html for the italic <FONT> paragraph.
# The XML file name is written to Ref.txt once per HTML match.
use strict;
use warnings;

my $html_file = "abc.html";
my @files = grep {-f} glob("*.xml");

open(my $report, '>', "Ref.txt") or die "Cannot open file: Ref.txt - $!";
print {$report} "FileName\tError Instance\tOutput\n";

# The HTML reference is identical for every XML file, so slurp it once.
my $xml_list1 = slurp_file($html_file);

foreach my $f (@files)
{
    my $xml_list = slurp_file($f);

    print $xml_list1."\n";

    # Walk every <personName> element; /s lets .*? span line breaks.
    while ($xml_list =~ m/(<personName>(.*?)<\/personName>)/isg)
    {
        my $full = $1;
        #print {$report} $f."\t".$full."\n";
        # No /g here: one match is wanted, and /g in a condition would
        # leave pos() set on the string.
        if ($full =~ m/<givenNames>\s*(\w+)\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is)
        {
            my $gname = $1;
            my $fname = $2;
            #print {$report} $f."\t".$gname."\t".$fname."\n";
        }
    }

    # One report line per italic <FONT> paragraph found in the HTML.
    while ($xml_list1 =~ m/(<p><FONT FACE="(.*?)"\s+SIZE="(\d+)"><I>(.*?)<\/I><\/FONT><\/p>)/isg)
    {
        my $hfull = $1;
        print {$report} $f . "\n";   #.$hfull."\n";
    }
}

# Buffered write errors only surface when the handle is closed.
close($report) or die "Cannot close file: Ref.txt - $!";

# The XML files are deliberately NOT unlinked here: this version reads the
# originals in place (there is no copy step), so deleting them would destroy
# the input data after a single run.

# Returns the whole content of the named file as one string ('' if empty).
# Dies with the actual file name and OS error if the file cannot be opened.
sub slurp_file
{
    my ($name) = @_;
    open(my $fh, '<', $name) or die "Cannot open file: $name - $!";
    local $/ = undef;    # slurp mode, restored when the sub returns
    my $content = <$fh>;
    close $fh;
    return defined $content ? $content : '';
}

答案 1 :(得分:0)

@flora:试试这段代码。我忙于其他工作所以不能很快回复。现在这段代码将生成以下输出:

输入文件:(sample.xml)

<creators> 
<creator affiliationRef="#01" creatorRole="author" xml:id="01"> 
<personName><givenNames>Kazumitsu</givenNames><familyName>Sugiura</familyName></personName>
</creator> 
<creator affiliationRef="#01" creatorRole="author" xml:id="02"> 
<personName><givenNames>Yoshinao</givenNames><familyName>Muro</familyName></personName>
</creator> 
<creator affiliationRef="#01" creatorRole="author" xml:id="03"> 
<personName><givenNames>Masashi</givenNames><familyName>Akiyama</familyName></personName> 
</creator>
</creators>

输入文件:(abc.html)

<P><FONT FACE="hello" SIZE="14"><I>Kazumitsu SUGIURA, Yoshinao Muro, and Masashi Akiyama</I></FONT></P>

输出:

FileName    MatchedString   Output
Matched Sugiura(sample.xml)->SUGIURA(abc.html)
Matched Muro(sample.xml)->Muro(abc.html)
Matched Akiyama(sample.xml)->Akiyama(abc.html)

代码:

     #!/usr/bin/perl
    # Copies the .xml/.html files from ./Input into the current directory,
    # then, for every (XML, HTML) pair, reports each family name from the
    # XML that also occurs (case-insensitively) in an italic <FONT>
    # paragraph of the HTML.  Results go to Ref.txt; the XML working copies
    # are removed at the end.
    use strict;
    use warnings;
    use Cwd;
    use File::Copy;
    use File::Basename;

    my $path1=getcwd;

    # Read the Input directory once and filter the cached list twice
    # (readdir in list context consumes the whole handle).
    opendir(my $inp, "$path1/Input") or die "Cannot open directory $path1/Input: $!";
    my @input_entries = readdir($inp);
    closedir $inp;
    my @out  = grep { /\.xml$/ }  @input_entries;
    my @out1 = grep { /\.html$/ } @input_entries;

    # Bring the XML and HTML inputs into the working directory.
    foreach my $final (@out, @out1)
    {
        copy("Input/".$final, $final) or die "File cannot be copied: $final - $!";
    }

    # Files now present in the working directory.
    opendir(my $dir, $path1) or die "cant open dir: $!";
    my @cwd_entries = readdir($dir);
    closedir $dir;
    my @files  = grep { /\.xml$/ }  @cwd_entries;
    my @files1 = grep { /\.html$/ } @cwd_entries;

    open(my $report, '>', "Ref.txt") or die "Cannot open file: Ref.txt - $!";
    print {$report} "FileName\tMatchedString\tOutput\n";

    foreach my $f (@files)
    {
        open(my $xml_fh, '<', "$path1/Input/$f") or die "Cannot open file: $f - $!";
        my $xml_list = join("", <$xml_fh>);
        close $xml_fh;

        # Family names depend only on the XML file, so collect them once
        # per XML file instead of once per HTML file.
        my @fname;
        while ($xml_list =~ m/<personName>(.*?)<\/personName>/isg)
        {
            my $full = $1;
            if ($full =~ m/<givenNames>\s*\w+\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is)
            {
                push(@fname, $1);
            }
        }

        foreach my $f1 (@files1)
        {
            print $f1 . "\n";

            open(my $html_fh, '<', "$path1/Input/$f1") or die "Cannot open file: $f1 - $!";
            my $xml_list1 = join("", <$html_fh>);
            close $html_fh;

            # Text of every italic <FONT> paragraph in the HTML.
            my @hfull;
            while ($xml_list1 =~ m/<p><FONT FACE=".*?" SIZE="\d+"><I>(.*?)<\/I><\/FONT><\/p>/isg)
            {
                push(@hfull, $1);
            }

            # Loop variables are not named $a/$b: those are the package
            # variables used by sort.
            foreach my $family (@fname)
            {
                foreach my $text (@hfull)
                {
                    # No /g: in scalar context /g would leave pos() set on
                    # $text, so the search for the NEXT name would start
                    # after this match and could miss names that appear
                    # earlier in the paragraph.  \Q...\E matches the name
                    # literally; the capture is the text as it is spelled
                    # in the HTML.
                    if ($text =~ m/(\Q$family\E)/i)
                    {
                        my $line = $1;
                        print {$report} "Matched $family($f)\->$line($f1)\n";
                    }
                }
            }
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close($report) or die "Cannot close file: Ref.txt - $!";

    # Remove the XML working copies.
    foreach my $del (@files)
    {
        unlink $del;
    }

优化代码:

#!/usr/bin/perl
    # For every (XML, HTML) pair in the current directory, reports each
    # family name from the XML that also occurs (case-insensitively) in an
    # italic <FONT> paragraph of the HTML.  Results go to Ref.txt.
    # The files are read in place: nothing is copied and nothing is deleted.
    use strict;
    use warnings;
    use Cwd;
    use File::Copy;
    use File::Basename;

    my $path1=getcwd;

    # Read the directory once and filter the cached list twice (readdir in
    # list context consumes the whole handle).
    opendir(my $dir, $path1) or die "cant open dir: $!";
    my @cwd_entries = readdir($dir);
    closedir $dir;
    my @files  = grep { /\.xml$/ }  @cwd_entries;
    my @files1 = grep { /\.html$/ } @cwd_entries;

    open(my $report, '>', "Ref.txt") or die "Cannot open file: Ref.txt - $!";
    print {$report} "FileName\tMatchedString\tOutput\n";

    foreach my $f (@files)
    {
        open(my $xml_fh, '<', "$path1/$f") or die "Cannot open file: $f - $!";
        my $xml_list = join("", <$xml_fh>);
        close $xml_fh;

        # Family names depend only on the XML file, so collect them once
        # per XML file instead of once per HTML file.
        my @fname;
        while ($xml_list =~ m/<personName>(.*?)<\/personName>/isg)
        {
            my $full = $1;
            if ($full =~ m/<givenNames>\s*\w+\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is)
            {
                push(@fname, $1);
            }
        }

        foreach my $f1 (@files1)
        {
            print $f1 . "\n";

            open(my $html_fh, '<', "$path1/$f1") or die "Cannot open file: $f1 - $!";
            my $xml_list1 = join("", <$html_fh>);
            close $html_fh;

            # Text of every italic <FONT> paragraph in the HTML.
            my @hfull;
            while ($xml_list1 =~ m/<p><FONT FACE=".*?" SIZE="\d+"><I>(.*?)<\/I><\/FONT><\/p>/isg)
            {
                push(@hfull, $1);
            }

            # Loop variables are not named $a/$b: those are the package
            # variables used by sort.
            foreach my $family (@fname)
            {
                foreach my $text (@hfull)
                {
                    # No /g: in scalar context /g would leave pos() set on
                    # $text, so the search for the NEXT name would start
                    # after this match and could miss names that appear
                    # earlier in the paragraph.  \Q...\E matches the name
                    # literally; the capture is the text as it is spelled
                    # in the HTML.
                    if ($text =~ m/(\Q$family\E)/i)
                    {
                        my $line = $1;
                        print {$report} "Matched $family($f)\->$line($f1)\n";
                    }
                }
            }
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close($report) or die "Cannot close file: Ref.txt - $!";

答案 2 :(得分:0)

@flora:这是我发布的最终解决方案。这里我把从xml文件中取得的名和姓组合起来,再与html中的数据进行匹配。现在只需向程序传入一个参数(要查找的模式),程序会检查它是否同时出现在xml和html文件中。例如,把参数“Kazumitsu Sugiura”传给程序后,程序会从xml中取值并组合成“Kazumitsu Sugiura”,然后在html文件中查找该模式;如果匹配,则按如下方式输出文件名:

输入文件:(sample.xml)

<creators> 
<creator affiliationRef="#01" creatorRole="author" xml:id="01"> 
<personName><givenNames>Kazumitsu</givenNames><familyName>Sugiura</familyName></personName>
</creator> 
<creator affiliationRef="#01" creatorRole="author" xml:id="02"> 
<personName><givenNames>Yoshinao</givenNames><familyName>Muro</familyName></personName>
</creator> 
<creator affiliationRef="#01" creatorRole="author" xml:id="03"> 
<personName><givenNames>Masashi</givenNames><familyName>Akiyama</familyName></personName> 
</creator>
</creators>

输入文件:(test.xml)

<creators> 
<creator affiliationRef="#01" creatorRole="author" xml:id="01"> 
<personName><givenNames>Kazumitsu</givenNames><familyName>Sugiura</familyName></personName>
</creator> 
</creators>

输入文件:(test.html)

<P><FONT FACE="hello" SIZE="14"><I>Kazumitsu SUGIURA, Yoshinao Muro, and Masashi Akiyama</I></FONT></P>

代码:

#!/usr/bin/perl
    # Usage: perl filename.pl "Given Family"
    #
    # For every (XML, HTML) pair in the current directory, builds
    # "given family" names from the XML <personName> elements and, for each
    # one that contains the requested name, reports every italic <FONT>
    # paragraph of the HTML that also contains it (case-insensitively).
    # Results go to Ref.txt; the requested name is echoed to STDOUT once per
    # reported match.
    use strict;
    use warnings;
    use Cwd;
    use File::Copy;
    use File::Basename;

    my $path1=getcwd;
    my $PatternName = $ARGV[0];

    # Without an argument the pattern would be undef/empty and match everything.
    die "Usage: $0 \"Given Family\"\n"
        unless defined $PatternName && length $PatternName;

    # Read the directory once and filter the cached list twice (readdir in
    # list context consumes the whole handle).
    opendir(my $dir, $path1) or die "cant open dir: $!";
    my @cwd_entries = readdir($dir);
    closedir $dir;
    my @files  = grep { /\.xml$/ }  @cwd_entries;
    my @files1 = grep { /\.html$/ } @cwd_entries;

    open(my $report, '>', "Ref.txt") or die "Cannot open file: Ref.txt - $!";
    print {$report} "FileName\tMatchedString\tOutput\n";

    foreach my $f (@files)
    {
        open(my $xml_fh, '<', "$path1/$f") or die "Cannot open file: $f - $!";
        my $xml_list = join("", <$xml_fh>);
        close $xml_fh;

        # "given family" strings depend only on the XML file, so collect
        # them once per XML file instead of once per HTML file.
        my @fname;
        while ($xml_list =~ m/<personName>(.*?)<\/personName>/isg)
        {
            my $full = $1;
            if ($full =~ m/<givenNames>\s*(\w+)\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is)
            {
                push(@fname, "$1 $2");
            }
        }

        foreach my $f1 (@files1)
        {
            open(my $html_fh, '<', "$path1/$f1") or die "Cannot open file: $f1 - $!";
            my $xml_list1 = join("", <$html_fh>);
            close $html_fh;

            # Text of every italic <FONT> paragraph in the HTML.
            my @hfull;
            while ($xml_list1 =~ m/<p><FONT FACE=".*?" SIZE="\d+"><I>(.*?)<\/I><\/FONT><\/p>/isg)
            {
                push(@hfull, $1);
            }

            # Loop variables are not named $a/$b: those are the package
            # variables used by sort.
            foreach my $person (@fname)
            {
                # \Q...\E: the command-line argument is a literal name, not
                # a regex, so metacharacters in it must not be interpreted.
                next unless $person =~ m/\Q$PatternName\E/i;

                foreach my $text (@hfull)
                {
                    # No /g: in scalar context it would leave pos() set on
                    # $text and make later searches start mid-string.  The
                    # capture is the name as it is spelled in the HTML.
                    if ($text =~ m/(\Q$PatternName\E)/i)
                    {
                        my $line = $1;
                        print $PatternName . "\n";
                        print {$report} "Matched $person($f)\->$line($f1)\n";
                    }
                }
            }
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close($report) or die "Cannot close file: Ref.txt - $!";

程序执行:

perl filename.pl "Kazumitsu Sugiura"

输出:

Matched Kazumitsu Sugiura(sample.xml)->Kazumitsu SUGIURA(abc.html)
Matched Kazumitsu Sugiura(test.xml)->Kazumitsu SUGIURA(abc.html)