我正在尝试使用 Perl 脚本从服务器上的静态 HTML 文件中提取内容。我想取出某个特定 div 的内容,并且已经知道该 div 的类名(“getme”)。我可以使用 HTML::TreeBuilder->look_down 定位到这个 div。如何去掉 div 标签本身,只获取其中的内容?
示例HTML
<body>
<div class="getme">
<h2>Some Header</h2>
<div class="another"><p>More text</p></div>
<div class="yetanother">text text text</div>
</div>
<div class="second">...</div>
</body>
到目前为止的 Perl 代码
use strict;
use warnings;
use HTML::TreeBuilder;
use HTML::TagFilter;
# Extract every <div class="getme"> from the HTML files named on the command
# line and append one delimited record per match to a timestamped output file.
# Record layout: guid | published (epoch seconds) | url | title | body.
my $unique_filename = '/path/to/saved/files/extracted_divs/' . get_timestamp();
my $guid_counter = 0;
my $field_sep = "|";

# Lexical handle instead of a bareword one, and an error message that names
# the file that could not be opened.
open my $out_fh, '>>', $unique_filename
    or die "Cannot open $unique_filename for appending: $!";

# Build the header from $field_sep so it can never disagree with the records.
print {$out_fh} join($field_sep, qw(guid published url title body)), "\n";

foreach my $file_name (@ARGV) {
    # Derive the title on a copy, once per file. The loop variable aliases the
    # @ARGV element; editing it in place inside the inner loop meant that the
    # second and later matches in a file wrote the mangled name as their url.
    my $title = $file_name;
    # Replace slashes with dashes.
    $title =~ s{/}{-}g;
    # Remove a trailing ".html" only. The dot is escaped and the match is
    # anchored so that names merely containing "html" are left alone.
    $title =~ s/\.html\z//;

    my $tree = HTML::TreeBuilder->new;

    # Skip files that cannot be opened rather than silently processing an
    # empty tree.
    if (!defined $tree->parse_file($file_name)) {
        warn "Cannot parse $file_name: $!\n";
        $tree->delete;
        next;
    }

    for my $subtree ($tree->look_down(_tag => "div", class => "getme")) {
        # NOTE: as_HTML serialises the matched element itself, so the body
        # field still includes the wrapping <div class="getme"> tag. The
        # element's children are available through $subtree->content_list
        # (text children come back as plain strings, not elements) if only
        # the inner markup is wanted.
        my $html = $subtree->as_HTML;

        # Guarantee the record ends with a newline.
        $html =~ s/(?<!\n)\z/\n/;

        # Echo the file name to the screen so we know something is happening.
        print $file_name . "\n";

        print {$out_fh} join($field_sep,
            $guid_counter++, time, $file_name, $title, $html);
    }

    # Release the tree explicitly once the file has been processed.
    $tree = $tree->delete;
}

# Buffered write errors only surface at close, so check it.
close $out_fh or die "Cannot close $unique_filename: $!";
HTML::TagFilter 过滤器只删除了 class 属性,div 标签本身仍然保留。是否可以把整个标签去掉,或者有更好的方法吗?
答案 0 :(得分:1)
# Load Web::Query with an empty import list, so nothing is imported into the
# current namespace and the class is used by its full name below.
use Web::Query qw();
# Parse $html, select the direct children of the element with class "getme",
# and join the markup returned for each of them into a single string.
# NOTE(review): whether ->html yields each matched element's own markup or
# only its inner markup appears to depend on the Web::Query version -- confirm
# against the installed release.
join '', Web::Query->new_from_html($html)->find('.getme > *')->html
返回字符串
<h2>Some Header</h2><div class="another"><p>More text</div><div class="yetanother">text text text</div>