我无法解析这个网址:http://foldmunka.net
// Fetch the page, normalise its markup, parse it with DOMDocument and print,
// for every element, its `content` attribute and its text prefixed by the
// element's alt/title attributes.
$ch = curl_init("http://foldmunka.net");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // hand the body back instead of printing it
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow redirects issued by the server
$data = curl_exec($ch);
curl_close($ch);

if ($data === false) {
    echo 'cURL failed';
    exit;
}

// Encode every non-ASCII character as a numeric entity so the HTML parser
// does not have to guess the charset. The input is assumed to be UTF-8, as
// in the original code. A single pass is enough: afterwards the markup is
// plain ASCII, so a second conversion would change nothing.
$data = mb_encode_numericentity($data, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

// Unwrap IE conditional comments. The quantifier is lazy so that the match
// ends at the first "]>"; a greedy match would delete everything up to the
// last "]>" on the same line, which removes real content on single-line pages.
$data = preg_replace('/<!--\[if.*?\]>/', '', $data);

// NOTE(review): the comment delimiters are stripped but the comment bodies
// are kept, exactly as before - confirm that comment text is wanted in the
// output.
$data = str_replace(array('<![endif]-->', '<!--', '-->'), '', $data);

// Drop script and style elements together with their content (scripts first,
// then styles).
$data = preg_replace(
    array('@<script[^>]*?>.*?</script>@si', '@<style[^>]*?>.*?</style>@si'),
    '',
    $data
);

$dom = new DOMDocument();

// Collect parser diagnostics internally instead of silencing the call with
// "@", then restore the previous libxml setting.
$previousErrorMode = libxml_use_internal_errors(true);
if ($data !== '') {
    $dom->loadHTML($data);
}
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);

foreach ($dom->getElementsByTagName('*') as $el) {
    print htmlspecialchars($el->nodeName . " | " . $el->getAttribute('content'), ENT_QUOTES, 'UTF-8') . "<hr />";

    // Build the label in a local variable. Assigning to $el->nodeValue would
    // replace all children of the element with one text node, which removes
    // nested elements from the document while the live node list is still
    // being iterated.
    $text = $el->nodeValue;
    $title = $el->getAttribute('title');
    $alt = $el->getAttribute('alt');
    if ($title !== '') {
        $text = $title . " " . $text;
    }
    if ($alt !== '') {
        $text = $alt . " " . $text;
    }

    // The page content is untrusted, so escape it before echoing it as HTML.
    print htmlspecialchars($el->nodeName . " | " . $text, ENT_QUOTES, 'UTF-8') . "<hr />";
}
我需要按顺序提取alt、title属性和纯文本,但在这个页面上我无法访问body标签内的节点。
答案 0 :(得分:1)
我不确定自己是否理解了这个脚本的作用——那些替换操作看起来像是在做清理(sanitize),但如果你只是想提取代码的某些部分,我不明白为什么需要这样做。不过,你试过Simple HTML DOM Browser吗?它可以更轻松地处理解析部分。可以看看它的示例。
答案 1 :(得分:1)
这是一个使用DomDocument和DOMXPath的解决方案。与使用Simple HTML DOM Parser的另一个解决方案相比,它更短,运行速度也快得多(约100毫秒,对比约2300毫秒)。
<?php
/**
 * Flattens an HTML document into a single plain-text string.
 *
 * Visits every text node, <a> element and <img> element in the order the
 * XPath query returns them and appends, each followed by one space:
 * the `alt` attribute and then the `title` attribute of the elements, and
 * the content of the text nodes. CDATA sections are skipped.
 *
 * To parse markup that is already held in a string, load it with
 * $dom->loadHTML($markup) instead of loadHTMLFile().
 *
 * @param string $source Path or URL passed to DOMDocument::loadHTMLFile().
 *
 * @return string The collected text, or an empty string when the document
 *                could not be loaded.
 */
function makePlainText($source)
{
    $dom = new DOMDocument();

    // Real-world pages are rarely valid HTML; keep the parser diagnostics
    // internal instead of letting them surface as PHP warnings, and restore
    // the caller's libxml setting afterwards.
    $previousErrorMode = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTMLFile($source);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    if ($loaded === false) {
        return '';
    }

    $xpath = new DOMXPath($dom);
    $plain = '';

    foreach ($xpath->query('//text()|//a|//img') as $node) {
        // The CDATA check must come first: it keeps such nodes out of the
        // DOMText branch below.
        if ($node->nodeName === '#cdata-section') {
            continue;
        }

        if ($node instanceof DOMElement) {
            if ($node->hasAttribute('alt')) {
                $plain .= $node->getAttribute('alt') . ' ';
            }
            if ($node->hasAttribute('title')) {
                $plain .= $node->getAttribute('title') . ' ';
            }
        }

        if ($node instanceof DOMText) {
            $plain .= $node->textContent . ' ';
        }
    }

    return $plain;
}
// Print the flattened text of the example page.
print makePlainText('http://foldmunka.net');
答案 2 :(得分:1)
这是一个仅用于比较的Simple Html DOM Parser解决方案。它的输出与DomDocument解决方案类似,但它更复杂,运行速度也慢得多(DomDocument约100毫秒,而它约2300毫秒),所以我不建议使用它:
已更新以支持<a>元素中的<img>元素。
<?php
require_once 'simple_html_dom.php';

// Shared accumulator for the extracted text. The parser's callback handler
// cannot pass extra arguments, so the callbacks below reach this variable
// through `global` instead of receiving it as a parameter.
static $processed_plain_text = '';

// Input modes accepted by makePlainText().
const LOAD_FROM_URL = 'loadfromurl';
const LOAD_FROM_STRING = 'loadfromstring';
/**
 * Parser callback that replaces the inner markup of an anchor with the
 * plain text makePlainText() produces for that markup.
 *
 * Elements whose tag is not 'a' are left untouched.
 *
 * @param object $element Node handed over by the parser.
 */
function callback_cleanNestedAnchorContent($element)
{
    if ($element->tag != 'a') {
        return;
    }

    $flattened = makePlainText($element->innertext, LOAD_FROM_STRING);
    $element->innertext = $flattened;
}
/**
 * Parser callback that appends the plain-text contribution of one node to
 * the global $processed_plain_text accumulator.
 *
 * - 'text' nodes contribute their inner text, unless their parent is an
 *   anchor (anchors are handled in their own branch) or a script/style tag.
 * - 'img' nodes contribute their alt and title attributes.
 * - 'a' nodes contribute alt, title and their inner text.
 *
 * Every appended piece is followed by a single space; all other tags are
 * ignored.
 *
 * @param object $element Node handed over by the parser.
 */
function callback_buildPlainText($element)
{
    global $processed_plain_text;

    $tag = $element->tag;

    if ($tag == 'text') {
        $parentTag = $element->parent->tag;
        $isSkipped = ($parentTag == 'a') || in_array($parentTag, array('script', 'style'));
        if (!$isSkipped) {
            $processed_plain_text .= $element->innertext . ' ';
        }
        return;
    }

    if ($tag == 'img') {
        $processed_plain_text .= $element->alt . ' ' . $element->title . ' ';
        return;
    }

    if ($tag == 'a') {
        $processed_plain_text .= $element->alt . ' ' . $element->title . ' ' . $element->innertext . ' ';
    }
}
/**
 * Converts an HTML document into plain text using Simple Html DOM Parser.
 *
 * Works in two passes, each driven by a parser callback that runs while the
 * document is serialised with save():
 *   1. callback_cleanNestedAnchorContent flattens the content of anchors,
 *   2. callback_buildPlainText collects the text into the global
 *      $processed_plain_text accumulator.
 *
 * @param string $source URL (LOAD_FROM_URL) or markup (LOAD_FROM_STRING).
 * @param string $mode   LOAD_FROM_URL (default) or LOAD_FROM_STRING.
 *
 * @return string The collected text; an empty string when the source could
 *                not be parsed; an error message when $mode is unknown.
 */
function makePlainText($source, $mode = LOAD_FROM_URL)
{
    global $processed_plain_text;

    if ($mode === LOAD_FROM_URL) {
        $html = file_get_html($source);
    } elseif ($mode === LOAD_FROM_STRING) {
        // str_get_html() is the loader already used for the second pass
        // below; the str_get_dom() alias is not available in later releases
        // of the library.
        $html = str_get_html($source);
    } else {
        return 'Wrong mode defined in makePlainText: ' . $mode;
    }

    // The loaders can return false instead of a document (for example for
    // an empty string such as the content of an empty anchor); calling
    // set_callback() on that value would be a fatal error.
    if (!is_object($html)) {
        return '';
    }

    // First pass: clean up the anchor tags, then re-parse the result.
    $html->set_callback('callback_cleanNestedAnchorContent');
    $html = str_get_html($html->save());
    if (!is_object($html)) {
        return '';
    }

    // Second pass: build the full plain text with the required attributes
    // of the 'img' and 'a' tags, excluding script and style content.
    $html->set_callback('callback_buildPlainText');
    $html->save();

    // Hand the result back and leave the global accumulator empty for the
    // next call.
    $return = $processed_plain_text;
    $processed_plain_text = '';

    return $return;
}
// Sample markup for trying the string mode (nested elements and an image inside an anchor):
//$html = '<html><title>Hello</title><body>Hello <span>this</span> site<img src="asdasd.jpg" alt="alt attr" title="title attr"><a href="open.php" alt="alt attr" title="title attr">click <span><strong>HERE</strong></span><img src="image.jpg" title="IMAGE TITLE INSIDE ANCHOR" alt="ALTINACNHOR"></a> Some text.</body></html>';

// Print the flattened text of the example page.
print makePlainText('http://foldmunka.net');

// String-mode alternative using the sample markup above:
//echo makePlainText($html, LOAD_FROM_STRING);