我想抓取Google搜索结果..
如何在不定义绝对路径的情况下获取所有 <div class="g"> 元素?
此模式//h3[@class="r"]
将获取所有h3
元素
此模式//div[@class="g"]
不返回任何内容
...
<div id="ires">
<ol>
<div class="srg">
<div class="g">
...
<h3 class="r"></h3>
...
</div>
<div class="g">
...
<h3 class="r"></h3>
...
</div>
</div>
</ol>
</div>
...
// Question's reproduction script: download a Google results page with cURL,
// parse it with DOMDocument and print every node matched by //div[@class="g"].
// NOTE(review): $query is not defined in this snippet — presumably set earlier.
$ch = curl_init();
// The search term is URL-encoded; start=0 and num=100 are passed as query parameters.
curl_setopt($ch, CURLOPT_URL, 'https://www.google.dk/search?q='.urlencode($query).'&start=0&num=100');
// Return the response body from curl_exec() instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// NOTE(review): this turns off TLS peer certificate verification.
// NOTE(review): no User-Agent is set here; the markup served may differ from
// what a browser receives — TODO confirm against the actual response body.
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
// NOTE(review): the result of curl_exec() is not checked for false before use.
$html = curl_exec($ch);
// The HTTP status code is captured here but never inspected below.
$httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
// Buffer libxml parse errors internally rather than emitting PHP warnings.
libxml_use_internal_errors(true);
$doc = new DOMDocument('1.0', 'utf-8');
// Parser / serializer settings applied before the markup is loaded.
$doc->validateOnParse = false;
$doc->standalone = true;
$doc->preserveWhiteSpace = true;
$doc->strictErrorChecking = false;
$doc->substituteEntities = false;
$doc->recover = true;
$doc->formatOutput = true;
$doc->loadHTML($html);
// Discard whatever parse errors were buffered during loadHTML().
libxml_clear_errors();
// NOTE(review): $div is null when the document has no element with id "ires";
// that case is not checked.
$div = $doc->getElementById('ires');
$xpath = new DOMXPath($doc);
// NOTE(review): an XPath expression starting with "//" is evaluated from the
// document root, so passing $div as the context node does not narrow the
// search; './/div[@class="g"]' would be relative to $div.
// The predicate @class="g" only matches when the class attribute is exactly "g".
foreach($xpath->query('//div[@class="g"]', $div) as $node){
// Dump each matched DOM node.
print_r($node);
}
答案 0 :(得分:0)
以下代码可以正常工作:
// Download a Google results page with cURL, then print every <div> whose
// class list contains "g", first as markup and then as escaped HTML source.

// Stays false unless the download succeeds, so the parsing step below can be
// skipped safely even if a custom error handler lets execution continue.
$content = false;
try {
    $ch = curl_init();
    if (false === $ch) {
        throw new Exception('failed to initialize');
    }
    try {
        curl_setopt($ch, CURLOPT_URL, "https://www.google.ie/search?q=whereNotnull+laravel+5&ie=utf-8&oe=utf-8&gws_rd=cr&ei=FuezVfedF6aC7gaYiJiYBw");
        // Send an explicit User-Agent header with the request.
        curl_setopt($ch, CURLOPT_USERAGENT, 'Firefox/14.0');
        // Keep TLS peer verification enabled; the fetched HTML is untrusted
        // remote content and must not be exposed to interception.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        // Return the body from curl_exec() instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        $content = curl_exec($ch);
        if (false === $content) {
            // Read the error details while the handle is still open.
            throw new Exception(curl_error($ch), curl_errno($ch));
        }
    } finally {
        // Release the handle on the failure path as well as on success.
        curl_close($ch);
    }
} catch (Exception $e) {
    $content = false;
    trigger_error(sprintf(
        'Curl failed with error #%d: %s',
        $e->getCode(), $e->getMessage()),
        E_USER_ERROR);
}

// Only parse when a non-empty body was actually received.
if (is_string($content) && $content !== '') {
    // Real-world HTML is rarely well-formed: buffer libxml's parse errors
    // instead of emitting a warning for each one, then restore the setting.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($content);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);

    $xpath = new DOMXPath($dom);

    // Select all divs that carry the class "g". Matching on the
    // space-delimited token also finds elements such as class="g foo",
    // which a plain @class="g" comparison would miss.
    $xpath_resultset = $xpath->query('//div[contains(concat(" ", normalize-space(@class), " "), " g ")]');

    // Loop through the $xpath_resultset nodes.
    foreach ($xpath_resultset as $result) {
        $markup = $dom->saveHTML($result);

        // Echo the node as markup.
        echo $markup;

        // Echo the node as escaped HTML source code. ENT_SUBSTITUTE keeps the
        // output from collapsing to an empty string on invalid byte sequences.
        echo htmlspecialchars($markup, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}