I'm trying to use PHPCrawl to crawl and collect URLs, then feed them to simple_html_dom to extract the data I need from the HTML and store it in a MySQL database. Right now I'm getting this error:

**Fatal error: Call to undefined method simple_html_dom::find() in /home/content/54/11109254/html/PHPCrawl_081/skunktest.php on line 44**

Can anyone help me figure out what I'm doing wrong, and maybe take a quick look to see whether I have any other roadblocks ahead?
<?php
set_time_limit(1000000);

// Include the phpcrawl main class
include("libs/PHPCrawler.class.php");

// Include simple_html_dom
include("../simple_html_dom.php");

// Extend the class and override the handleDocumentInfo() method
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Just detect linebreak for output ("\n" in CLI mode, otherwise "<br>")
        if (PHP_SAPI == "cli") $lb = "\n";
        else $lb = "<br />";

        // Print the URL and the HTTP status code
        echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb;

        // Print the referring URL
        echo "Referer-page: ".$DocInfo->referer_url.$lb;

        // Print whether the content of the document was received or not
        if ($DocInfo->received == true)
            echo "Content received: ".$DocInfo->bytes_received." bytes".$lb;
        else
            echo "Content not received".$lb;

        // Parse the received page with simple_html_dom
        $result = $DocInfo->url;
        $html = file_get_html($result);

        if ($html && is_object($html) && isset($html->nodes)) {
            $partnumber = $html->find('div[class=product-sku')->plaintext;
            $title = $html->find('.product-name')->plaintext;
            $productnote = $html->find('.product-note')->plaintext;
            $description = $html->find('.product-description')->innertext;

            foreach ($html->find('.MagicZoomBigImageCont') as $img)
            {
                foreach ($img->find('img') as $e)
                {
                    $image = $e;
                    $imagehref = $e->href;
                }
            }

            foreach ($html->find('.p-related-image') as $rel)
            {
                foreach ($rel->find('a') as $element)
                {
                    $rel1 = $element[0]->href;
                    $rel2 = $element[1]->href;
                    $rel3 = $element[2]->href;
                    $rel4 = $element[3]->href;
                    $rel5 = $element[4]->href;
                }
            }

            foreach ($html->find('.p-related-name') as $name)
            {
                foreach ($name->find('a') as $el)
                {
                    $rel1n = $el[0]->plaintext;
                    $rel2n = $el[1]->plaintext;
                    $rel3n = $el[2]->plaintext;
                    $rel4n = $el[3]->plaintext;
                    $rel5n = $el[4]->plaintext;
                }
            }

            $vehfitment = $html->find('div#appanel_1')->outertext;
        } else {
            echo "htmldom issue";
        }

        $manufacturer = "Skunk2";

        // Make your connection to database
        $con = mysql_connect($host, $username, $password);

        // Check your connection
        if (!$con) {
            die("Could not connect: " . mysql_error());
        }

        // Select your database
        $db_selected = mysql_select_db($database, $con);

        // Check to make sure the database is there
        if (!$db_selected) {
            die('Can\'t use the db : ' . mysql_error());
        //}

        // Run query
        $result = mysql_query("INSERT INTO $table(manufacturer, partnumber, title, productnote, description, rel1img, rel2img, rel3img, rel4img, rel5img, rel1name, rel2name, rel3name, rel4name, rel5name, image, vehfitment) VALUES('".$manufacturer."','".$partnumber."','".$title."','".$productnote."','".$description."','".$rel1."','".$rel2."','".$rel3."','".$rel4."','".$rel5."','".$rel1n."','".$rel2n."','".$rel3n."','".$rel4n."','".$rel5n."','".$imagehref."','".$vehfitment."')");

        echo '.$manufacturer.<br>.$partnumber.<br>.$title.<br>.$productnote.<br>.$description.<br>.$rel1.<br>.$rel1n.<br>.$image.<br>.$imagehref.<br>.$vehfitment.';

        for ($k = 0; $k < count($image); $k++) {
            echo '<img src="'.$image[$k].'"><br/>';
            $isok = copy($image[$k], dirname(__FILE__).'/desktop/skunk2'.($k+1).'.jpg');
            if (isok == true) {
                echo ' success!';
            }
            else {
                echo ' Fail';
            }
        }

        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("store.skunk2.com");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Tell the crawler to stream everything but "text/html" documents to a tmp-file
$crawler->addStreamToFileContentType("#^((?!text/html).)*$#");

// User-agent string
$crawler->setUserAgentString("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

// Follow modes:
// 0 - The crawler will follow EVERY link, even if the link leads to a different host or domain.
//     If you choose this mode, you really should set a limit to the crawling process (see limit options),
//     otherwise the crawler may crawl the whole WWW!
// 1 - The crawler only follows links that lead to the same domain as the root URL.
//     E.g. if the root URL (setURL()) is "http://www.foo.com", the crawler will follow links to "http://www.foo.com/..."
//     and "http://bar.foo.com/...", but not to "http://www.another-domain.com/...".
// 2 - The crawler will only follow links that lead to the same host as the root URL.
//     E.g. if the root URL (setURL()) is "http://www.foo.com", the crawler will ONLY follow links to "http://www.foo.com/...",
//     but not to "http://bar.foo.com/..." or "http://www.another-domain.com/...". This is the default mode.
// 3 - The crawler only follows links to pages or files located in or under the same path as the root URL.
//     E.g. if the root URL is "http://www.foo.com/bar/index.html", the crawler will follow links to "http://www.foo.com/bar/page.html" and
//     "http://www.foo.com/bar/path/index.html", but not links to "http://www.foo.com/page.html".
$crawler->setFollowMode(1);

// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") $lb = "\n";
else $lb = "<br />";
echo "Summary:".$lb;
echo "Links followed: ".$report->links_followed.$lb;
echo "Documents received: ".$report->files_received.$lb;
echo "Bytes received: ".$report->bytes_received." bytes".$lb;
echo "Process runtime: ".$report->process_runtime." sec".$lb;
?>
Answer 0 (score: 1)
Try this:
$html = new simple_html_dom();
$html->load_file($DocInfo->url);

if ($html && is_object($html) && isset($html->nodes)) {
    ...
}
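A side note on the question's extraction code: simple_html_dom's find() returns an array of matching elements, so even once $html loads correctly, calls like $html->find('.product-name')->plaintext will still fail; passing an index, as in find('.product-name', 0), returns a single element. The selector 'div[class=product-sku' is also missing its closing bracket. Below is a minimal sketch of how the check above could sit inside handleDocumentInfo(), parsing the source PHPCrawl already downloaded ($DocInfo->source) rather than re-fetching each URL with file_get_html(); the selectors are copied from the question, and this is untested against the actual store markup:

function handleDocumentInfo($DocInfo)
{
    $html = new simple_html_dom();

    // Reuse the markup the crawler already fetched instead of making a second HTTP request
    $html->load($DocInfo->source);

    if ($html && is_object($html) && isset($html->nodes)) {
        // find() with an index returns one element, or null if nothing matches
        $sku  = $html->find('div[class=product-sku]', 0);
        $name = $html->find('.product-name', 0);

        $partnumber = $sku  ? $sku->plaintext  : '';
        $title      = $name ? $name->plaintext : '';

        // ... extract the remaining fields the same way ...
    } else {
        echo "htmldom issue";
    }

    $html->clear(); // free the DOM between pages
}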
Answer 1 (score: 0)
The source code below runs smoothly; it comes from the links at the end of this answer.
<?php
set_time_limit(10000);

// Include the phpcrawl main class
include("libs/PHPCrawler.class.php");

// Extend the class and override the handleDocumentInfo() method
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Just detect linebreak for output ("\n" in CLI mode, otherwise "<br>")
        if (PHP_SAPI == "cli") $lb = "\n";
        else $lb = "<br />";

        // Print the URL and the HTTP status code
        echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb;

        // Print the referring URL
        echo "Referer-page: ".$DocInfo->referer_url.$lb;

        // Print whether the content of the document was received or not
        if ($DocInfo->received == true)
            echo "Content received: ".$DocInfo->bytes_received." bytes".$lb;
        else
            echo "Content not received".$lb;

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source); we skip it in this example
        echo $lb;
        flush();
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("www.php.net");

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic limit to 1 MB (in bytes;
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") $lb = "\n";
else $lb = "<br />";

echo "Summary:".$lb;
echo "Links followed: ".$report->links_followed.$lb;
echo "Documents received: ".$report->files_received.$lb;
echo "Bytes received: ".$report->bytes_received." bytes".$lb;
echo "Process runtime: ".$report->process_runtime." sec".$lb;
?>
http://quandaflow.com/php-web-crawler/
http://phpcrawl.cuab.de/example.html
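To point that working skeleton at the store from the question, presumably only the start URL and follow mode need to change; a small sketch using the values from the question's own script:

$crawler->setURL("store.skunk2.com");
$crawler->setFollowMode(1); // stay within the skunk2 domain, as in the question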