使用simplehtmldom PHPCrawl来解析数据

时间:2013-12-19 18:42:20

标签: php mysql simple-html-dom phpcrawl

我正在尝试使用PHPCrawl来抓取和收集URL,然后提供给simplehtmldom以从html中提取所需数据并存储在mysql数据库中。现在我收到错误

**

  

致命错误:调用未定义的方法simple_html_dom :: find()in   第44行/home/content/54/11109254/html/PHPCrawl_081/skunktest.php

**

任何人都可以帮忙解决我做错了什么,也许可以看一眼,看看我之前是否还有其他障碍?

    <?php
set_time_limit(1000000);

// Inculde the phpcrawl-mainclass
include("libs/PHPCrawler.class.php");

// Include Simplehtmldom
include("../simple_html_dom.php");

// Extend the class and override the handleDocumentInfo()-method 
class MyCrawler extends PHPCrawler 
{
  function handleDocumentInfo($DocInfo) 
  {

    // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>").
    if (PHP_SAPI == "cli") $lb = "\n";
    else $lb = "<br />";

    // Print the URL and the HTTP-status-Code
    echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb;

    // Print the refering URL
    echo "Referer-page: ".$DocInfo->referer_url.$lb;

    // Print if the content of the document was be recieved or not
    if ($DocInfo->received == true)
      echo "Content received: ".$DocInfo->bytes_received." bytes".$lb;
    else
      echo "Content not received".$lb; 

    // Now you should do something with the content of the actual
    // received page or file ($DocInfo->source), we skip it in this example 

    $result = $DocInfo->url;



    $html = file_get_html($result);


    if($html && is_object($html) && isset($html->nodes)){

    $partnumber = $html->find('div[class=product-sku')->plaintext;

    $title = $html->find('.product-name')->plaintext;

    $productnote = $html->find('.product-note')->plaintext;

    $description = $html->find('.product-description')->innertext;


    foreach($html->find('.MagicZoomBigImageCont') as $img)
        {
            foreach($img->find('img') as $e)
                {
                    $image = $e;
                    $imagehref = $e->href;

                }
        }

    foreach($html->find('.p-related-image') as $rel)
        {
            foreach($rel->find('a') as $element)
                {
                    $rel1 = $element[0]->href;
                    $rel2 = $element[1]->href;
                    $rel3 = $element[2]->href;
                    $rel4 = $element[3]->href;
                    $rel5 = $element[4]->href;
                }
        }


    foreach($html->find('.p-related-name') as $name)
        {
            foreach($name->find('a') as $el)
                {
                    $rel1n = $el[0]->plaintext;
                    $rel2n = $el[1]->plaintext;
                    $rel3n = $el[2]->plaintext;
                    $rel4n = $el[3]->plaintext;
                    $rel5n = $el[4]->plaintext;
                }
        }

    $vehfitment = $html->find('div#appanel_1')->outertext;
    }else{echo "htmldom issue";}

    $manufacturer = "Skunk2";




//Make your connection to database
$con = mysql_connect($host,$username,$password);

//Check your connection
if (!$con) {
die("Could not connect: " . mysql_error());
}

//Select your database
$db_selected = mysql_select_db($database, $con);

//Check to make sure the database is there
if (!$db_selected) {
    die ('Can\'t use the db : ' . mysql_error());
//}

//Run query
$result = mysql_query("INSERT INTO $table(manufacturer, partnumber, title, productnote, description, rel1img, rel2img, rel3img, rel4img, rel5img, rel1name, rel2name, rel3name, rel4name, rel5name, image, vehfitment) VALUES('".$manufacturer."','".$partnumber."','".$title."','".$productnote."','".$description."','".$rel1."','".$rel2."','".$rel3."','".$rel4."','".$rel5."','".$rel1n."','".$rel2n."','".$rel3n."','".$rel4n."','".$rel5n."','".$imagehref."','".$vehfitment."')");

echo '.$manufacturer.<br>.$partnumber.<br>.$title.<br>.$productnote.<br>.$description.<br>.$rel1.<br>.$rel1n.<br>.$image.<br>.$imagehref.<br>.$vehfitment.';

for($k=0;$k<count($image);$k++){

echo '<img src="'.$image[$k].'"><br/>';

$isok=copy($image[$k] , dirname(__FILE__).'/desktop/skunk2'.($k+1).'.jpg');

if(isok==true){
   echo' success!';  
}
else{
  echo ' Fail';  
}
}

    echo $lb;

    flush();
  }
} 


// Now, create a instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process. 

$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("store.skunk2.com");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Tell the crawler to stream everything but "text/html"-documents to a tmp-file
$crawler->addStreamToFileContentType("#^((?!text/html).)*$#");

//User Agent String

$crawler->setUserAgentString("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

//0 - The crawler will follow EVERY link, even if the link leads to a different host or domain.
//If you choose this mode, you really should set a limit to the crawling-process (see limit-options),
//otherwise the crawler maybe will crawl the whole WWW!

//1 - The crawler only follow links that lead to the same domain like the one in the root-url.
//E.g. if the root-url (setURL()) is "http://www.foo.com", the crawler will follow links to "http://www.foo.com/..."
//and "http://bar.foo.com/...", but not to "http://www.another-domain.com/...".

//2 - The crawler will only follow links that lead to the same host like the one in the root-url.
//E.g. if the root-url (setURL()) is "http://www.foo.com", the crawler will ONLY follow links to "http://www.foo.com/...", but not
//to "http://bar.foo.com/..." and "http://www.another-domain.com/...". This is the default mode.

//3 - The crawler only follows links to pages or files located in or under the same path like the one of the root-url.
//E.g. if the root-url is "http://www.foo.com/bar/index.html", the crawler will follow links to "http://www.foo.com/bar/page.html" and
//"http://www.foo.com/bar/path/index.html", but not links to "http://www.foo.com/page.html"
$crawler->setFollowMode(1);

// Thats enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") $lb = "\n";
else $lb = "<br />";

echo "Summary:".$lb;
echo "Links followed: ".$report->links_followed.$lb;
echo "Documents received: ".$report->files_received.$lb;
echo "Bytes received: ".$report->bytes_received." bytes".$lb;
echo "Process runtime: ".$report->process_runtime." sec".$lb; 
?>

2 个答案:

答案 0 :(得分:1)

试试这个

$html = new simple_html_dom();
$html->load_file($DocInfo->url;);

if($html && is_object($html) && isset($html->nodes)){
...
}

答案 1 :(得分:0)

使用以下链接可以顺利运行此链接源代码。

&#13;
&#13;
<?php
set_time_limit(10000);
// Inculde the phpcrawl-mainclass
include("libs/PHPCrawler.class.php");
// Extend the class and override the handleDocumentInfo()-method
class MyCrawler extends PHPCrawler
{
function handleDocumentInfo($DocInfo)
{
// Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>").
if (PHP_SAPI == "cli") $lb = "\n";
else $lb = "<br />";
// Print the URL and the HTTP-status-Code
echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb;
// Print the refering URL
echo "Referer-page: ".$DocInfo->referer_url.$lb;
// Print if the content of the document was be recieved or not
if ($DocInfo->received == true)
echo "Content received: ".$DocInfo->bytes_received." bytes".$lb;
else
echo "Content not received".$lb;
// Now you should do something with the content of the actual
// received page or file ($DocInfo->source), we skip it in this example
echo $lb;
flush();
}
}
// Now, create a instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("www.php.net");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, dont even request pictures
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we dont want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// Thats enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") $lb = "\n";
else $lb = "<br />";
echo "Summary:".$lb;
echo "Links followed: ".$report->links_followed.$lb;
echo "Documents received: ".$report->files_received.$lb;
echo "Bytes received: ".$report->bytes_received." bytes".$lb;
echo "Process runtime: ".$report->process_runtime." sec".$lb;
?>
&#13;
&#13;
&#13;

http://quandaflow.com/php-web-crawler/ http://phpcrawl.cuab.de/example.html