Amazon Scraper Script适用于XAMPP Windows,但不适用于Linux上的PHP5 Cli

时间:2015-01-31 14:59:26

标签: php linux windows screen-scraping

我正在尝试使用以下代码搜索Amazon ASIN代码:

<?php

class Scraper {

const BASE_URL = "http://www.amazon.com";
private $categoryFile = "";
private $outputFile = "";
private $catArray;
private $currentPage = NULL;
private $asin = array();
private $categoriesMatched = 0;
private $categoryProducts = array();
private $pagesMatched = 0;
private $totalPagesMatched = 0;
private $productsMatched = 0;

public function __construct($categoryFile, $outputFile) {

    $this->categoryFile = $categoryFile;
    $this->outputFile = $outputFile;

}

public function run() {

    $this->readCategories($this->categoryFile);
    $this->setupASINArray($this->asin);

    $x = 1;

    foreach ($this->catArray as $cat) {

        $this->categoryProducts["$x"] = 0;

        if ($this->currentPage == NULL) {

            $this->currentPage = $cat;
            $this->scrapeASIN($this->currentPage, $x);
            $this->pagesMatched++;

        }           

        if ($this->getNextPageLink($this->currentPage)) {

            do {

                // next page found
                $this->pagesMatched++;
                $this->scrapeASIN($this->currentPage, $x);

            } while ($this->getNextPageLink($this->currentPage));

        }

        echo "Category complete: $this->pagesMatched Pages" . "\n";
        $this->totalPagesMatched += $this->pagesMatched;
        $this->pagesMatched = 0;
        $this->writeASIN($this->outputFile, $x);
        $x++;
        $this->currentPage = NULL;
        $this->categoriesMatched++;



    }

    $this->returnStats();


}

private function readCategories($categoryFile) {

    $catArray = file($categoryFile, FILE_IGNORE_NEW_LINES);

    $this->catArray = $catArray;

}

private function setupASINArray($asinArray) {

    $x = 0;

    foreach ($this->catArray as $cat) {

        $asinArray["$x"][0] = "$cat";
        $x++;

    }

    $this->asin = $asinArray;

}

private function getNextPageLink($currentPage) {

    $document = new DOMDocument();

    $html = file_get_contents($currentPage);

    @$document->loadHTML($html);

    $xpath = new DOMXPath($document);

    $element = $xpath->query("//a[@id='pagnNextLink']/@href");

    if ($element->length != 0) {

        $this->currentPage = self::BASE_URL . $element->item(0)->value;
        return true;

    } else {

        return false;

    }


}

private function scrapeASIN($currentPage, $catNo) {

    $html = file_get_contents($currentPage);

    $regex = '~(?:www\.)?ama?zo?n\.(?:com|ca|co\.uk|co\.jp|de|fr)/(?:exec/obidos/ASIN/|o/|gp/product/|(?:(?:[^"\'/]*)/)?dp/|)(B[A-Z0-9]{9})(?:(?:/|\?|\#)(?:[^"\'\s]*))?~isx';

    preg_match_all($regex, $html, $asin);

    foreach ($asin[1] as $match) {

        $this->asin[$catNo-1][] = $match;

    }   


}

private function writeASIN($outputFile, $catNo) {

    $fh = fopen($outputFile, "a+");

    $this->fixDupes($catNo);
    $this->productsMatched += (count($this->asin[$catNo-1]) - 1);
    $this->categoryProducts["$catNo"] = (count($this->asin[$catNo-1]) - 1);

    flock($fh, LOCK_EX);

    $x = 0;

    foreach ($this->asin[$catNo-1] as $asin) {

        fwrite($fh, "$asin" . "\n");

        $x++;

    }



    flock($fh, LOCK_UN);

    fclose($fh);

    $x -= 1;

    echo "$x ASIN codes written to file" . "\n";

}

private function fixDupes($catNo) {

    $this->asin[$catNo-1] = array_unique($this->asin[$catNo-1], SORT_STRING);

}

public function returnStats() {

    echo "Categories matched: " . $this->categoriesMatched . "\n";
    echo "Pages parsed: " . $this->totalPagesMatched . "\n";
    echo "Products parsed: " . $this->productsMatched . "\n";
    echo "Category breakdown:" . "\n";

    $x = 1;

    foreach ($this->categoryProducts as $catProds) {

        echo "Category $x had $catProds products" . "\n";
        $x++;

    }

}

}

$scraper = new Scraper($argv[1], $argv[2]);
$scraper->run();

?>

但它适用于Windows上的XAMPP,但不适用于Linux。关于为什么会这样的任何想法?有时它会将0 ASIN写入文件,有时它只会在400多页的类别中搜索1页。但是Windows / XAMPP中的输出/功能完全正常。

任何想法都将不胜感激!

干杯 - 布莱斯

1 个答案:

答案 0 :(得分:0)

所以尝试改变这种方式,只是为了避免错误信息:

private function readCategories($categoryFile) {

if (file_exists($categoryFile)) {
    $catArray = file($categoryFile, FILE_IGNORE_NEW_LINES);

    $this->catArray = $catArray;
} else {
    echo "File ".$categoryFile.' not exists!';
    $this->catArray = array();
}

}