我正在尝试使用以下代码搜索Amazon ASIN代码:
<?php
class Scraper {
const BASE_URL = "http://www.amazon.com";
private $categoryFile = "";
private $outputFile = "";
private $catArray;
private $currentPage = NULL;
private $asin = array();
private $categoriesMatched = 0;
private $categoryProducts = array();
private $pagesMatched = 0;
private $totalPagesMatched = 0;
private $productsMatched = 0;
public function __construct($categoryFile, $outputFile) {
$this->categoryFile = $categoryFile;
$this->outputFile = $outputFile;
}
public function run() {
$this->readCategories($this->categoryFile);
$this->setupASINArray($this->asin);
$x = 1;
foreach ($this->catArray as $cat) {
$this->categoryProducts["$x"] = 0;
if ($this->currentPage == NULL) {
$this->currentPage = $cat;
$this->scrapeASIN($this->currentPage, $x);
$this->pagesMatched++;
}
if ($this->getNextPageLink($this->currentPage)) {
do {
// next page found
$this->pagesMatched++;
$this->scrapeASIN($this->currentPage, $x);
} while ($this->getNextPageLink($this->currentPage));
}
echo "Category complete: $this->pagesMatched Pages" . "\n";
$this->totalPagesMatched += $this->pagesMatched;
$this->pagesMatched = 0;
$this->writeASIN($this->outputFile, $x);
$x++;
$this->currentPage = NULL;
$this->categoriesMatched++;
}
$this->returnStats();
}
private function readCategories($categoryFile) {
$catArray = file($categoryFile, FILE_IGNORE_NEW_LINES);
$this->catArray = $catArray;
}
private function setupASINArray($asinArray) {
$x = 0;
foreach ($this->catArray as $cat) {
$asinArray["$x"][0] = "$cat";
$x++;
}
$this->asin = $asinArray;
}
private function getNextPageLink($currentPage) {
$document = new DOMDocument();
$html = file_get_contents($currentPage);
@$document->loadHTML($html);
$xpath = new DOMXPath($document);
$element = $xpath->query("//a[@id='pagnNextLink']/@href");
if ($element->length != 0) {
$this->currentPage = self::BASE_URL . $element->item(0)->value;
return true;
} else {
return false;
}
}
private function scrapeASIN($currentPage, $catNo) {
$html = file_get_contents($currentPage);
$regex = '~(?:www\.)?ama?zo?n\.(?:com|ca|co\.uk|co\.jp|de|fr)/(?:exec/obidos/ASIN/|o/|gp/product/|(?:(?:[^"\'/]*)/)?dp/|)(B[A-Z0-9]{9})(?:(?:/|\?|\#)(?:[^"\'\s]*))?~isx';
preg_match_all($regex, $html, $asin);
foreach ($asin[1] as $match) {
$this->asin[$catNo-1][] = $match;
}
}
private function writeASIN($outputFile, $catNo) {
$fh = fopen($outputFile, "a+");
$this->fixDupes($catNo);
$this->productsMatched += (count($this->asin[$catNo-1]) - 1);
$this->categoryProducts["$catNo"] = (count($this->asin[$catNo-1]) - 1);
flock($fh, LOCK_EX);
$x = 0;
foreach ($this->asin[$catNo-1] as $asin) {
fwrite($fh, "$asin" . "\n");
$x++;
}
flock($fh, LOCK_UN);
fclose($fh);
$x -= 1;
echo "$x ASIN codes written to file" . "\n";
}
private function fixDupes($catNo) {
$this->asin[$catNo-1] = array_unique($this->asin[$catNo-1], SORT_STRING);
}
public function returnStats() {
echo "Categories matched: " . $this->categoriesMatched . "\n";
echo "Pages parsed: " . $this->totalPagesMatched . "\n";
echo "Products parsed: " . $this->productsMatched . "\n";
echo "Category breakdown:" . "\n";
$x = 1;
foreach ($this->categoryProducts as $catProds) {
echo "Category $x had $catProds products" . "\n";
$x++;
}
}
}
$scraper = new Scraper($argv[1], $argv[2]);
$scraper->run();
?>
但它适用于Windows上的XAMPP,但不适用于Linux。关于为什么会这样的任何想法?有时它会将0 ASIN写入文件,有时它只会在400多页的类别中搜索1页。但是Windows / XAMPP中的输出/功能完全正常。
任何想法都将不胜感激!
干杯 - 布莱斯
答案 0 :(得分:0)
所以尝试改变这种方式,只是为了避免错误信息:
private function readCategories($categoryFile) {
if (file_exists($categoryFile)) {
$catArray = file($categoryFile, FILE_IGNORE_NEW_LINES);
$this->catArray = $catArray;
} else {
echo "File ".$categoryFile.' not exists!';
$this->catArray = array();
}
}