我试图从具有多个页面的网站中抓取数据,该代码适用于单次运行但是当我尝试将其置于循环中时,网站崩溃并且chrome说我内存不足。以下是我使用的javascript:
// an XMLHttpRequest
var xhr = null;
var page = 1;
function scrape()
{
// instantiate XMLHttpRequest object
try
{
xhr = new XMLHttpRequest();
}
catch (e)
{
xhr = new ActiveXObject("Microsoft.XMLHTTP");
}
// handle old browsers
if (xhr == null)
{
alert("Ajax not supported by your browser!");
return;
}
// get city
var city = document.getElementById("city").value;
// get data
xhr.onreadystatechange =
function()
{
// only handle loaded requests
if (xhr.readyState == 4)
{
if (xhr.status == 200)
{
// insert link into DOM
some stuff
}
else
alert("Error with Ajax call!");
}
}
while(xhr.responseText != ""done)
{
// construct URL
var url = "scrape_data.php?city=" + city + "+" + page ;
xhr.open("GET", url, true);
xhr.send(null);
page++;
}
}
这是相应的后端:
// get the q parameter
preg_match('/.*(?=\+)/',$_GET["city"],$city);
preg_match('/(?<=\+).*/',$_GET["city"],$page);
// scrape data from shiksha.com
$string = @file_get_contents("http://www.shiksha.com/b-tech/colleges/b-tech-colleges-".urlencode($city)."-{$page}");
if($page === 1)
{
// counting total number of pages
preg_match_all('/class=" linkpagination">/',$string,$result);
$GLOBALS["pages"] = sizeof($result[0]);
}
else $GLOBALS["pages"]--;
// passing the string for scraping data and storing in database
if($GLOBALS["pages"]=== 0)
print("loop");
现在我如何遍历所有页面,以及如何检查是否已解析所有页面(for循环中的条件)。提前感谢您的帮助。