我的抓取器脚本一直返回“500 Internal Server Error”(500 内部服务器错误),我正在努力解决这个问题。我很确定这与脚本的执行时间有关,但不知道该如何解决。我查了一些资料,发现这类错误通常与 .htaccess 文件有关,但我的文件夹中并没有这个文件,我也从未创建过。所以我猜只是服务器因为某种原因超时了。这是我的代码:
<?php
/*
 * Crawler run script.
 *
 * Flow, as implemented below:
 *   1. Read the newest row of `statuz`; continue only when its status is '2'
 *      or no status could be read, otherwise print "Not ready yet".
 *   2. If `crawler_queue` has no row with status != 0, enqueue every crawler
 *      with status 0.
 *   3. Insert a '1' row into `statuz`, take one queue row with status '0',
 *      crawl every URL configured for that crawler and store each article
 *      in `crawler_results`.
 *   4. Write the outcome back to the queue row and set the newest `statuz`
 *      row to '2'.
 *
 * NOTE(review): $con is not defined in this snippet; it is presumably a
 * mysqli connection opened by code that runs before this point - confirm.
 */
ini_set('max_execution_time', 0);
set_time_limit(0);
error_reporting(E_ALL);
ini_set('display_errors', 1);

$res5 = mysqli_query($con, "SELECT status FROM statuz ORDER BY id DESC LIMIT 1");
$row5 = $res5 ? mysqli_fetch_array($res5) : null;
// An empty statuz table yields no row; treat that as "no status yet".
$status = isset($row5['status']) ? $row5['status'] : null;

if ($status == '2' || !isset($status)) {
    // Refill the queue when it holds no row with a non-zero status.
    $res10 = mysqli_query($con, "SELECT id FROM crawler_queue WHERE status != 0");
    $row10 = $res10 ? mysqli_num_rows($res10) : 0;
    if ($row10 == 0) {
        mysqli_query($con, "INSERT INTO crawler_queue (crawler_id, status)
            (SELECT id,0 FROM crawlers)");
    }

    mysqli_query($con, "INSERT INTO statuz (status) VALUES ('1')");

    // Scraped titles and bodies contain arbitrary quotes and markup, so they
    // are bound as parameters instead of being interpolated into the SQL.
    $insert_stmt = mysqli_prepare($con, "INSERT INTO crawler_results
        (title,content,newspaper_id,crawler_id,img_src,category,config_id,status)
        VALUES (?,?,?,?,?,?,?,'3')");

    $initial_res = mysqli_query($con, "SELECT crawler_id,id FROM crawler_queue WHERE status = '0' LIMIT 1");
    while ($initial_res && ($initial_row = mysqli_fetch_array($initial_res))) {
        $config_id = $initial_row['id'];
        $start_crawler_id = $initial_row['crawler_id'];

        // Outcome written back to the queue row once this crawler is done.
        // Kept separate from the run-level $status (which may be null) and
        // started at '2', the value that gate lets through when a status
        // row exists.
        $queue_status = '2';

        mysqli_query($con, "UPDATE crawler_queue SET status = '1' WHERE crawler_id = '$start_crawler_id' ORDER BY id DESC LIMIT 1");

        $res = mysqli_query($con, "SELECT href_xpath,content_xpath,url,image_xpath,forfatter_xpath,dato_xpath,custom_url,custom_content,crawlers.newspaper_id AS newspaper_id, crawler_urls.id AS crawler_id, crawler_urls.rss AS rss, categories.id AS category_id, newspapers.name AS newspaper_name, crawlers.id AS id
            FROM crawlers
            INNER JOIN crawler_urls
            ON crawlers.newspaper_id = crawler_urls.newspaper_id
            INNER JOIN categories
            ON crawler_urls.category_id = categories.id
            INNER JOIN newspapers
            ON newspapers.id = crawlers.newspaper_id
            WHERE crawlers.id = '$start_crawler_id'");

        while ($res && ($row = mysqli_fetch_array($res))) {
            $href = $row['href_xpath'];
            $url = $row['url'];
            $newspaper_id = $row['newspaper_id'];
            $crawler_id = $row['crawler_id'];
            $content_xpath = $row['content_xpath'];
            $image_xpath = $row['image_xpath'];
            $rss = $row['rss'];
            $category = $row['category_id'];
            $custom_url = $row['custom_url'];
            $custom_content = $row['custom_content'];
            // Not read by the visible code. Kept in scope because the eval'd
            // $custom_content snippet below may reference locals - TODO
            // confirm and remove.
            $dato_xpath = $row['dato_xpath'];
            $forfatter_xpath = $row['forfatter_xpath'];
            $newspaper_name = $row['newspaper_name'];
            $id = $row['id'];

            // Parallel lists filled per discovered link: 'href' and 'title'
            // always, 'img_url' only in the branches that set it below.
            $data = array();

            if ($rss == '0') {
                // Listing page is HTML: collect links through the href XPath.
                $html = new DOMDocument();
                if ((string) $url !== '') {
                    // '@' silences the parser warnings for malformed markup.
                    @$html->loadHtmlFile($url);
                }
                $xpath = new DOMXPath($html);
                $nodelist_href = $xpath->query("$href");
                if ($nodelist_href !== false) {
                    foreach ($nodelist_href as $node) {
                        $link = $node->getAttribute("href");
                        $is_site_relative = substr($link, 0, 1) === '/';
                        // Site-relative links are prefixed with the listing URL.
                        $resolved = $is_site_relative ? $url . $link : $link;

                        if (isset($custom_url)) {
                            if ($is_site_relative && $newspaper_id == 9) {
                                $data['href'][] = "http://www.bloomberg.com" . $link;
                            } else {
                                $data['href'][] = $resolved . $custom_url;
                            }
                            // The un-suffixed page is used for the image lookup.
                            $data['img_url'][] = $resolved;
                        } else {
                            $data['href'][] = $resolved;
                        }
                        $data['title'][] = $node->nodeValue;
                    }
                }
            } else {
                // Listing is an RSS feed; a failed load returns false.
                $xml = simplexml_load_file($url);
                if ($xml !== false && isset($xml->channel->item)) {
                    foreach ($xml->channel->item as $item) {
                        if ($newspaper_id == 3) {
                            $data['href'][] = $item->link . '' . $custom_url;
                            $data['img_url'][] = $item->link;
                        } else {
                            $data['href'][] = $item->link;
                        }
                        $data['title'][] = $item->title;
                    }
                }
            }

            // No links found leaves 'href' unset; count() must not see null.
            $link_count = isset($data['href']) ? count($data['href']) : 0;
            for ($i = 0; $i < $link_count; $i++) {
                // Feed entries are SimpleXMLElement objects; work on strings.
                $links_href = (string) $data['href'][$i];
                $links_title = (string) $data['title'][$i];
                $specific_img_url = isset($data['img_url'][$i]) ? (string) $data['img_url'][$i] : null;

                if ($links_href === '') {
                    // Nothing to fetch; loadHTMLFile() rejects an empty path
                    // on PHP 8.
                    continue;
                }

                $html = new DOMDocument();
                if ($newspaper_id == '3') {
                    $h = file_get_contents($links_href);
                    if ($h === false) {
                        continue; // fetch failed, skip this article
                    }
                    // SECURITY NOTE(review): this executes PHP stored in the
                    // custom_content column. Presumably it rewrites $h before
                    // parsing - confirm, and replace with a non-eval
                    // mechanism if that column can ever hold untrusted data.
                    eval($custom_content);
                    if (!is_string($h) || $h === '') {
                        continue; // loadHTML() rejects empty input on PHP 8
                    }
                    @$html->loadHtml($h);
                } else {
                    @$html->loadHtmlFile($links_href);
                }

                $xpath = new DOMXPath($html);
                $html->preserveWhiteSpace = false;

                // Concatenate the markup of every node matched by the content XPath.
                $content = '';
                $nodelist_xpath = $xpath->query("$content_xpath");
                if ($nodelist_xpath !== false) {
                    foreach ($nodelist_xpath as $node) {
                        $content .= $html->saveHTML($node);
                    }
                }

                // When a dedicated image page is known, look there;
                // otherwise reuse the article document loaded above.
                if ($specific_img_url !== null) {
                    $html = new DOMDocument();
                    if ($specific_img_url !== '') {
                        @$html->loadHtmlFile($specific_img_url);
                    }
                    $xpath = new DOMXPath($html);
                    $html->preserveWhiteSpace = false;
                }

                // Keep the last non-empty src among the matched nodes.
                // getAttribute() returns '' for a missing attribute, so an
                // isset() check can never filter those out.
                $img_src = "";
                $img_href = $xpath->query("$image_xpath");
                if ($img_href !== false) {
                    foreach ($img_href as $node) {
                        $is_img_there = $node->getAttribute("src");
                        if ($is_img_there !== '') {
                            $img_src = $is_img_there;
                        }
                    }
                }
                if ($newspaper_id == '33') {
                    $img_src = str_replace("/sites/", "", $img_src);
                }

                // Strip image tags, opening script tags and the inline
                // "var startIndex" line from the stored markup. Quote
                // escaping is not needed: the INSERT binds its parameters.
                $content = preg_replace("/<img[^>]+\>/i", "", $content);
                $content = preg_replace("/<script[^>]+\>/i", "", $content);
                $content = (string) preg_replace("/var startIndex.*/i", "", $content);

                // Articles without content are skipped and leave the queue
                // status untouched.
                if ($content !== '') {
                    if ($links_title !== '' && $newspaper_id != NULL) {
                        $inserted = $insert_stmt !== false
                            && mysqli_stmt_bind_param(
                                $insert_stmt,
                                'sssssss',
                                $links_title,
                                $content,
                                $newspaper_id,
                                $crawler_id,
                                $img_src,
                                $category,
                                $config_id
                            )
                            && mysqli_stmt_execute($insert_stmt);
                        // Only report success when the row was really stored.
                        $queue_status = $inserted ? '2' : '-1';
                    } else {
                        $queue_status = '-1';
                    }
                }
            }
        }

        mysqli_query($con, "UPDATE crawler_queue SET status = '$queue_status' WHERE crawler_id = '$start_crawler_id' ORDER BY id DESC LIMIT 1");
    }

    if ($insert_stmt !== false) {
        mysqli_stmt_close($insert_stmt);
    }

    echo "Completed";
    mysqli_query($con, "UPDATE statuz SET status = '2' ORDER BY id DESC LIMIT 1");
} else {
    echo "Not ready yet";
}
?>
非常感谢任何形式的帮助!