所以我需要将所有txt文件加载到:http://orcahub.com/unchecked-proxy-list/作为一个txt文件,然后进入我的服务器,这是与Orcahub不同的一个;
由于某种原因,它不会工作。我不能让它实际上让HTML甚至做正则表达式。
我尝试了什么:
<?php
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://orcahub.com/unchecked-proxy-list');
curl_setopt($ch, CURLOPT_HEADER, FALSE);
curl_setopt($ch, CURLOPT_NOBODY, FALSE); // remove body
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$st = curl_exec($ch);
//curl_close($ch);
//preg_match_all("/(.*\.txt)/", $st, $out);
var_dump($ch);
?>
更新: 新问题,当我使用以下脚本时,我收到服务器错误500: 更新:发现此问题来自URL之后的换行符。
<?php
function disguise_curl($url) {
//Prepare Curl;
$curl = curl_init();
//Setup Headers (Firefox 2.0.0.6);
$header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
$header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
$header[] = "Cache-Control: max-age=0";
$header[] = "Connection: keep-alive";
$header[] = "Keep-Alive: 300";
$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
$header[] = "Accept-Language: en-us,en;q=0.5";
$header[] = "Pragma: ";
//Setup Curl;
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)');
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
curl_setopt($curl, CURLOPT_REFERER, 'http://orcahub.com/unchecked-proxy-list/');
curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
curl_setopt($curl, CURLOPT_AUTOREFERER, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_TIMEOUT, 60);
//Execute Curl;
$html = curl_exec($curl);
//End Curl;
curl_close($curl);
//Output the HTML;
return $html;
}
function rem_href($x) { return substr(strstr($x, '>'), strlen('>')); }
$response = disguise_curl('http://orcahub.com/unchecked-proxy-list/');
preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $response, $matches, PREG_SET_ORDER );
foreach($matches as $value) {
$proxylists[] = 'http://orcahub.com/unchecked-proxy-list/'.rem_href($value[0]);
};
echo $proxylists[0];
$response = disguise_curl($proxylists[0]);
//Server Error 500 Here;
echo $response;
?>
答案 0 :(得分:1)
来自php.net的一个函数,它添加了标头来伪装调用,我为解析响应添加了一个正则表达式:
function disguise_curl($url)
{
$curl = curl_init();
// Setup headers - I used the same headers from Firefox version 2.0.0.6
// below was split up because php.net said the line was too long. :/
$header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
$header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
$header[] = "Cache-Control: max-age=0";
$header[] = "Connection: keep-alive";
$header[] = "Keep-Alive: 300";
$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
$header[] = "Accept-Language: en-us,en;q=0.5";
$header[] = "Pragma: "; // browsers keep this blank.
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)');
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
curl_setopt($curl, CURLOPT_REFERER, 'http://www.google.com');
curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
curl_setopt($curl, CURLOPT_AUTOREFERER, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_TIMEOUT, 10);
$html = curl_exec($curl); // execute the curl command
curl_close($curl); // close the connection
return $html; // and finally, return $html
}
$response = disguise_curl('http://orcahub.com/unchecked-proxy-list/');
preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $response, $matches, PREG_SET_ORDER );
foreach($matches as $value) {
var_dump($value);
};