我正在尝试编写自己的第一个能正常工作的 cURL 函数。我以 Nettuts 的 cURL 教程(http://net.tutsplus.com/tutorials/php/techniques-and-resources-for-mastering-curl/)中的 WordPress 链接检查器为基础,出于安全原因重写了数据库访问部分。我不明白它为什么不起作用,因为我只重写了数据库访问部分,并对第 32 行做了一些修改。我也会贴出 Nettuts 的原始代码,希望能有所帮助。这段代码用于检查指向文档(.pdf 和 .doc)的链接是否仍然有效,或者是否需要更新。
任何帮助将不胜感激!
原始代码
// CONFIG
// Credentials and schema name of the MySQL database that holds the posts.
$db_host = 'localhost';
$db_user = 'root';
$db_pass = '';
$db_name = 'wordpress';
// Links whose host appears in this list are skipped and never checked.
$excluded_domains = array(
'localhost', 'www.mydomain.com');
// Number of URLs queued on the multi handle before the first curl_multi_exec().
$max_connections = 10;
// initialize some variables
// $url_list collects every link to check, the three *_urls arrays receive the
// results, and $active is filled in by curl_multi_exec() further down.
$url_list = array();
$working_urls = array();
$dead_urls = array();
$not_found_urls = array();
$active = null;
// connect to MySQL
// NOTE(review): the mysql_* functions used in this section belong to the legacy
// ext/mysql extension, which was removed in PHP 7, so this part needs PHP 5.
if (!mysql_connect($db_host, $db_user, $db_pass)) {
die('Could not connect: ' . mysql_error());
}
if (!mysql_select_db($db_name)) {
die('Could not select db: ' . mysql_error());
}
// get all published posts that have links
// Only rows of type 'post' with status 'publish' whose content contains the
// text "href=" are selected.
$q = "SELECT post_content FROM wp_posts
WHERE post_content LIKE '%href=%'
AND post_status = 'publish'
AND post_type = 'post'";
$r = mysql_query($q) or die(mysql_error());
// Each fetched row is bound to $d; the regex below reads $d['post_content'].
while ($d = mysql_fetch_assoc($r)) {
// get all links via regex
// $matches[1] holds the text captured between the double quotes of each href.
if (preg_match_all("!href=\"(.*?)\"!", $d['post_content'], $matches)) {
foreach ($matches[1] as $url) {
// exclude some domains
// NOTE(review): parse_url() yields no 'host' key for relative links, so this
// lookup can raise an undefined-index notice for such URLs.
$tmp = parse_url($url);
if (in_array($tmp['host'], $excluded_domains)) {
continue;
}
// store the url
$url_list []= $url;
}
}
}
// remove duplicates
// array_values() re-indexes from 0 after array_unique() so the list can be
// walked by position.
$url_list = array_values(array_unique($url_list));
// Nothing to do: stop before any cURL handle is created.
if (!$url_list) {
die('No URL to check');
}
// Check every URL in $url_list through a single cURL multi handle, sorting the
// results into $dead_urls, $not_found_urls and $working_urls, then print a
// plain-text report.
//
// 1. multi handle
$mh = curl_multi_init();
// 2. queue the first batch of URLs; the helper simply returns false once the
//    list is exhausted, so a short list is fine
for ($i = 0; $i < $max_connections; $i++) {
    add_url_to_multi_handle($mh, $url_list);
}
// 3. main loop: runs until nothing is active and nothing is left to queue
while (true) {
    // 4. let cURL make progress on every queued transfer
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    if ($mrc != CURLM_OK) {
        break;
    }
    // 5. drain EVERY finished transfer. Several can complete during one
    //    curl_multi_exec() call, and the last ones complete exactly when
    //    $active drops to 0; reading a single message per pass would leave
    //    those results unreported.
    $queued_more = false;
    while ($mhinfo = curl_multi_info_read($mh)) {
        $chinfo = curl_getinfo($mhinfo['handle']);
        if (!$chinfo['http_code']) {
            // 6. no HTTP status at all: the request never got a response
            $dead_urls[] = $chinfo['url'];
        } elseif ($chinfo['http_code'] == 404) {
            // 7. the server answered "not found"
            $not_found_urls[] = $chinfo['url'];
        } else {
            // 8. any other status counts as working
            $working_urls[] = $chinfo['url'];
        }
        // 9. release the finished handle
        curl_multi_remove_handle($mh, $mhinfo['handle']);
        curl_close($mhinfo['handle']);
        // 10. refill the freed slot with the next pending URL, if any
        if (add_url_to_multi_handle($mh, $url_list)) {
            $queued_more = true;
        }
    }
    // 11. freshly queued handles must be started before $active is trusted
    if ($queued_more) {
        continue;
    }
    if (!$active) {
        break;
    }
    // 12. wait for socket activity; on -1 (select failure) pause briefly
    //     instead of spinning at full CPU
    if (curl_multi_select($mh) == -1) {
        usleep(1000);
    }
}
// 13. finished
curl_multi_close($mh);
echo "==Dead URLs==\n";
echo implode("\n", $dead_urls) . "\n\n";
echo "==404 URLs==\n";
echo implode("\n", $not_found_urls) . "\n\n";
echo "==Working URLs==\n";
echo implode("\n", $working_urls);
// 15. adds a url to the multi handle
/**
 * Queue the next pending URL from $url_list on the cURL multi handle.
 *
 * A static cursor remembers how far into the list previous calls got, so each
 * call hands out the next entry. Empty-string entries (produced by href="")
 * are skipped rather than treated as the end of the list.
 *
 * @param resource|CurlMultiHandle $mh       multi handle the new easy handle is added to
 * @param array                    $url_list zero-indexed list of URLs to check
 * @return bool true when a URL was queued, false when the list is exhausted
 */
function add_url_to_multi_handle($mh, $url_list) {
    static $index = 0;
    // isset() rather than a bare read: indexing past the end of the list
    // raises an undefined-offset notice on every call after the last URL
    while (isset($url_list[$index])) {
        $url = $url_list[$index];
        // advance first so a skipped entry can never stall the cursor
        $index++;
        if ($url === '') {
            continue;
        }
        // new curl handle
        $ch = curl_init();
        // set the url
        curl_setopt($ch, CURLOPT_URL, $url);
        // to prevent the response from being outputted
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // follow redirections
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        // do not need the body. this saves bandwidth and time
        curl_setopt($ch, CURLOPT_NOBODY, 1);
        // add it to the multi handle
        curl_multi_add_handle($mh, $ch);
        return true;
    }
    // we are done adding new URLs
    return false;
}
我的代码
<?php
/* Config: connection settings for the MySQL server */
/*** mysql hostname ***/
$hostname = 'localhost';
/*** mysql username ***/
$username = 'root';
/*** mysql password ***/
$password = 'root';
/* curl setup of variables */
// Links whose host appears in this list are skipped and never checked.
$excluded_domains = array(
'localhost', 'rollnstroll.se');
// Number of URLs queued on the multi handle before the first curl_multi_exec().
$max_connections = 10;
// $url_list collects every link to check, the three *_urls arrays receive the
// results, and $active is filled in by curl_multi_exec() further down.
$url_list = array();
$working_urls = array();
$dead_urls = array();
$not_found_urls = array();
$active = null;
try {
// Connect to the "blankett" database through PDO.
$dbh = new PDO("mysql:host=$hostname;dbname=blankett", $username, $password);
$dbh->exec('SET CHARACTER SET utf8');
/*** select the link column of the forms2 table ***/
// NOTE(review): the statement contains a "?" placeholder, but query() sends
// the SQL as-is and nothing is ever bound to it. Supplying an id would need
// prepare() + execute(); checking every row would need the WHERE clause removed.
$sql = "SELECT link_forms FROM forms2 WHERE id = ?";
$stmt = $dbh->query($sql);
// NOTE(review): a single row is fetched into $result, which is not read again
// anywhere in this script.
$result = $stmt->fetch(PDO::FETCH_ASSOC);
// get all links via regex
// NOTE(review): $d is never assigned in this script (the fetched row is in
// $result), so $d['link_forms'] is undefined and the pattern has nothing to match.
if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {
foreach ($matches[1] as $url) {
// exclude some domains
// NOTE(review): parse_url() yields no 'host' key for relative links, so this
// lookup can raise an undefined-index notice for such URLs.
$tmp = parse_url($url);
if (in_array($tmp['host'], $excluded_domains)) {
continue;
}
// store the url
$url_list []= $url;
}
}
// remove duplicates
// array_values() re-indexes from 0 after array_unique() so the list can be
// walked by position.
$url_list = array_values(array_unique($url_list));
// Nothing to do: stop before any cURL handle is created.
if (!$url_list) {
die('No URL to check');
}
}
// A PDO failure is only printed; execution then continues with the cURL code
// below while $url_list is still empty.
catch(PDOException $e)
{
echo $e->getMessage();
}
// Check every URL in $url_list through a single cURL multi handle, sorting the
// results into $dead_urls, $not_found_urls and $working_urls, then print a
// plain-text report.
//
// 1. multi handle
$mh = curl_multi_init();
// 2. queue the first batch of URLs; the helper simply returns false once the
//    list is exhausted, so a short list is fine
for ($i = 0; $i < $max_connections; $i++) {
    add_url_to_multi_handle($mh, $url_list);
}
// 3. main loop: runs until nothing is active and nothing is left to queue
while (true) {
    // 4. let cURL make progress on every queued transfer
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    if ($mrc != CURLM_OK) {
        break;
    }
    // 5. drain EVERY finished transfer. Several can complete during one
    //    curl_multi_exec() call, and the last ones complete exactly when
    //    $active drops to 0; reading a single message per pass would leave
    //    those results unreported.
    $queued_more = false;
    while ($mhinfo = curl_multi_info_read($mh)) {
        $chinfo = curl_getinfo($mhinfo['handle']);
        if (!$chinfo['http_code']) {
            // 6. no HTTP status at all: the request never got a response
            $dead_urls[] = $chinfo['url'];
        } elseif ($chinfo['http_code'] == 404) {
            // 7. the server answered "not found"
            $not_found_urls[] = $chinfo['url'];
        } else {
            // 8. any other status counts as working
            $working_urls[] = $chinfo['url'];
        }
        // 9. release the finished handle
        curl_multi_remove_handle($mh, $mhinfo['handle']);
        curl_close($mhinfo['handle']);
        // 10. refill the freed slot with the next pending URL, if any
        if (add_url_to_multi_handle($mh, $url_list)) {
            $queued_more = true;
        }
    }
    // 11. freshly queued handles must be started before $active is trusted
    if ($queued_more) {
        continue;
    }
    if (!$active) {
        break;
    }
    // 12. wait for socket activity; on -1 (select failure) pause briefly
    //     instead of spinning at full CPU
    if (curl_multi_select($mh) == -1) {
        usleep(1000);
    }
}
// 13. finished
curl_multi_close($mh);
echo "==Dead URLs==\n";
echo implode("\n", $dead_urls) . "\n\n";
echo "==404 URLs==\n";
echo implode("\n", $not_found_urls) . "\n\n";
echo "==Working URLs==\n";
echo implode("\n", $working_urls);
// 15. adds a url to the multi handle
/**
 * Queue the next pending URL from $url_list on the cURL multi handle.
 *
 * A static cursor remembers how far into the list previous calls got, so each
 * call hands out the next entry. Empty-string entries (produced by href="")
 * are skipped rather than treated as the end of the list.
 *
 * @param resource|CurlMultiHandle $mh       multi handle the new easy handle is added to
 * @param array                    $url_list zero-indexed list of URLs to check
 * @return bool true when a URL was queued, false when the list is exhausted
 */
function add_url_to_multi_handle($mh, $url_list) {
    static $index = 0;
    // isset() rather than a bare read: indexing past the end of the list
    // raises an undefined-offset notice on every call after the last URL
    while (isset($url_list[$index])) {
        $url = $url_list[$index];
        // advance first so a skipped entry can never stall the cursor
        $index++;
        if ($url === '') {
            continue;
        }
        // new curl handle
        $ch = curl_init();
        // set the url
        curl_setopt($ch, CURLOPT_URL, $url);
        // to prevent the response from being outputted
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // follow redirections
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        // do not need the body. this saves bandwidth and time
        curl_setopt($ch, CURLOPT_NOBODY, 1);
        // add it to the multi handle
        curl_multi_add_handle($mh, $ch);
        return true;
    }
    // we are done adding new URLs
    return false;
}
?>
与原始代码相比,我重写的是数据库连接部分;因为改用了 PDO,我不得不调整代码的缩进。另外我还改写了这一行:
if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {
(原来是)
if (preg_match_all("!href=\"(.*?)\"!", $d['post_content'], $matches)) {
我认为问题就出在这里,但我水平有限,没能找出原因。如果有更好的脚本可以检查失效链接、重定向链接和正常链接,请告诉我。
答案 0 :(得分:1)
我看不出你用 $result 做了什么。
原始代码中是这样写的:
$r = mysql_query($q) or die(mysql_error());
while ($d = mysql_fetch_assoc($r)) {
if (preg_match_all("!href=\"(.*?)\"!", $d['post_content'], $matches)) {
...
而你的代码中只有:
$stmt = $dbh->query($sql);
$result = $stmt->fetch(PDO::FETCH_ASSOC);
if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {
因此,$d 和 $d['link_forms'] 都不存在!
所以 if (preg_match_all(..., $d['link_forms'], ...)) 的条件为 false。
删除
$result = $stmt->fetch(PDO::FETCH_ASSOC);
if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {
并将其替换为
while ($d = $stmt->fetch(PDO::FETCH_ASSOC)) {
if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {
...
print_r($matches);
你得到了什么输出?
查看输出的数组,确认你需要 $matches 中的哪些部分。
if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {
print_r($matches);
如果 $d['link_forms'] 中包含多个网址,下面这条语句就处理不了:
foreach ($matches[1] as $url) {
那么你必须用下面的方式遍历数组:
foreach ($matches as $url) {
echo "part 0: " . $url[0] . "\n";
echo "part 1: " . $url[1] . "\n";
...