我有一堆网站存储为3个mysql表中的字符串。我的脚本将它们放入数组中,解析它,提取所有链接并将它们分类为2个表。它在3个完全分类的相同模块中被打破。
整个过程每30秒执行一次操作。
出于某种原因,只是第一次按预期工作,以后没有任何事情发生。
在我开始使用代码之前,我为折旧的mysql道歉,这个脚本只会在本地机器上使用,我会在适当的时候更新它。
这是我的代码:
$i=1;
$domain1 = 'example1.com';
$domain2 = 'example2.com';
$domain3 = 'example3.com';
$robots1 = array("url1",
"url2",
"url3");
$robots2 = array("url1",
"url2",
"url3");
$robots3 = array("url1",
"url2",
"url3");
require_once 'Normalizer.php';
$conn = mysql_connect('localhost:3306','user', 'pass', true );
mysql_select_db( 't1000', $conn );
while ($i<=50000) {
$query = 'SELECT * FROM dump1';
$result1=mysql_query( $query, $conn );
$strings1=array();
while ($row = mysql_fetch_assoc($result1)) {
array_push($strings1, $row["link"]);
}
$query = 'TRUNCATE TABLE dump1';
$delete=mysql_query( $query, $conn );
$query = 'SELECT * FROM dump2';
$result1=mysql_query( $query, $conn );
$strings2=array();
while ($row = mysql_fetch_assoc($result1)) {
array_push($strings2, $row["link"]);
}
$query = 'TRUNCATE TABLE dump2';
$delete=mysql_query( $query, $conn );
$query = 'SELECT * FROM dump3';
$result1=mysql_query( $query, $conn );
$strings3=array();
while ($row = mysql_fetch_assoc($result1)) {
array_push($strings3, $row["link"]);
}
$query = 'TRUNCATE TABLE dump3';
$delete=mysql_query( $query, $conn );
// Module 1 start
$ii=0;
$links = array();
$edofollow = array();
$enofollow = array();
$internal = array();
foreach ($strings1 as $value)
{
$input=$strings1[$ii];
$htm=stripcslashes($input);
$doc = new DOMDocument();
@$doc->loadHTML($htm);
$arr = $doc->getElementsByTagName("a"); // DOMNodeList Object
foreach($arr as $item) { // DOMElement Object
$href = $item->getAttribute("href");
$rel = $item->getAttribute("rel");
$text = trim(preg_replace("/[\r\n]+/", " ", $item->nodeValue));
$links[] = array(
'href' => $href,
'rel' => $rel,
'text' => $text
);
if (strpos($href, '://')!==false AND strpos($href, $domain1)==false AND $rel!=='nofollow')
{
$un = new URL\Normalizer();
$un->setUrl( $href );
$href= parse_url($un->normalize(), PHP_URL_HOST);
array_push($edofollow, $href);
}
else if (strpos($href, '://')!==false AND strpos($href, $domain1)==false AND $rel=='nofollow')
{
$un1 = new URL\Normalizer();
$un1->setUrl( $href );
array_push($enofollow, $un1->normalize());
}
else if (strpos($href,'://')==false or strpos($href,$domain1)!==false)
{
$un2 = new URL\Normalizer();
$un2->setUrl( $href );
$href1=$un2->normalize();
if (strpos($href1, 'TRANSCRIPTS')==false AND strpos($href1, '(')==false AND strpos($href1, ')')==false AND strpos($href1, '#')==false AND strpos($href1, 'javascript')==false AND strpos($href1, '?')==false AND strpos($href1, 'void')==false)
{
if($href1=='' or $href1=='/')
{}
else{
if (strpos($href1, '://')==false)
{$href1='http://'.$domain1.$href1;}
if (in_array($href1, $robots1)) { }
else {
array_push($internal, $href1);
}
}
}
}
}
$uedofollow = array_values(array_unique($edofollow));
foreach ($uedofollow as $value) {
$query=mysql_query("select * from dofollow where link='".$value."' ");
$duplicate=0;
if($query){
$duplicate=mysql_num_rows($query);
}
if($duplicate==0)
{
$sql='INSERT INTO dofollow (link) VALUES ("'.$value.'")';
mysql_query( $sql, $conn );
}
}
$uinternal = array_values(array_unique($internal));
foreach ($uinternal as $value2) {
$query=mysql_query("select * from joblist1 where link='".$value2."' ");
if ($query) {
$duplicate=0;
$duplicate=mysql_num_rows($query);
if($duplicate==0)
{
$sql='INSERT INTO joblist1 (link) VALUES ("'.$value2.'")';
mysql_query( $sql, $conn );
}
}
}
$ii=$ii+1;
}
// Module 1 ends
// Module 2 start
$links = array();
$edofollow = array();
$enofollow = array();
$internal = array();
$ii=0;
foreach ($strings2 as $value)
{
$input=$strings2[$ii];
$htm=stripcslashes($input);
$doc = new DOMDocument();
@$doc->loadHTML($htm);
$arr = $doc->getElementsByTagName("a"); // DOMNodeList Object
foreach($arr as $item) { // DOMElement Object
$href = $item->getAttribute("href");
$rel = $item->getAttribute("rel");
$text = trim(preg_replace("/[\r\n]+/", " ", $item->nodeValue));
$links[] = array(
'href' => $href,
'rel' => $rel,
'text' => $text
);
if (strpos($href, '://')!==false AND strpos($href, $domain2)==false AND $rel!=='nofollow')
{
$un = new URL\Normalizer();
$un->setUrl( $href );
$href= parse_url($un->normalize(), PHP_URL_HOST);
array_push($edofollow, $href);
}
else if (strpos($href, '://')!==false AND strpos($href, $domain2)==false AND $rel=='nofollow')
{
$un1 = new URL\Normalizer();
$un1->setUrl( $href );
array_push($enofollow, $un1->normalize());
}
else if (strpos($href,'://')==false or strpos($href,$domain2)!==false)
{
$un2 = new URL\Normalizer();
$un2->setUrl( $href );
$href1=$un2->normalize();
if (strpos($href1, 'TRANSCRIPTS')==false AND strpos($href1, '(')==false AND strpos($href1, ')')==false AND strpos($href1, '#')==false AND strpos($href1, 'javascript')==false AND strpos($href1, '?')==false AND strpos($href1, 'void')==false)
{
if($href1=='' or $href1=='/')
{}
else{
if (strpos($href1, '://')==false)
{$href1='http://'.$domain2.$href1;}
if (in_array($href1, $robots2)) { }
else {
array_push($internal, $href1);
}
}
}
}
}
$uedofollow = array_values(array_unique($edofollow));
foreach ($uedofollow as $value) {
$query=mysql_query("select * from dofollow where link='".$value."' ");
$duplicate=0;
if($query){
$duplicate=mysql_num_rows($query);
}
if($duplicate==0)
{
$sql='INSERT INTO dofollow (link) VALUES ("'.$value.'")';
mysql_query( $sql, $conn );
}
}
$uinternal = array_values(array_unique($internal));
foreach ($uinternal as $value2) {
$query=mysql_query("select * from joblist2 where link='".$value2."' ");
if ($query) {
$duplicate=0;
$duplicate=mysql_num_rows($query);
if($duplicate==0)
{
$sql='INSERT INTO joblist2 (link) VALUES ("'.$value2.'")';
mysql_query( $sql, $conn );
}
}
}
$ii=$ii+1;
}
// Module 2 Ends
// Module 3 start
$links = array();
$edofollow = array();
$enofollow = array();
$internal = array();
$ii=0;
foreach ($strings3 as $value)
{
$input=$strings3[$ii];
$htm=stripcslashes($input);
$doc = new DOMDocument();
@$doc->loadHTML($htm);
$arr = $doc->getElementsByTagName("a"); // DOMNodeList Object
foreach($arr as $item) { // DOMElement Object
$href = $item->getAttribute("href");
$rel = $item->getAttribute("rel");
$text = trim(preg_replace("/[\r\n]+/", " ", $item->nodeValue));
$links[] = array(
'href' => $href,
'rel' => $rel,
'text' => $text
);
if (strpos($href, '://')!==false AND strpos($href, $domain3)==false AND $rel!=='nofollow')
{
$un = new URL\Normalizer();
$un->setUrl( $href );
$href= parse_url($un->normalize(), PHP_URL_HOST);
array_push($edofollow, $href);
}
else if (strpos($href, '://')!==false AND strpos($href, $domain3)==false AND $rel=='nofollow')
{
$un1 = new URL\Normalizer();
$un1->setUrl( $href );
array_push($enofollow, $un1->normalize());
}
else if (strpos($href,'://')==false or strpos($href,$domain3)!==false)
{
$un2 = new URL\Normalizer();
$un2->setUrl( $href );
$href1=$un2->normalize();
if (strpos($href1, 'TRANSCRIPTS')==false AND strpos($href1, '(')==false AND strpos($href1, ')')==false AND strpos($href1, '#')==false AND strpos($href1, 'javascript')==false AND strpos($href1, '?')==false AND strpos($href1, 'void')==false)
{
if($href1=='' or $href1=='/')
{}
else{
if (strpos($href1, '://')==false)
{$href1='http://'.$domain3.$href1;}
if (in_array($href1, $robots3)) { }
else {
array_push($internal, $href1);
}
}
}
}
}
$uedofollow = array_values(array_unique($edofollow));
foreach ($uedofollow as $value) {
$query=mysql_query("select * from dofollow where link='".$value."' ");
$duplicate=0;
if($query){
$duplicate=mysql_num_rows($query);
}
if($duplicate==0)
{
$sql='INSERT INTO dofollow (link) VALUES ("'.$value.'")';
mysql_query( $sql, $conn );
}
}
$uinternal = array_values(array_unique($internal));
foreach ($uinternal as $value2) {
$query=mysql_query("select * from joblist3 where link='".$value2."' ");
if ($query) {
$duplicate=0;
$duplicate=mysql_num_rows($query);
if($duplicate==0)
{
$sql='INSERT INTO joblist3 (link) VALUES ("'.$value2.'")';
mysql_query( $sql, $conn );
}
}
}
$ii=$ii+1;
}
// Module 3 ends
sleep(30);
$i=$i++;
}
我现在试图解决它几天,混合了一些东西,但没有运气......
答案 0 :(得分:1)
尝试在do {} while();
中包装它即:
$i = 1;
do {
echo "some crap $i<br>\n";
} while($i<=50000);