cURL代码中未定义的偏移量错误

时间:2011-08-07 16:43:26

标签: php curl screen-scraping

我正在构建一个php脚本来搜索和抓取使用curl的google页面,收到以下错误。

未定义的偏移量:第25行/home/content/53/7382753/html/Summer/wootsummer.php中的1

在卷曲设置中,违规行显示在下方。

curl_setopt($ch, CURLOPT_URL,$urls[$counter]);

任何建议或评论都会非常感激,因为我是新手。 作为参考,脚本wootsummer.php如下:

<html>
<body>

<?php

error_reporting(E_ALL);
set_time_limit (0);

$urls=explode("\n", $_POST['url']);

$target=$_POST['target'];

$allurls=count($urls);

//use the new tool box
require "ToolBoxA4.php";

for ( $counter = 0; $counter <= $allurls; $counter++) {

 $ch = curl_init();
 curl_setopt($ch, CURLOPT_URL,$urls[$counter]);
 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
 curl_setopt($ch, CURLOPT_CUSTOMREQUEST,'GET');
 curl_setopt ($ch, CURLOPT_HEADER, 1); 
 curl_exec ($ch); 
 $curl_scraped_page=curl_exec($ch); 

//call the new function parseA1
$arrOut = parseA1 ($curl_scraped_page);

//the output is an array with 3 items:  $arrOut[0] is RHS, $arrOut[1] is TOP, $arrOut[2] is NAT
//to look at the RHS

$curl_scraped_page=strtolower($curl_scraped_page);
$haystack=$curl_scraped_page;
if (strlen(strstr($haystack,$target))>0) {

$FileName = abs(rand(0,100000));
$FileHandle = fopen($FileName, 'w') or die("can't open file");
fwrite($FileHandle, $curl_scraped_page);

$hostname="************";
$username="******";
$password="*******";
$dbname="********";
$usertable="*********";

$con=mysql_connect($hostname,$username, $password) or die ("<html><script language='JavaScript'>alert('Unable to connect to database! Please try again later.'),history.go(-1)</script></html>");
mysql_select_db($dbname ,$con);

$right = explode(",", $arrOut[0]);
$top = explode(",", $arrOut[1]);

for ( $countforme = 0; $countforme <= 5; $countforme++) {

$topnow=$top[$countforme];

$query = "INSERT INTO happyturtle (time, ad1) VALUES ('$FileName','$topnow')";
mysql_query($query) or die('Error, insert query failed');

}

for ( $countforme = 0; $countforme <= 15; $countforme++) {

$rightnow = $right[$countforme];


$query = "INSERT INTO ****** (time, ad1) VALUES ('$FileName','$rightnow')";
mysql_query($query) or die('Error, insert query failed');

}

mysql_close($con);

echo '$FileNameSQL';


fclose($FileHandle);
}
curl_close($ch);

}

?>

</body>
</html>

上面提到的toolboxa4.php如下:

<?php

function strTrim ($strIn, $cutA, $cutB){
    //keeps what is between $cutA and $cutB
    $pieces = explode($cutA, $strIn, 2);
    $str1 = $pieces[1];  //keep everything after cutA 

    $pieces = explode($cutB, $str1, 2);
    $strOut = $pieces[0];  //keep everything before cutB            
    return $strOut;
}

function arrWords ($strIn, $theStart, $theEnd){
    //returns what is between $theStart and $theEnd
    $cutA = $theStart;
    $pieces = explode($cutA, $strIn);
    $pieces[0] = "";  //discard the first piece

    $cutB = $theEnd;
    foreach ($pieces as $key => $value) {
        $arrB = explode($cutB, $value, 2);
        $arrOut[$key] = $arrB[0];  //keep everything before cutB        
    }

    return $arrOut;
}

function arrElems ($strIn, $tag){
    //returns what is between <$tag> and </$tag>
    $cutA = "<$tag>";
    $pieces = explode($cutA, $strIn);
    $pieces[0] = "";  //discard the first piece

    $cutB = "</$tag>";
    foreach ($pieces as $key => $value) {
        $arrB = explode($cutB, $value, 2);
        $arrOut[$key] = $arrB[0];  //keep everything before cutB        
    }

    return $arrOut;
}

function arrElemAB ($strIn, $tagA, $tagB){
    //returns what is between <$tagA><$tagB> and </$tagB></$tagA>
    $cutA = "<$tagA><$tagB>";
    $pieces = explode($cutA, $strIn);
    $pieces[0] = "";  //discard the first piece

    $cutB = "</$tagB></$tagA>";
    foreach ($pieces as $key => $value) {
        $arrB = explode($cutB, $value, 2);
        $arrOut[$key] = $arrB[0];  //keep everything before cutB        
    }

    return $arrOut;
}

function DropTag ($strIn, $tag){
    //drops <$tag> and </$tag>.  $strIn can be a string or an array of strings.  $strOut is same type as $strIn.
    $strA = str_replace("<$tag>", "", $strIn);
    $strOut = str_replace("</$tag>", "", $strA);
    return $strOut;
}

function arrCompress ($arrIn){
    //deletes empty array entries
    if(!empty($arrIn)){
        foreach ($arrIn as $key => $value) {
            if ($value == "") {
                unset($arrIn[$key]);
            }       
        }
        $arrIn = array_values($arrIn);  
    }   
    return $arrIn;  
}

function arrDeDup ($arrIn){
    //changes duplicate array entries to empty string
    if(!empty($arrIn)){
        foreach ($arrIn as $key1 => $value1) {
            foreach ($arrIn as $key2 => $value2) {
                if ($key1 != $key2 and $value1 == $value2) {
                    $arrIn[$key2] = "";
                }   
            }
        }
    }   
    return $arrIn;  
}

function arrToString ($arrIn){
    //turns an array into a string
    $strOut = "";
    if(!empty($arrIn)){
        foreach ($arrIn as $value) {
            $strOut .= "$value<br>";                
        }
    }
    return $strOut; 
}

function arrContains ($arrIn, $strIn){
    //deletes array entries that do not contain $strIn.
    foreach ($arrIn as &$value) {
        if (stripos($value, $strIn) === false) {
            $value = "";
        }       
    }
    return $arrIn;
}

function arrNotContain ($arrIn, $strIn){
    //deletes array entries that contain $strIn.
    foreach ($arrIn as &$value) {
        if (stripos($value, $strIn) !== false) {
            $value = "";
        }       
    }
    return $arrIn;
}

function parseStrA1 ($strToParse){  
    //drop some tags
    $strA = DropTag ($strToParse, "b");

    //split the file into Top and RHS
    $strTOP = strTrim ($strA, "<body", "<div id=rhs_block");
    $strRHS = strTrim ($strA, "<div id=rhs_block", ">See your ad here");

    //get the elements using tags
    $arrTOP = arrElemAB ($strTOP, "span", "cite");
    $arrRHS = arrElems ($strRHS, "cite");  

    //remove empty elements
    $arrTOP = arrCompress ($arrTOP);
    $arrRHS = arrCompress ($arrRHS);

    //output results
    $strA = "TOP<br>" 
        . arrToString ($arrTOP) 
        . "<br><br>RHS<br>"
        . arrToString ($arrRHS);
    return $strA;
}

function parseA1 ($strToParse){ 
    //drop some tags
    $strA = DropTag ($strToParse, "b");

    //split the file into Top and RHS
    $strTOP = strTrim ($strA, "<body", "<div id=rhs_block");
    $strNAT = strTrim ($strA, "<body", "<div id=rhs_block");
    $strRHS = strTrim ($strA, "<div id=rhs_block", ">See your ad here");

    //get the elements using tags
    $arrTOP = arrElemAB ($strTOP, "span", "cite");
    $arrRHS = arrElems ($strRHS, "cite");

    //get the links
    $arrNAT = arrWords ($strNAT, '<a href="http://', '" ');
    $arrNAT = arrNotContain ($arrNAT, "oogle");

    //remove empty and duplicate elements
    $arrTOP = arrCompress ($arrTOP);
    $arrRHS = arrCompress ($arrRHS); 
    $arrNAT = arrDeDup ($arrNAT);
    $arrNAT = arrCompress ($arrNAT);

    //combine arrays into strings, comma separated values
    $arrOut[0] = implode(",", $arrRHS);
    $arrOut[1] = implode(",", $arrTOP);
    $arrOut[2] = implode(",", $arrNAT); 

    return $arrOut; 
}

?>      

1 个答案:

答案 0 :(得分:1)

for ( $counter = 0; $counter <= $allurls; $counter++) {

应该是

for ( $counter = 0; $counter < $allurls; $counter++) {

off-by-one