解析HTML源代码以获取Javascript标记内的数据

时间:2013-04-10 15:15:09

标签: php javascript html

我正在尝试抓取网页的HTML源代码,然后对其进行解析,以获取Javascript代码中的数据。

该页面上的Javascript标记如下:

<script>
fullplayer("player", {
            src:"full_width_player.swf",
            wmode:"window"
        }, { 
    key: '#$0c4de1874473849ff8a',
    canvas: {
        backgroundGradient: "none",
        backgroundColor: "#000000"
    },
    audio: {
        provider: 'servstat',
        q: '128'
    },
    playlist: '/get.php?location=/audio/welcome.mp3',
    plugins: {
        youtube: { }
    }
});
</script>

我正在寻找一种方法来获取 playlist: '*****' 中的值,也就是音轨的位置:/get.php?location=/audio/welcome.mp3

这是否可以通过HTML DOM解析来实现,还是需要先抓取Javascript标记内的所有内容,再把数据当作XML或类似格式来读取?

1 个答案:

答案 0 :(得分:1)

试试这个正则表达式:

 // Capture the quoted value that follows "playlist:". The branch-reset group
 // (?|...) keeps the value in capture group 1 for either quote style and
 // requires the closing quote to be the same character as the opening one.
 // (The earlier class [\'|"] also accepted a literal "|" as a quote.)
 preg_match_all('~playlist:\s*(?|\'(.*?)\'|"(.*?)")~si',$HTML,$Match);
 print_r($Match);

您的样本的输出:

 Array
 (
   [0] => Array
    (
        [0] => playlist: '/get.php?location=/audio/welcome.mp3'
    )

   [1] => Array
    (
        [0] => /get.php?location=/audio/welcome.mp3
    )

 )  

cURL

/**
 * Split raw HTTP header text into one associative array per header section.
 *
 * Each line is cut at its first ':' into a name and a value. The value is
 * everything after the colon, unmodified (leading whitespace is kept). A line
 * without a colon (for example a status line) is stored under its own text,
 * with that same text as the value. An empty line closes the current section,
 * so the headers of successive responses (for example across redirects) end
 * up at successive indexes of the result.
 *
 * When a name occurs more than once within a section, its entry becomes a
 * list of all values in order of appearance.
 *
 * @param string|array $response Raw header text, or an array of header lines
 *                               when $String is not 1.
 * @param string       $Run      Optional callback invoked as $Run($name, $value)
 *                               for every line; "" disables it.
 * @param int          $String   1 when $response is one CRLF-separated string.
 *
 * @return array Section index => (header name => value, or list of values).
 */
function HeaderProc($response,$Run="",$String=1){
    if($String==1){
        $response=explode("\r\n",$response);
    }
    $PartHeader=0;
    $out=array();
    $out[$PartHeader]=array();
    // foreach replaces while(list(..)=each(..)); each() was removed in PHP 8.
    foreach($response as $val){
        $val=(string)$val;
        $colon=strpos($val,':');
        if($colon===false){
            // No colon: the whole line is the name and is stored as its own value.
            $name=$val;
            $value='';
            $stored=$name;
        }else{
            $name=(string)substr($val,0,$colon);
            $value=(string)substr($val,$colon+1);
            $stored=$value;
        }
        if($name==='' and $value===''){
            // Blank line (or a bare ":"): the next line starts a new section.
            $PartHeader++;
        }elseif(isset($out[$PartHeader][$name])){
            if(!is_array($out[$PartHeader][$name])){
                // Second occurrence: promote the scalar to a list.
                $out[$PartHeader][$name]=array($out[$PartHeader][$name]);
            }
            // Previously the third and later repeats of a colon-less line
            // appended the empty $value instead of the line itself.
            $out[$PartHeader][$name][]=$stored;
        }else{
            $out[$PartHeader][$name]=$stored;
        }
        if($Run!=""){
            $Run($name,$value);
        }
    }
    return $out;
}

/**
 * Small wrapper around the PHP cURL extension for GET and POST requests.
 *
 * GET() and POST() return an array with these keys:
 *   - 'Header'     : header sections as produced by the global HeaderProc()
 *                    function, which must be defined wherever this class is
 *                    used. A 'Set-Cookie' entry that holds several values has
 *                    each value replaced by CookieAnalysis() output; a single
 *                    'Set-Cookie' value is left as the raw string.
 *   - 'Body'       : response body.
 *   - 'HTTP_State' : HTTP status code reported by cURL.
 *   - 'URL'        : effective URL after redirects.
 *   - 'Error'      : cURL error message, present only when the transfer failed.
 */
class cURL {
    /** @var array Header lines sent with every request. */
    public $headers = array();
    /** @var string Value for the User-Agent header (without the header name). */
    public $user_agent;
    /** @var string Value handed to CURLOPT_ENCODING. */
    public $compression;
    /** @var string Path of the cookie jar file, when cookies are enabled. */
    public $cookie_file;
    /** @var string Proxy passed to CURLOPT_PROXY; empty for none. */
    public $proxy;
    /** @var array Result of the most recent CookieAnalysis() call. */
    public $Cookie;
    /** @var bool Whether the cookie jar file is used. */
    public $cookies = false;

    /**
     * Configure default headers, compression, proxy and optional cookie jar.
     *
     * Declared as __construct because a method named after the class is no
     * longer run as a constructor in PHP 8.
     *
     * @param bool   $cookies     Enable the cookie jar.
     * @param string $cookie      Cookie jar path (used only when $cookies is true).
     * @param string $compression Value for CURLOPT_ENCODING.
     * @param string $proxy       Proxy address, or '' for a direct connection.
     */
    public function __construct($cookies=false,$cookie='cookies.txt',$compression='gzip',$proxy='') {
        // No explicit Accept-Encoding line here: CURLOPT_ENCODING sets that
        // header from $compression, and a hand-written one would override it
        // and advertise encodings (sdch) that cURL does not decode.
        $this->headers = array(
            'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset:ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Language:en-US,en;q=0.8',
            'Cache-Control:max-age=0',
            'Connection:keep-alive',
        );
        // CURLOPT_USERAGENT expects only the header value; including the
        // "User-Agent:" name here would put it on the wire twice.
        $this->user_agent = 'Mozilla/5.0 (SepidarSoft [Organic Search Engine Crawler] Linux Edition) AppleWebKit/536.5 (KHTML, like Gecko) SepidarBrowser/1.0.100.52 Safari/536.5';
        $this->compression = $compression;
        $this->proxy = $proxy;
        $this->cookies = $cookies;
        if ($this->cookies) {
            $this->cookie($cookie);
        }
    }

    /**
     * Extract the first name=value pair of a Set-Cookie header value.
     *
     * Only the first pair is returned; anything after the first ';' is ignored.
     * The result also replaces $this->Cookie.
     *
     * @param string $Cookie Raw Set-Cookie value.
     *
     * @return array Single-entry array (name => value), or an empty array when
     *               the input contains no name=value pair.
     */
    public function CookieAnalysis($Cookie) {
        $this->Cookie = array();
        // The padding guarantees a terminating ';' for the lazy match.
        if (preg_match("~(.*?)=(.*?);~si", ' '.$Cookie.'; ', $M)) {
            $this->Cookie[trim($M[1])] = trim($M[2]);
        }
        return $this->Cookie;
    }

    /**
     * Select the cookie jar file, creating it when it does not exist yet.
     *
     * @param string $cookie_file Path of the cookie jar.
     */
    public function cookie($cookie_file) {
        if (!file_exists($cookie_file)) {
            $handle = fopen($cookie_file, 'w');
            if ($handle === false) {
                $this->error('The cookie file could not be opened. Make sure this directory has the correct permissions');
                return;
            }
            // Close the handle that was opened, not the path string.
            fclose($handle);
        }
        $this->cookie_file = $cookie_file;
    }

    /**
     * Perform a GET request.
     *
     * @param string $url Target URL.
     *
     * @return array See the class description for the result layout.
     */
    public function GET($url) {
        return $this->request($url, false, null);
    }

    /**
     * Perform a POST request.
     *
     * @param string       $url  Target URL.
     * @param string|array $data Value handed to CURLOPT_POSTFIELDS.
     *
     * @return array See the class description for the result layout.
     */
    public function POST($url,$data) {
        return $this->request($url, true, $data);
    }

    /**
     * Shared implementation of GET() and POST().
     *
     * @param string $url    Target URL.
     * @param bool   $isPost Send a POST with $data as the body when true.
     * @param mixed  $data   POST fields; ignored for GET.
     *
     * @return array See the class description for the result layout.
     */
    private function request($url, $isPost, $data) {
        $process = curl_init($url);
        curl_setopt($process, CURLOPT_HTTPHEADER, $this->headers);
        curl_setopt($process, CURLOPT_HEADER, 1);
        // Tolerate a user_agent assigned in the old "User-Agent:..." form.
        curl_setopt($process, CURLOPT_USERAGENT, preg_replace('~^User-Agent:\s*~i', '', (string)$this->user_agent));
        if ($this->cookies) {
            curl_setopt($process, CURLOPT_COOKIEFILE, $this->cookie_file);
            curl_setopt($process, CURLOPT_COOKIEJAR, $this->cookie_file);
        }
        curl_setopt($process, CURLOPT_ENCODING, $this->compression);
        curl_setopt($process, CURLOPT_TIMEOUT, 30);
        if ($this->proxy) {
            curl_setopt($process, CURLOPT_PROXY, $this->proxy);
        }
        if ($isPost) {
            curl_setopt($process, CURLOPT_POSTFIELDS, $data);
        }
        curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($process, CURLOPT_FOLLOWLOCATION, 1);
        if ($isPost) {
            curl_setopt($process, CURLOPT_POST, 1);
        }

        $response = curl_exec($process);
        $failure = null;
        if ($response === false) {
            // Keep the result shape stable on failure and report the reason.
            $failure = curl_error($process);
            $response = '';
        }

        // With CURLOPT_HEADER the headers precede the body in $response.
        $header_size = curl_getinfo($process, CURLINFO_HEADER_SIZE);
        $result = array();
        $result['Header'] = HeaderProc((string)substr($response, 0, $header_size), '', 1);
        foreach ($result['Header'] as $HeaderK => $HeaderP) {
            if (!isset($HeaderP['Set-Cookie']) || !is_array($HeaderP['Set-Cookie'])) {
                continue;
            }
            foreach ($HeaderP['Set-Cookie'] as $key => $val) {
                $result['Header'][$HeaderK]['Set-Cookie'][$key] = $this->CookieAnalysis($val);
            }
        }
        $result['Body'] = (string)substr($response, $header_size);
        $result['HTTP_State'] = curl_getinfo($process, CURLINFO_HTTP_CODE);
        $result['URL'] = curl_getinfo($process, CURLINFO_EFFECTIVE_URL);
        if ($failure !== null) {
            $result['Error'] = $failure;
        }
        curl_close($process);
        return $result;
    }

    /**
     * Print an HTML error box and terminate the script.
     *
     * @param string $error Message to display.
     */
    public function error($error) {
        echo "<center><div style='width:500px;border: 3px solid #FFEEFF; padding: 3px; background-color: #FFDDFF;font-family: verdana; font-size: 10px'><b>cURL Error</b><br>$error</div></center>";
        die;
    }
}

示例:

  // Fetch the page, then pull every quoted value that follows "playlist:" out of the body.
  $cc = new cURL(); 
  $Data=$cc->GET('http://www.yahoo.com');
  // The branch-reset group (?|...) keeps the value in capture group 1 for
  // either quote style and requires matching opening and closing quotes.
  // (The earlier class [\'|"] also accepted a literal "|" as a quote.)
  preg_match_all('~playlist:\s*(?|\'(.*?)\'|"(.*?)")~si',$Data['Body'],$Match);
  print_r($Match);