我正在尝试抓取网页的HTML源代码,然后对其进行解析,以获取Javascript代码中的数据。
该页面上的Javascript标记如下:
<script>
fullplayer("player", {
src:"full_width_player.swf",
wmode:"window"
}, {
key: '#$0c4de1874473849ff8a',
canvas: {
backgroundGradient: "none",
backgroundColor: "#000000"
},
audio: {
provider: 'servstat',
q: '128'
},
playlist: '/get.php?location=/audio/welcome.mp3',
plugins: {
youtube: { }
}
});
</script>
我正在寻找一种方法来获取 playlist: '*****' 中的值,
也就是音轨的位置:/get.php?location=/audio/welcome.mp3
这能否通过HTML DOM解析来实现,还是需要先取出<script>标签内的全部内容,再把数据当作XML或类似的格式来读取?
答案 0 :(得分:1)
试试这个正则表达式:
// Extract every `playlist: '<value>'` / `playlist: "<value>"` entry from $HTML.
// The branch-reset group (?|...) keeps the captured value in group 1 for both
// quote styles and requires the closing quote to match the opening one.
// (The previous class [\'|"] also accepted a literal "|" as a delimiter.)
preg_match_all('~playlist:\s*(?|\'([^\']*)\'|"([^"]*)")~i',$HTML,$Match);
// $Match[0] holds the full matches, $Match[1] the extracted values.
print_r($Match);
您的样本的输出:
Array
(
[0] => Array
(
[0] => playlist: '/get.php?location=/audio/welcome.mp3'
)
[1] => Array
(
[0] => /get.php?location=/audio/welcome.mp3
)
)
cURL
/**
 * Parse raw HTTP header text into an array of header blocks.
 *
 * Each line is split at its first ":" into a name and a value. The value is
 * kept exactly as it appears after the colon (leading whitespace included).
 * Lines without a colon (e.g. a status line) are stored under their own text,
 * with that text as the value. An empty line, or a line consisting only of
 * ":", starts the next block. A name seen more than once inside a block is
 * collected into a list of values, in order of appearance.
 *
 * @param string|array $response Raw header text (string mode) or an array of
 *                               header lines.
 * @param callable|string $Run   Optional callback invoked as $Run($name, $value)
 *                               for every line; "" disables it.
 * @param int $String            1 when $response is a single string that must
 *                               be split on CRLF; any other value when it is
 *                               already an array of lines.
 * @return array Blocks indexed from 0; each block maps header name to a
 *               string value or to a list of string values.
 */
function HeaderProc($response,$Run="",$String=1/*[Is 1 IF Use for String Mode ]*/){
    if($String==1){
        $response=explode("\r\n",$response);
    }
    $PartHeader=0;
    $out=array($PartHeader=>array());
    // foreach replaces each(), which was removed in PHP 8.
    foreach($response as $val){
        $val=(string)$val;
        $colon=strpos($val,':');
        if($colon!==false){
            $name=(string)substr($val,0,$colon);
            $value=(string)substr($val,$colon+1);
            // Lines passed in array mode may still carry their line ending;
            // the value stops at the first CRLF.
            $eol=strpos($value,"\r\n");
            if($eol!==false){
                $value=(string)substr($value,0,$eol);
            }
            $stored=$value;
            $separator=($name==='' && $value==='');
        }else{
            // No colon: the whole line acts as both the key and the stored value.
            $name=$val;
            $value='';
            $stored=$name;
            $separator=($name==='');
        }
        if($separator){
            $PartHeader++;
        }elseif(isset($out[$PartHeader][$name])){
            if(is_array($out[$PartHeader][$name])){
                $out[$PartHeader][$name][]=$stored;
            }else{
                // Second occurrence: promote the single value to a list.
                $out[$PartHeader][$name]=array($out[$PartHeader][$name],$stored);
            }
        }else{
            $out[$PartHeader][$name]=$stored;
        }
        if($Run!=""){
            $Run($name,$value);
        }
    }
    return $out;
}
/**
 * Minimal HTTP client built on the PHP cURL extension.
 *
 * GET() and POST() return an array with these keys:
 *   'Header'     => header blocks as produced by HeaderProc(); when a block
 *                   has several Set-Cookie headers each one is replaced by the
 *                   array returned from CookieAnalysis()
 *   'Body'       => response body (string)
 *   'HTTP_State' => HTTP status code reported by cURL
 *   'URL'        => effective URL after redirects
 *   'Error'      => cURL error message, '' when the transfer succeeded
 */
class cURL {
    // Extra request headers sent with every request.
    var $headers = array();
    // Value passed to CURLOPT_USERAGENT.
    var $user_agent;
    // Value passed to CURLOPT_ENCODING.
    var $compression;
    // Path of the cookie jar file (only set when cookies are enabled).
    var $cookie_file;
    // Proxy passed to CURLOPT_PROXY; empty for a direct connection.
    var $proxy;
    // Result of the most recent CookieAnalysis() call.
    var $Cookie;
    // Whether the cookie jar is used.
    var $cookies = false;

    /**
     * Convert a Set-Cookie header value into array('name' => 'value').
     * Only the first name=value pair is extracted; attributes such as path
     * or expires are ignored. Returns an empty array when the string holds
     * no name=value pair.
     */
    function CookieAnalysis($Cookie){//convert str cookie to array cookie
        $this->Cookie=array();
        // The padding guarantees a terminating ";" even for a bare "name=value".
        if(preg_match("~(.*?)=(.*?);~si",' '.$Cookie.'; ',$M)){
            $this->Cookie[trim($M[1])]=trim($M[2]);
        }
        return $this->Cookie;
    }

    /**
     * @param bool   $cookies     Enable the cookie jar.
     * @param string $cookie      Cookie jar path (used only when $cookies is true).
     * @param string $compression Value for CURLOPT_ENCODING.
     * @param string $proxy       Proxy for CURLOPT_PROXY; '' disables it.
     */
    function __construct($cookies=false,$cookie='cookies.txt',$compression='gzip',$proxy='') {
        $this->headers = array(
            'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset:ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding:gzip,deflate,sdch',
            'Accept-Language:en-US,en;q=0.8',
            'Cache-Control:max-age=0',
            'Connection:keep-alive',
        );
        // CURLOPT_USERAGENT expects only the header value; including the
        // "User-Agent:" prefix here would send "User-Agent: User-Agent:...".
        $this->user_agent = 'Mozilla/5.0 (SepidarSoft [Organic Search Engine Crawler] Linux Edition) AppleWebKit/536.5 (KHTML, like Gecko) SepidarBrowser/1.0.100.52 Safari/536.5';
        $this->compression=$compression;
        $this->proxy=$proxy;
        $this->cookies=$cookies;
        if ($this->cookies == TRUE) $this->cookie($cookie);
    }

    /**
     * Legacy PHP 4-style constructor name. PHP 8 no longer treats it as a
     * constructor, so it simply forwards to __construct() for code that
     * still calls it explicitly.
     */
    function cURL($cookies=false,$cookie='cookies.txt',$compression='gzip',$proxy='') {
        $this->__construct($cookies,$cookie,$compression,$proxy);
    }

    /**
     * Select the cookie jar file, creating an empty one when it does not
     * exist yet. Calls error() when the file cannot be created.
     */
    function cookie($cookie_file) {
        if (!file_exists($cookie_file)) {
            $handle = fopen($cookie_file,'w');
            if ($handle === false) {
                $this->error('The cookie file could not be opened. Make sure this directory has the correct permissions');
                return;
            }
            // Close the handle that was opened, not the path string.
            fclose($handle);
        }
        $this->cookie_file=$cookie_file;
    }

    /** Perform a GET request; see the class comment for the result layout. */
    function GET($url) {
        return $this->request($url,false,null);
    }

    /**
     * Perform a POST request; $data is handed to CURLOPT_POSTFIELDS as is.
     * See the class comment for the result layout.
     */
    function POST($url,$data) {
        return $this->request($url,true,$data);
    }

    /** Shared transfer logic behind GET() and POST(). */
    private function request($url,$isPost,$data) {
        $process = curl_init($url);
        if ($process === false) {
            return array(
                'Header' => array(array()),
                'Body' => '',
                'HTTP_State' => 0,
                'URL' => $url,
                'Error' => 'curl_init failed',
            );
        }
        curl_setopt($process, CURLOPT_HTTPHEADER, $this->headers);
        // Headers are included in the response so they can be split off below.
        curl_setopt($process, CURLOPT_HEADER, 1);
        curl_setopt($process, CURLOPT_USERAGENT, $this->user_agent);
        if ($this->cookies == TRUE) {
            curl_setopt($process, CURLOPT_COOKIEFILE, $this->cookie_file);
            curl_setopt($process, CURLOPT_COOKIEJAR, $this->cookie_file);
        }
        curl_setopt($process, CURLOPT_ENCODING , $this->compression);
        curl_setopt($process, CURLOPT_TIMEOUT, 30);
        if ($this->proxy) curl_setopt($process, CURLOPT_PROXY, $this->proxy);
        if ($isPost) {
            curl_setopt($process, CURLOPT_POSTFIELDS, $data);
            curl_setopt($process, CURLOPT_POST, 1);
        }
        curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($process, CURLOPT_FOLLOWLOCATION, 1);

        $response = curl_exec($process);
        $error = '';
        if ($response === false) {
            // Transfer failed: report the reason and continue with an empty response.
            $error = curl_error($process);
            $response = '';
        }
        $header_size = curl_getinfo($process,CURLINFO_HEADER_SIZE);

        $result = array();
        $result['Header'] = HeaderProc((string)substr($response, 0, $header_size),'',1);
        foreach($result['Header'] as $HeaderK=>$HeaderP){
            // Only a list of several Set-Cookie headers is analysed; a single
            // Set-Cookie header stays a plain string.
            if(!isset($HeaderP['Set-Cookie']) || !is_array($HeaderP['Set-Cookie']))continue;
            foreach($HeaderP['Set-Cookie'] as $key=>$val){
                $result['Header'][$HeaderK]['Set-Cookie'][$key]=$this->CookieAnalysis($val);
            }
        }
        $result['Body'] = (string)substr( $response, $header_size );
        $result['HTTP_State'] = curl_getinfo($process,CURLINFO_HTTP_CODE);
        $result['URL'] = curl_getinfo($process,CURLINFO_EFFECTIVE_URL);
        $result['Error'] = $error;
        curl_close($process);
        return $result;
    }

    /** Print an HTML error box and terminate the script. */
    function error($error) {
        echo "<center><div style='width:500px;border: 3px solid #FFEEFF; padding: 3px; background-color: #FFDDFF;font-family: verdana; font-size: 10px'><b>cURL Error</b><br>$error</div></center>";
        die;
    }
}
示例:
// Fetch a page and pull the playlist value out of its inline Javascript.
$cc = new cURL();
$Data=$cc->get('http://www.yahoo.com');
// The branch-reset group (?|...) keeps the value in group 1 for both quote
// styles and requires matching quotes (the old class [\'|"] also accepted "|").
preg_match_all('~playlist:\s*(?|\'([^\']*)\'|"([^"]*)")~i',$Data['Body'],$Match);
// $Match[1] lists the extracted playlist locations.
print_r($Match);