一点背景:
我在网上玩了一个叫做doomlord的游戏......
我在游戏里加入了一个战队(clan)。我想为与我们战队相关的玩家提供统计数据
为此,我必须从一个我只能在登录后才能访问的网页中检索玩家的名字。页面是:
www.doomlord.net/index.php?m=szovetseg&sub=reszletes_statisztikak
但这只会带你到游戏的登录页面,而不是真正的页面。
要进入真实页面,我使用cURL(登录并获取一个允许我漫游页面的cookie)。 要从表中检索数据,我使用JS_extractor类。
问题:
它们单独使用时都能正常工作,但一起使用时就失效了。
我尝试了什么:
1)
我使用了cURL并证明我能够检索我需要的页面
见这里:doomlord.tamtek.net/gettable.php
2)
我添加了js_extractor和cURL获取页面,但js_extractor不检索数据
见这里:doomlord.tamtek.net/gettable2.php
3)
我手动登录到doomlord网站,我转到了我需要的页面,我查看了然后“复制并粘贴”了源代码。然后我将源代码作为html页面上传到我的网站,在那里我使用了js_extractor,它完美地将我需要的表格放到一个数组中以便进一步进行操作...
页面来源:doomlord.tamtek.net/doomtest2.html
js_extractor结果:doomlord.tamtek.net/gettable3.php
代码:
(请原谅一些额外的写出,但我试图让它尽可能地视觉化)
这是我使用的cURL代码:
// Logs in to doomlord.net with cURL, keeps the session cookie in a temporary
// cookie jar, then requests the members-only statistics page and prints it.

// Cookie jar shared by the login request (writer) and the page request (reader).
$ckfile = tempnam ("tmp", "cookie.tmp");
if ($ckfile === false) {
    exit('Unable to create a temporary cookie file.');
}

$userAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1";

// Login form fields. http_build_query() URL-encodes each value and joins the
// pairs with '&', replacing the manual urlencode()/concatenate/trim sequence.
// The separator is passed explicitly so the result does not depend on the
// arg_separator.output ini setting.
$fields = array(
    'username' => (string) SITE_USER,
    'jelszo'   => (string) SITE_PASS,
    'vilag'    => (string) SITE_WORLD,
    'tev'      => (string) SITE_TEV,
);
$fields_string = http_build_query($fields, '', '&');

// Step 1: POST the credentials. The response body is only checked for
// failure; what matters is the cookie written to $ckfile.
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL, 'http://www.doomlord.net');
curl_setopt($ch, CURLOPT_COOKIEJAR, $ckfile);
curl_setopt($ch, CURLOPT_POST, true);            // boolean switch, not a field count
curl_setopt($ch, CURLOPT_POSTFIELDS, $fields_string);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 20);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);  // boolean switch; the limit is CURLOPT_MAXREDIRS
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
$connect = curl_exec($ch);
$loginError = curl_error($ch);
// Closing the handle is what flushes the received cookies into the jar file,
// so it must happen before the second request reads them.
curl_close($ch);
if ($connect === false) {
    unlink($ckfile);
    exit('Login request failed: ' . $loginError);
}

// Step 2: request the protected page, sending the cookies saved in step 1.
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL, 'http://www.doomlord.net/index.php?m=szovetseg&sub=reszletes_statisztikak');
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 20);
curl_setopt($ch, CURLOPT_COOKIEFILE, $ckfile);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
// With CURLOPT_RETURNTRANSFER set, $connect is the page's HTML as a string
// (or false on failure) - it is not a path or URL.
$connect = curl_exec($ch);
$response1 = curl_getinfo($ch);
$pageError = curl_error($ch);
curl_close($ch);

// The jar holds a live session cookie; do not leave it behind on disk.
unlink($ckfile);

if ($connect === false) {
    exit('Statistics page request failed: ' . $pageError);
}

print_r($connect);
// print_r($response1);
这是cURL和js_extractor代码:
<?php
// Reproducer for the failing case: logs in with cURL, downloads the
// members-only statistics page, then tries to parse it with JS_Extractor.
// The download succeeds; the hand-off to JS_Extractor further down is where
// it goes wrong (see the note above the constructor call).
session_start();
require_once('inc/constant.php');
require_once('inc/function.php'); //basic functions
// Temporary file used as the cookie jar shared by both requests.
$ckfile = tempnam ("tmp", "cookie.tmp");
// Build the URL-encoded POST body "key=value&key=value..." by hand.
$fields_string='';
$fields = array(
'username'=>urlencode(SITE_USER),
'jelszo'=>urlencode(SITE_PASS),
'vilag'=>urlencode(SITE_WORLD),
'tev'=>urlencode(SITE_TEV),
);
foreach($fields as $key=>$value) { $fields_string .= $key.'='.$value.'&'; }
//rtrim($fields_string,'&'); //tried both commented out and not
// Drop the trailing '&' left by the loop above.
$fields_string=substr($fields_string,0,-1);
// Request 1: POST the login form; cookies received are saved to $ckfile
// when the handle is closed.
$ch=curl_init();
curl_setopt($ch,CURLOPT_USERAGENT,"Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1" );
curl_setopt($ch,CURLOPT_URL,'http://www.doomlord.net');
curl_setopt($ch,CURLOPT_COOKIEJAR,$ckfile);
// NOTE(review): CURLOPT_POST and CURLOPT_FOLLOWLOCATION are on/off switches;
// the integers passed here only act as "true".
curl_setopt($ch,CURLOPT_POST,count($fields));
curl_setopt($ch,CURLOPT_POSTFIELDS,$fields_string);
curl_setopt($ch,CURLOPT_CONNECTTIMEOUT,20);
curl_setopt($ch,CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch,CURLOPT_FOLLOWLOCATION,3);
curl_setopt($ch,CURLOPT_MAXREDIRS,10);
// The login response and its info are not used; both variables are
// overwritten by the second request.
$connect=curl_exec($ch);
$response1=curl_getinfo( $ch );
curl_close($ch);
// Request 2: fetch the protected page, sending the cookies stored in $ckfile.
$ch=curl_init();
curl_setopt($ch,CURLOPT_USERAGENT,"Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1" );
curl_setopt($ch,CURLOPT_URL,'http://www.doomlord.net/index.php?m=szovetseg&sub=reszletes_statisztikak');
curl_setopt($ch,CURLOPT_CONNECTTIMEOUT,20);
curl_setopt($ch,CURLOPT_COOKIEFILE,$ckfile);
curl_setopt($ch,CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch,CURLOPT_FOLLOWLOCATION,3);
curl_setopt($ch,CURLOPT_MAXREDIRS,10);
// Because CURLOPT_RETURNTRANSFER is set, curl_exec() returns the response
// body itself: $connect now holds the page's HTML as a string.
$connect=curl_exec($ch);
$response1=curl_getinfo( $ch );
print_r($connect); // just to show that i have connected
// print_r($response1);
set_include_path(get_include_path() . PATH_SEPARATOR . './library/');
require_once 'JS/Extractor.php';
// This variant requests the URL without the session cookie.
//$extractor = new JS_Extractor(file_get_contents('http://www.doomlord.net/index.php?m=szovetseg&sub=reszletes_statisztikak'));
// NOTE(review): this is the failing line. file_get_contents() expects a
// filename or URL, but $connect already contains the downloaded HTML, so PHP
// tries to open a file whose "name" is the whole page and emits a warning
// quoting it. JS_Extractor is elsewhere given the string returned by
// file_get_contents(), so passing $connect directly
// (new JS_Extractor($connect)) should be the fix - confirm against the
// library.
$extractor = new JS_Extractor(file_get_contents($connect)); // tried both ways
// Debug output: dump each intermediate result in turn.
echo "</br />";
echo "This is the extractor:</br />";
var_dump($extractor);
echo "</br />";
echo "</br />";
// First <body> element of the parsed document.
$body = $extractor->query("body")->item(0);
echo "</br />";
echo "This is the body:</br />";
var_dump($body);
echo "</br />";
echo "</br />";
// First table carrying the class "rstatisztika_tabla".
$table = $body->query("//table[@class=\"rstatisztika_tabla\"]")->item(0);
echo "</br />";
echo "This is the table:</br />";
var_dump($table);
echo "</br />";
echo "</br />";
// Pull the table contents out by row ("tr") and cell ("td").
$data = $table->extract(array("tr", "td"));
echo "</br />";
echo "This is the data:</br />";
var_dump($data);
echo "</br />";
echo "</br />";
curl_close($ch);
//}
?>
这是我把页面源码放到自己的网站上之后,用来获取表格的js_extractor代码:
<?php
// Parses the statically hosted copy of the statistics page with JS_Extractor
// and dumps every intermediate result: extractor, body, table, extracted data.
set_include_path(get_include_path() . PATH_SEPARATOR . './library/');
require_once 'JS/Extractor.php';

// Emits one labelled debug section. The markup around the dump is printed
// exactly as given, in the same order as before.
$showSection = function ($heading, $value) {
    echo "</br />";
    echo $heading;
    var_dump($value);
    echo "</br />";
    echo "</br />";
};

// Download the saved page source and hand the HTML string to the extractor.
$pageSource = file_get_contents('http://doomlord.tamtek.net/doomtest2.html');
$extractor = new JS_Extractor($pageSource);
$showSection("This is the extractor:</br />", $extractor);

// First <body> element of the parsed document.
$bodyMatches = $extractor->query("body");
$body = $bodyMatches->item(0);
$showSection("This is the body:</br />", $body);

// First table carrying the class "rstatisztika_tabla".
$tableMatches = $body->query("//table[@class=\"rstatisztika_tabla\"]");
$table = $tableMatches->item(0);
$showSection("This is the table:</br />", $table);

// Table contents, extracted by row ("tr") and cell ("td").
$data = $table->extract(array("tr", "td"));
$showSection("This is the data:</br />", $data);
?>
和真正奇怪的部分:
当我像这样运行脚本时:
$extractor = new JS_Extractor(file_get_contents($connect)); // this is line 53
,我还得到了下面这条“漂亮”的警告:
*Warning*: file_get_contents(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://...@u="; var wa_referrer = "@r="; if(wa.WACID==null) { wa.WACID=wa.generateID('A'); wa.setCookie(wa.WACIDName,wa.WACID,wa.getTopDomain(wa.getDomain(document.URL))); } same = same + "@c=" + wa.WACID; if(screen) felbontas='@s='+screen.width+'x'+screen.height; if(document.referrer) wa_referrer=wa_referrer+document.referrer; if(document.URL) wa_url=wa_url+document.URL; same = same + felbontas + wa_url + wa_referrer; //--> </SCRIPT> </head> <body> <SCRIPT language="JavaScript"> <!-- document.write('<!-- Medián WebAudit HarmoNet Vegzetur 1/2 --><img style="position:absolute;top:-100px;left:-100px" src="http://audit.median.hu/cgi-bin/track.cgi?uc=12283086407878&dc=1&ui='+same+'" width="1" height="1">'); //--> </SCRIPT> <NOSCRIPT> <!-- M in */home2/tamtek/public_html/doomlord/gettable2.php* on line *53*
这很有趣,因为在警告信息的第一行里,网页源码中位于
*Warning*: file_get_contents(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://
和
@u="; var wa_referrer = "@r=";
之间的57行代码被删掉了...... 然后它又打印了一部分内容,接着忽略了其余部分......并在第 53 行给了我一个错误
我猜它遇到了某个被特殊解释的字符,导致页面的其余部分消失,但我不知道那可能是什么字符,也不知道如何摆脱它......
有没有人对可能出错的地方有任何建议?
顺便说一下,你可以在这里找到JS Extractor ...... jacksleight.com/old/blog/2008/02/10/js-extractor-and-the-death-of-table-extractor
如果您想玩免费游戏并且真的想要帮助,您可以加入这里:
www.doomlord.net/?kar=147
您只需要一个电子邮件地址,而不需要其他任何内容
确保你选择钻石部落(否则你将无法加入我的战队,这一切就白费了)
加入我的战队“Girlfight”(这是你到达原始页面的唯一途径www.doomlord.net/index.php?m=szovetseg&sub=reszletes_statisztikak
但即使不加入游戏,我想我在这里提供的数据也足以让人看出问题出在哪里......
如果您认为自己可以提供帮助,但需要更多数据,请随时询问......
谢谢。
Thadson
答案 0 :(得分:1)
我不再尝试直接获取表数据(那样做败得很惨),而是先把页面写入一个临时文件,然后从那里提取表数据......
还记得我说过脚本在本地文件上可以正常工作吗? 所以我就把页面变成了我的本地文件 :-)
这是我做的:
<?php
// Logs in to doomlord.net with cURL, downloads the members-only statistics
// page, saves it to temp.html, then parses that file with JS_Extractor and
// dumps the rows of the "rstatisztika_tabla" table.
session_start();
require_once('inc/constant.php');
require_once('inc/function.php'); //basic functions
db_on();

// Cookie jar shared by the login request (writer) and the page request (reader).
$ckfile = tempnam ("tmp", "cookie.tmp");
if ($ckfile === false) {
    exit('Unable to create a temporary cookie file.');
}

$userAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1";

// Login form fields. http_build_query() URL-encodes each value and joins the
// pairs with '&', replacing the manual urlencode()/concatenate/trim sequence.
// The separator is passed explicitly so the result does not depend on the
// arg_separator.output ini setting.
$fields = array(
    'username' => (string) SITE_USER,
    'jelszo'   => (string) SITE_PASS,
    'vilag'    => (string) SITE_WORLD,
    'tev'      => (string) SITE_TEV,
);
$fields_string = http_build_query($fields, '', '&');

// Step 1: POST the credentials; the cookies received end up in $ckfile.
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL, 'http://www.doomlord.net');
curl_setopt($ch, CURLOPT_COOKIEJAR, $ckfile);
curl_setopt($ch, CURLOPT_POST, true);            // boolean switch, not a field count
curl_setopt($ch, CURLOPT_POSTFIELDS, $fields_string);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 20);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);  // boolean switch; the limit is CURLOPT_MAXREDIRS
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
$connect = curl_exec($ch);
$loginError = curl_error($ch);
// Closing the handle is what flushes the received cookies into the jar file.
curl_close($ch);
if ($connect === false) {
    unlink($ckfile);
    exit('Login request failed: ' . $loginError);
}

// Step 2: request the protected page, sending the cookies saved in step 1.
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL, 'http://www.doomlord.net/index.php?m=szovetseg&sub=reszletes_statisztikak');
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 20);
curl_setopt($ch, CURLOPT_COOKIEFILE, $ckfile);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
// With CURLOPT_RETURNTRANSFER set, $connect is the page's HTML as a string
// (or false on failure) - it is not a path or URL.
$connect = curl_exec($ch);
$response1 = curl_getinfo($ch);
$pageError = curl_error($ch);
curl_close($ch);

// The jar holds a live session cookie; do not leave it behind on disk.
unlink($ckfile);

if ($connect === false) {
    exit('Statistics page request failed: ' . $pageError);
}

// and this is the key here: store the page as a local file. A single write
// replaces the earlier "create empty file, reopen in append mode" sequence
// and leaves temp.html with exactly the downloaded HTML.
// NOTE(review): temp.html is written next to the script and kept afterwards,
// as before; if that directory is web-accessible the logged-in page is
// exposed - consider a tempnam() path plus unlink().
if (file_put_contents('temp.html', $connect) === false) {
    exit('Unable to write temp.html.');
}

// now the extractor works
// NOTE(review): JS_Extractor receives the string returned by
// file_get_contents(), i.e. the same HTML already held in $connect, so
// new JS_Extractor($connect) should make the temp file unnecessary - confirm.
set_include_path(get_include_path() . PATH_SEPARATOR . './library/');
require_once 'JS/Extractor.php';
$html = file_get_contents('temp.html');
if ($html === false) {
    exit('Unable to read temp.html.');
}
$extractor = new JS_Extractor($html);

// Stop with a message instead of a fatal "member function on null" when the
// page is not the expected one (for example, the login page came back).
$body = $extractor->query("body")->item(0);
if (!$body) {
    exit('No <body> element found in the downloaded page.');
}
$table = $body->query("//table[@class=\"rstatisztika_tabla\"]")->item(0);
if (!$table) {
    exit('Table "rstatisztika_tabla" not found - the login may have failed.');
}

// Table contents, extracted by row ("tr") and cell ("td").
$data = $table->extract(array("tr", "td"));
echo "</br />";
echo "This is the data:</br />";
var_dump($data);
echo "</br />";
echo "</br />";
?>