我的问题是它不仅需要基本的cookie,还要求会话cookie和随机生成的ID。我想这意味着我需要使用带有cookie jar的Web浏览器模拟器?
我曾尝试使用Snoopy,Goutte和其他一些网络浏览器模拟器,但截至目前为止,我还未能找到有关如何接收Cookie的教程。我有点绝望了!
有人能给我一个如何在Snoopy或Goutte中接受cookie的例子吗?
提前致谢!
答案 0 :(得分:1)
我们在一个名为Browser的类中尽可能多地实现上一个回答(previous answer)中的内容,该类应该提供常规的导航功能。
然后我们就能够以非常简单的形式,把特定于站点的代码放进一个新的派生类中(这里称之为FooBrowser),由该类执行对站点Foo的抓取。
从Browser派生的类必须提供一些特定于站点的功能,例如path()函数,它允许存储特定于站点的信息,例如:
/**
 * Builds the storage location for a site-specific file.
 *
 * The given basename is appended verbatim to the fixed storage directory;
 * no validation or normalisation is performed.
 *
 * @param string $basename file name to place inside the storage directory
 * @return string the directory prefix followed by $basename
 */
function path($basename) {
    $storageDir = '/var/tmp/www.foo.bar/';

    return $storageDir . $basename;
}
/**
 * Small cURL-backed "browser" that keeps cookies and a referer between requests.
 *
 * Subclasses supply path() so that site-specific files (the cookie file is the
 * one used here) can be stored somewhere appropriate for that site.
 */
abstract class Browser
{
    /** @var array Settings: 'site', 'userAgent', 'waitTime' merged with caller-supplied keys. */
    private $options = [];

    /** @var array Navigation state: 'referer', 'url' and the 'curl' handle. */
    private $state = [];

    /** @var string Cookie file location, as returned by path('cookies'). */
    protected $cookies;

    /**
     * Returns the storage location for a site-specific file.
     *
     * @param string $basename
     * @return string
     */
    abstract protected function path($basename);

    /**
     * @param string $site    prefix prepended to every URL passed to get()/post()
     * @param array  $options overrides/extends the default options
     * @throws \Exception if a cURL option cannot be set
     */
    public function __construct($site, $options = []) {
        $this->cookies = $this->path('cookies');
        $this->options = array_merge(
            [
                'site'      => $site,
                'userAgent' => 'Mozilla/5.0 (Windows NT 5.1; rv:16.0) Gecko/20100101 Firefox/16.0 - LeoScraper',
                // Pause after each request, in microseconds (passed to usleep()).
                'waitTime'  => 250000,
            ],
            $options
        );
        $this->state = [
            'referer' => '/',
            'url'     => '',
            'curl'    => '',
        ];
        $this->__wakeup();
    }

    /**
     * Creates and configures the cURL handle.
     *
     * Called by the constructor; PHP also invokes __wakeup() after unserialize().
     *
     * @throws \Exception if a cURL option cannot be set
     */
    public function __wakeup() {
        $this->state['curl'] = curl_init();
        // CURLOPT_BINARYTRANSFER used to be set here; it has no effect and is
        // deprecated, so it is intentionally omitted.
        $this->config([
            CURLOPT_USERAGENT       => $this->options['userAgent'],
            // Empty string: advertise every encoding cURL supports.
            CURLOPT_ENCODING        => '',
            // Retrieve the body...
            CURLOPT_NOBODY          => false,
            // ...as the return value of curl_exec()...
            CURLOPT_RETURNTRANSFER  => true,
            // ...following redirections...
            CURLOPT_FOLLOWLOCATION  => true,
            // ...reasonably.
            CURLOPT_MAXREDIRS       => 5,
            // Read previously stored cookies from this file...
            CURLOPT_COOKIEFILE      => $this->cookies,
            // ...and write the cookies back to the same file.
            CURLOPT_COOKIEJAR       => $this->cookies,
            // Seconds
            CURLOPT_CONNECTTIMEOUT  => 30,
            // Seconds
            CURLOPT_TIMEOUT         => 300,
            // Give up when slower than 16384 bytes/s...
            CURLOPT_LOW_SPEED_LIMIT => 16384,
            // ...for this many seconds.
            CURLOPT_LOW_SPEED_TIME  => 15,
        ]);
    }

    /**
     * Applies an array of cURL options to the handle.
     *
     * @param array $opts map of CURLOPT_* constant => value
     * @throws \Exception as soon as one option is rejected
     */
    private function config(array $opts = []) {
        foreach ($opts as $key => $value) {
            if (true !== curl_setopt($this->state['curl'], $key, $value)) {
                throw new \Exception('Could not set cURL option');
            }
        }
    }

    /**
     * Executes the request currently configured on the handle against $url.
     *
     * The previously requested URL becomes the referer. On the very first call
     * the previous URL is the empty string, so the referer is the bare site.
     *
     * @param string $url path (and query) relative to the 'site' option
     * @return mixed response body, or false when curl_exec() fails
     * @throws \Exception if a cURL option cannot be set
     */
    private function perform($url) {
        $this->state['referer'] = $this->state['url'];
        $this->state['url']     = $url;
        $this->config([
            CURLOPT_URL     => $this->options['site'] . $this->state['url'],
            CURLOPT_REFERER => $this->options['site'] . $this->state['referer'],
        ]);
        $response = curl_exec($this->state['curl']);
        // Should we ever want to randomize waitTime, do so here.
        usleep($this->options['waitTime']);
        return $response;
    }

    /**
     * Reads a configuration option, optionally replacing it.
     *
     * @param string $key   configuration key name
     * @param mixed  $value new value; leave at the '__DEFAULT__' sentinel to only read
     * @return mixed the value held BEFORE any replacement, or null if the key was not set
     */
    protected function option($key, $value = '__DEFAULT__') {
        // Guard the read so that an unknown key yields null instead of an
        // "undefined array key" diagnostic.
        $curr = array_key_exists($key, $this->options) ? $this->options[$key] : null;
        if ('__DEFAULT__' !== $value) {
            $this->options[$key] = $value;
        }
        return $curr;
    }

    /**
     * Performs a POST.
     *
     * @param string $url    path relative to the 'site' option
     * @param array  $fields form fields, URL-encoded with http_build_query()
     * @return mixed response body, or false on transfer failure
     * @throws \Exception if a cURL option cannot be set
     */
    public function post($url, array $fields) {
        $this->config([
            CURLOPT_POST       => true,
            CURLOPT_POSTFIELDS => http_build_query($fields),
        ]);
        return $this->perform($url);
    }

    /**
     * Performs a GET.
     *
     * @param string $url    path relative to the 'site' option
     * @param array  $fields query parameters; appended as "?..." when not empty
     * @return mixed response body, or false on transfer failure
     * @throws \Exception if a cURL option cannot be set
     */
    public function get($url, array $fields = []) {
        // The handle is reused, so a previous post() may have left it in POST
        // mode; CURLOPT_HTTPGET explicitly resets the request method to GET.
        $this->config([ CURLOPT_HTTPGET => true ]);
        if (empty($fields)) {
            $query = '';
        } else {
            $query = '?' . http_build_query($fields);
        }
        return $this->perform($url . $query);
    }
}
现在来抓取FooSite:
/* WWW_FOO_COM requires username and password to construct */
/**
 * Site-specific browser for the Foo site.
 *
 * Constructing it already issues two GET requests ('/' and '/login.do');
 * call login() afterwards to submit the credentials.
 */
class WWW_FOO_COM_Browser extends Browser
{
    /** @var bool Whether the last login() call got a response from the site. */
    private $loggedIn = false;

    /**
     * @param string $username stored as the 'username' option
     * @param string $password stored as the 'password' option
     * @throws \Exception if a cURL option cannot be set
     */
    public function __construct($username, $password) {
        parent::__construct('http://www.foo.bar.baz', [
            'username'  => $username,
            'password'  => $password,
            'waitTime'  => 250000,
            'userAgent' => 'FooScraper',
            'cache'     => true
        ]);
        // Open the session
        $this->get('/');
        // Navigate to the login page
        $this->get('/login.do');
    }

    /**
     * Storage location for files belonging to this site (e.g. the cookie file).
     *
     * Required by the abstract parent; without it the class cannot be declared.
     * NOTE(review): assumes the directory exists and is writable - confirm on
     * the target machine.
     *
     * @param string $basename
     * @return string
     */
    protected function path($basename) {
        return '/var/tmp/www.foo.bar/' . $basename;
    }

    /**
     * Perform login.
     *
     * @return bool false when the login request itself failed (no response),
     *              true otherwise
     * @throws \Exception if a cURL option cannot be set
     */
    public function login() {
        $response = $this->post(
            '/ajax/loginPerform',
            [
                'j_un' => $this->option('username'),
                'j_pw' => $this->option('password'),
            ]
        );
        // A failed transfer yields false; that can never be a successful login.
        if (false === $response) {
            $this->loggedIn = false;
            return false;
        }
        // TODO: verify that response is OK.
        // if (!strstr($response, "Welcome " . $this->option('username'))
        //     throw new \Exception("Bad username or password")
        $this->loggedIn = true;
        return true;
    }

    /**
     * Fetches the page for one entry.
     *
     * @param string $entry sent as the 'ticker' query parameter
     * @return mixed response body, or false on transfer failure
     * @throws \Exception if a cURL option cannot be set
     */
    public function scrape($entry) {
        // We could implement caching to avoid scraping the same entry
        // too often. Save $data into path("entry-" . md5($entry))
        // and verify the filemtime of said file, is it newer than time()
        // minus, say, 86400 seconds? If yes, return file_get_content and
        // leave remote site alone.
        $data = $this->get(
            '/foobars/baz.do',
            [
                'ticker' => $entry
            ]
        );
        return $data;
    }
}
现在实际的抓取代码是:
// Build the site browser, authenticate, then fetch each entry in turn.
$scraper = new WWW_FOO_COM_Browser('lserni', 'mypassword');

$loginSucceeded = $scraper->login();
if (!$loginSucceeded) {
    throw new \Exception("bad user or pass");
}

// $entries is not defined in this snippet; it is expected to be provided by
// the surrounding script.
foreach ($entries as $ticker) {
    $html = $scraper->scrape($ticker);
    // Parse HTML
}