使用curl来提取数据我还有点新意,我最近开始使用Fiddler帮助找到需要设置的选项。
我正试图看看我是否可以从网站上提取图片。我首先点击搜索页面 - 我设置搜索参数,然后开始点击结果中的链接。当我尝试在图像的一个结果中找到一个链接时,我得到一个从curl_exec()返回的空字符串。
奇怪的是 - 在某一点上,它起作用了 - 我得到了数据并成功地在本地保存了图像。但后来它停了下来,我不知道我在做什么让它工作。当然,一切都在浏览器中正常运行。 :(
我正在使用Simple HTML DOM来解析结果,并使用cUrl来查看实际的页面请求。 curl_error()没有显示错误,curl_getinfo()认为一切正常。这可能是微不足道的,但我不确定如何解决它超出我的位置。
<?php
include 'includes/simple_html_dom.php';
$url = "http://nwweb.co.bell.tx.us/NewWorld.Aegis.WebPortal/Corrections/InmateInquiry.aspx";
// Get Cookie - ASP.NET_SessionId
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 1);
$r = curl_exec($ch);
preg_match_all('/^Set-Cookie:\s*([^;]*)/mi', $r, $matches);
$cookies = array();
foreach($matches[1] as $item)
{
parse_str($item, $cookie);
$cookies = array_merge($cookies, $cookie);
}
$sessionCookie = "ASP_NET_SessionId=".$cookies['ASP_NET_SessionId'];
// now load up page into Simple HTML DOM and get all inputs - ignore buttons and populate our dates
$startDate = "02%2F01%2F2000";
$endDate = "02%2F07%2F2016";
$getInputs = str_get_html($r);
$inputs = $getInputs->find('input');
$inputs_array = array();
$buttons_array = array();
for ($i=0; $i<count($inputs); $i++)
{
if ($inputs[$i]->type != "submit")
{
$inputs_array[$inputs[$i]->id] = $inputs[$i]->value;
if (stripos($inputs[$i]->id, "FromDate") > 0)
$inputs_array[$inputs[$i]->id] = $startDate;
if (stripos($inputs[$i]->id, "ToDate") > 0)
$inputs_array[$inputs[$i]->id] = $endDate;
}
}
// build up our curl data - includes hidden inputs, our to & from dates, plus the Search button
$curl_data = http_build_query($inputs_array)."&ctl00%24DefaultContent%24uxSearch=Search";
// POST the data, include session cookie
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, $curl_data);
curl_setopt($ch, CURLOPT_COOKIE, $sessionCookie);
$response = curl_exec($ch);
// this shows that we can get data
// find the links from the HTML
$htmlDom = str_get_html($response); // load up Simple HTML DOM
// get the table of results
$divTable = $htmlDom->find('div#ctl00_DefaultContent_uxResultsWrapper',0)->find('table',0);
$rows = $divTable->find('tr');
for ($i=1; $i<count($rows);$i++)
{
if ($i>3) break; // limit the length of script for debugging
$link = $rows[$i]->find('td',1)->find('a',0)->href;
// build up query to get inmate details from the link above
$url = "http://nwweb.co.bell.tx.us/NewWorld.Aegis.WebPortal/Corrections/".$link;
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_COOKIE, $sessionCookie);
$page = curl_exec($ch);
$pageData = str_get_html($page);
// Now find the Photo, there's a thumb in div.BookingPhotos
// It is linked to a full size image, the link is of the form http://nwweb.co.bell.tx.us/NewWorld.Aegis.WebPortal/GetImage.aspx?ImageKey=17C030IS, but in the href, it has ../GetImage.aspx?ImageKey=xxxx
$photoLink = $pageData->find('div.BookingPhotos',0)->find('a',0)->href;
// get rid of .. and put the base URL on the front
$imgLink = str_replace("..", "http://nwweb.co.bell.tx.us/NewWorld.Aegis.WebPortal", $photoLink);
// now attempt to pull the image
$ch = curl_init($imgLink);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_COOKIE, $sessionCookie);
// here is the PROBLEM - NO DATA RETURNED
$imgData = curl_exec($ch); // I get a header back, but NO data
}
?>